Merge branch 'release-4-6'
author Teemu Murtola <teemu.murtola@gmail.com>
Fri, 24 May 2013 17:59:13 +0000 (20:59 +0300)
committer Teemu Murtola <teemu.murtola@gmail.com>
Fri, 24 May 2013 17:59:13 +0000 (20:59 +0300)
Omitted the patch with the hacks for silencing warnings with gcc-4.8,
since we plan to do a proper job of that in the master branch.

Some minor conflicts resolved.

Conflicts:
CMakeLists.txt
cmake/ThreadMPI.cmake
src/config.h.cmakein
src/programs/mdrun/md.c

Change-Id: I2c1f1b9b40100c269eea6b06b7b073491b5e17d6

166 files changed:
1  2 
CMakeLists.txt
cmake/ThreadMPI.cmake
src/config.h.cmakein
src/gromacs/gmxana/gmx_genion.c
src/gromacs/gmxlib/gmx_cpuid.c
src/gromacs/gmxlib/nonbonded/CMakeLists.txt
src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/kernelutil_sparc64_hpc_ace_double.h
src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/make_nb_kernel_sparc64_hpc_ace_double.py
src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecCSTab_VdwCSTab_GeomP1P1_sparc64_hpc_ace_double.c
src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecCSTab_VdwCSTab_GeomW3P1_sparc64_hpc_ace_double.c
src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecCSTab_VdwCSTab_GeomW3W3_sparc64_hpc_ace_double.c
src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecCSTab_VdwCSTab_GeomW4P1_sparc64_hpc_ace_double.c
src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecCSTab_VdwCSTab_GeomW4W4_sparc64_hpc_ace_double.c
src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecCSTab_VdwLJ_GeomP1P1_sparc64_hpc_ace_double.c
src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecCSTab_VdwLJ_GeomW3P1_sparc64_hpc_ace_double.c
src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecCSTab_VdwLJ_GeomW3W3_sparc64_hpc_ace_double.c
src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecCSTab_VdwLJ_GeomW4P1_sparc64_hpc_ace_double.c
src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecCSTab_VdwLJ_GeomW4W4_sparc64_hpc_ace_double.c
src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecCSTab_VdwNone_GeomP1P1_sparc64_hpc_ace_double.c
src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecCSTab_VdwNone_GeomW3P1_sparc64_hpc_ace_double.c
src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecCSTab_VdwNone_GeomW3W3_sparc64_hpc_ace_double.c
src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecCSTab_VdwNone_GeomW4P1_sparc64_hpc_ace_double.c
src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecCSTab_VdwNone_GeomW4W4_sparc64_hpc_ace_double.c
src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecCoul_VdwCSTab_GeomP1P1_sparc64_hpc_ace_double.c
src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecCoul_VdwCSTab_GeomW3P1_sparc64_hpc_ace_double.c
src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecCoul_VdwCSTab_GeomW3W3_sparc64_hpc_ace_double.c
src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecCoul_VdwCSTab_GeomW4P1_sparc64_hpc_ace_double.c
src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecCoul_VdwCSTab_GeomW4W4_sparc64_hpc_ace_double.c
src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecCoul_VdwLJ_GeomP1P1_sparc64_hpc_ace_double.c
src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecCoul_VdwLJ_GeomW3P1_sparc64_hpc_ace_double.c
src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecCoul_VdwLJ_GeomW3W3_sparc64_hpc_ace_double.c
src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecCoul_VdwLJ_GeomW4P1_sparc64_hpc_ace_double.c
src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecCoul_VdwLJ_GeomW4W4_sparc64_hpc_ace_double.c
src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecCoul_VdwNone_GeomP1P1_sparc64_hpc_ace_double.c
src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecCoul_VdwNone_GeomW3P1_sparc64_hpc_ace_double.c
src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecCoul_VdwNone_GeomW3W3_sparc64_hpc_ace_double.c
src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecCoul_VdwNone_GeomW4P1_sparc64_hpc_ace_double.c
src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecCoul_VdwNone_GeomW4W4_sparc64_hpc_ace_double.c
src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecEwSh_VdwLJSh_GeomP1P1_sparc64_hpc_ace_double.c
src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecEwSh_VdwLJSh_GeomW3P1_sparc64_hpc_ace_double.c
src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecEwSh_VdwLJSh_GeomW3W3_sparc64_hpc_ace_double.c
src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecEwSh_VdwLJSh_GeomW4P1_sparc64_hpc_ace_double.c
src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecEwSh_VdwLJSh_GeomW4W4_sparc64_hpc_ace_double.c
src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecEwSh_VdwNone_GeomP1P1_sparc64_hpc_ace_double.c
src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecEwSh_VdwNone_GeomW3P1_sparc64_hpc_ace_double.c
src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecEwSh_VdwNone_GeomW3W3_sparc64_hpc_ace_double.c
src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecEwSh_VdwNone_GeomW4P1_sparc64_hpc_ace_double.c
src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecEwSh_VdwNone_GeomW4W4_sparc64_hpc_ace_double.c
src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecEwSw_VdwLJSw_GeomP1P1_sparc64_hpc_ace_double.c
src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecEwSw_VdwLJSw_GeomW3P1_sparc64_hpc_ace_double.c
src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecEwSw_VdwLJSw_GeomW3W3_sparc64_hpc_ace_double.c
src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecEwSw_VdwLJSw_GeomW4P1_sparc64_hpc_ace_double.c
src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecEwSw_VdwLJSw_GeomW4W4_sparc64_hpc_ace_double.c
src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecEwSw_VdwNone_GeomP1P1_sparc64_hpc_ace_double.c
src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecEwSw_VdwNone_GeomW3P1_sparc64_hpc_ace_double.c
src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecEwSw_VdwNone_GeomW3W3_sparc64_hpc_ace_double.c
src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecEwSw_VdwNone_GeomW4P1_sparc64_hpc_ace_double.c
src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecEwSw_VdwNone_GeomW4W4_sparc64_hpc_ace_double.c
src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecEw_VdwCSTab_GeomP1P1_sparc64_hpc_ace_double.c
src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecEw_VdwCSTab_GeomW3P1_sparc64_hpc_ace_double.c
src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecEw_VdwCSTab_GeomW3W3_sparc64_hpc_ace_double.c
src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecEw_VdwCSTab_GeomW4P1_sparc64_hpc_ace_double.c
src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecEw_VdwCSTab_GeomW4W4_sparc64_hpc_ace_double.c
src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecEw_VdwLJ_GeomP1P1_sparc64_hpc_ace_double.c
src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecEw_VdwLJ_GeomW3P1_sparc64_hpc_ace_double.c
src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecEw_VdwLJ_GeomW3W3_sparc64_hpc_ace_double.c
src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecEw_VdwLJ_GeomW4P1_sparc64_hpc_ace_double.c
src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecEw_VdwLJ_GeomW4W4_sparc64_hpc_ace_double.c
src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecEw_VdwNone_GeomP1P1_sparc64_hpc_ace_double.c
src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecEw_VdwNone_GeomW3P1_sparc64_hpc_ace_double.c
src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecEw_VdwNone_GeomW3W3_sparc64_hpc_ace_double.c
src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecEw_VdwNone_GeomW4P1_sparc64_hpc_ace_double.c
src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecEw_VdwNone_GeomW4W4_sparc64_hpc_ace_double.c
src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecGB_VdwCSTab_GeomP1P1_sparc64_hpc_ace_double.c
src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecGB_VdwLJ_GeomP1P1_sparc64_hpc_ace_double.c
src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecGB_VdwNone_GeomP1P1_sparc64_hpc_ace_double.c
src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecNone_VdwCSTab_GeomP1P1_sparc64_hpc_ace_double.c
src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecNone_VdwLJSh_GeomP1P1_sparc64_hpc_ace_double.c
src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecNone_VdwLJSw_GeomP1P1_sparc64_hpc_ace_double.c
src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecNone_VdwLJ_GeomP1P1_sparc64_hpc_ace_double.c
src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecRFCut_VdwCSTab_GeomP1P1_sparc64_hpc_ace_double.c
src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecRFCut_VdwCSTab_GeomW3P1_sparc64_hpc_ace_double.c
src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecRFCut_VdwCSTab_GeomW3W3_sparc64_hpc_ace_double.c
src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecRFCut_VdwCSTab_GeomW4P1_sparc64_hpc_ace_double.c
src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecRFCut_VdwCSTab_GeomW4W4_sparc64_hpc_ace_double.c
src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecRFCut_VdwLJSh_GeomP1P1_sparc64_hpc_ace_double.c
src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecRFCut_VdwLJSh_GeomW3P1_sparc64_hpc_ace_double.c
src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecRFCut_VdwLJSh_GeomW3W3_sparc64_hpc_ace_double.c
src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecRFCut_VdwLJSh_GeomW4P1_sparc64_hpc_ace_double.c
src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecRFCut_VdwLJSh_GeomW4W4_sparc64_hpc_ace_double.c
src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecRFCut_VdwLJSw_GeomP1P1_sparc64_hpc_ace_double.c
src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecRFCut_VdwLJSw_GeomW3P1_sparc64_hpc_ace_double.c
src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecRFCut_VdwLJSw_GeomW3W3_sparc64_hpc_ace_double.c
src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecRFCut_VdwLJSw_GeomW4P1_sparc64_hpc_ace_double.c
src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecRFCut_VdwLJSw_GeomW4W4_sparc64_hpc_ace_double.c
src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecRFCut_VdwNone_GeomP1P1_sparc64_hpc_ace_double.c
src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecRFCut_VdwNone_GeomW3P1_sparc64_hpc_ace_double.c
src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecRFCut_VdwNone_GeomW3W3_sparc64_hpc_ace_double.c
src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecRFCut_VdwNone_GeomW4P1_sparc64_hpc_ace_double.c
src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecRFCut_VdwNone_GeomW4W4_sparc64_hpc_ace_double.c
src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecRF_VdwCSTab_GeomP1P1_sparc64_hpc_ace_double.c
src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecRF_VdwCSTab_GeomW3P1_sparc64_hpc_ace_double.c
src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecRF_VdwCSTab_GeomW3W3_sparc64_hpc_ace_double.c
src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecRF_VdwCSTab_GeomW4P1_sparc64_hpc_ace_double.c
src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecRF_VdwCSTab_GeomW4W4_sparc64_hpc_ace_double.c
src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecRF_VdwLJ_GeomP1P1_sparc64_hpc_ace_double.c
src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecRF_VdwLJ_GeomW3P1_sparc64_hpc_ace_double.c
src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecRF_VdwLJ_GeomW3W3_sparc64_hpc_ace_double.c
src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecRF_VdwLJ_GeomW4P1_sparc64_hpc_ace_double.c
src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecRF_VdwLJ_GeomW4W4_sparc64_hpc_ace_double.c
src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecRF_VdwNone_GeomP1P1_sparc64_hpc_ace_double.c
src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecRF_VdwNone_GeomW3P1_sparc64_hpc_ace_double.c
src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecRF_VdwNone_GeomW3W3_sparc64_hpc_ace_double.c
src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecRF_VdwNone_GeomW4P1_sparc64_hpc_ace_double.c
src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecRF_VdwNone_GeomW4W4_sparc64_hpc_ace_double.c
src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_sparc64_hpc_ace_double.c
src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_sparc64_hpc_ace_double.h
src/gromacs/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_template_sparc64_hpc_ace_double.pre
src/gromacs/gmxlib/nonbonded/nonbonded.c
src/gromacs/gmxlib/thread_mpi/CMakeLists.txt
src/gromacs/gmxlib/thread_mpi/atomic.c
src/gromacs/gmxlib/thread_mpi/barrier.c
src/gromacs/gmxlib/thread_mpi/bcast.c
src/gromacs/gmxlib/thread_mpi/collective.c
src/gromacs/gmxlib/thread_mpi/collective.h
src/gromacs/gmxlib/thread_mpi/comm.c
src/gromacs/gmxlib/thread_mpi/errhandler.c
src/gromacs/gmxlib/thread_mpi/gather.c
src/gromacs/gmxlib/thread_mpi/impl.h
src/gromacs/gmxlib/thread_mpi/lock.c
src/gromacs/gmxlib/thread_mpi/once.c
src/gromacs/gmxlib/thread_mpi/p2p_protocol.c
src/gromacs/gmxlib/thread_mpi/p2p_send_recv.c
src/gromacs/gmxlib/thread_mpi/profile.c
src/gromacs/gmxlib/thread_mpi/profile.h
src/gromacs/gmxlib/thread_mpi/pthreads.c
src/gromacs/gmxlib/thread_mpi/scatter.c
src/gromacs/gmxlib/thread_mpi/settings.h
src/gromacs/gmxlib/thread_mpi/system_error.cpp
src/gromacs/gmxlib/thread_mpi/tmpi_init.c
src/gromacs/gmxlib/thread_mpi/tmpi_malloc.c
src/gromacs/gmxlib/thread_mpi/winthreads.c
src/gromacs/gmxlib/tpxio.c
src/gromacs/gmxpreprocess/readir.c
src/gromacs/legacyheaders/gmx_cpuid.h
src/gromacs/legacyheaders/thread_mpi/atomic.h
src/gromacs/legacyheaders/thread_mpi/atomic/cycles.h
src/gromacs/legacyheaders/thread_mpi/atomic/derived.h
src/gromacs/legacyheaders/thread_mpi/atomic/fujitsu_sparc64.h
src/gromacs/legacyheaders/thread_mpi/atomic/gcc.h
src/gromacs/legacyheaders/thread_mpi/atomic/gcc_ia64.h
src/gromacs/legacyheaders/thread_mpi/atomic/gcc_intrinsics.h
src/gromacs/legacyheaders/thread_mpi/atomic/gcc_ppc.h
src/gromacs/legacyheaders/thread_mpi/atomic/gcc_spinlock.h
src/gromacs/legacyheaders/thread_mpi/atomic/gcc_x86.h
src/gromacs/legacyheaders/thread_mpi/atomic/msvc.h
src/gromacs/legacyheaders/thread_mpi/atomic/suncc-sparc.h
src/gromacs/legacyheaders/thread_mpi/atomic/xlc_ppc.h
src/gromacs/legacyheaders/thread_mpi/event.h
src/gromacs/legacyheaders/thread_mpi/lock.h
src/gromacs/legacyheaders/thread_mpi/threads.h
src/gromacs/legacyheaders/thread_mpi/tmpi.h
src/gromacs/legacyheaders/types/idef.h
src/gromacs/mdlib/force.c
src/gromacs/mdlib/minimize.c
src/programs/mdrun/md.c

diff --combined CMakeLists.txt
index 8db354047af569599ce12cf9bc8a4a265c94d662,a62a3a4efa9d05f439683306429bb65ee91ddb29..ede6b0ee511ef8d7702c23d443caa572680b6bef
@@@ -1,19 -1,53 +1,19 @@@
 -#
 -# This file is part of the GROMACS molecular simulation package.
 -#
 -# Copyright (c) 2012,2013, by the GROMACS development team, led by
 -# David van der Spoel, Berk Hess, Erik Lindahl, and including many
 -# others, as listed in the AUTHORS file in the top-level source
 -# directory and at http://www.gromacs.org.
 -#
 -# GROMACS is free software; you can redistribute it and/or
 -# modify it under the terms of the GNU Lesser General Public License
 -# as published by the Free Software Foundation; either version 2.1
 -# of the License, or (at your option) any later version.
 -#
 -# GROMACS is distributed in the hope that it will be useful,
 -# but WITHOUT ANY WARRANTY; without even the implied warranty of
 -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 -# Lesser General Public License for more details.
 -#
 -# You should have received a copy of the GNU Lesser General Public
 -# License along with GROMACS; if not, see
 -# http://www.gnu.org/licenses, or write to the Free Software Foundation,
 -# Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
 -#
 -# If you want to redistribute modifications to GROMACS, please
 -# consider that scientific software is very special. Version
 -# control is crucial - bugs must be traceable. We will be happy to
 -# consider code for inclusion in the official distribution, but
 -# derived work must not be called official GROMACS. Details are found
 -# in the README & COPYING files - if they are missing, get the
 -# official version at http://www.gromacs.org.
 -#
 -# To help us fund GROMACS development, we humbly ask that you cite
 -# the research papers on the package. Check out http://www.gromacs.org.
 -#
  cmake_minimum_required(VERSION 2.8)
  # Keep CMake suitably quiet on Cygwin
  set(CMAKE_LEGACY_CYGWIN_WIN32 0) # Remove when CMake >= 2.8.4 is required
  
 -# Allows CPack to act differently for normal tools and mdrun (e.g. because of MPI)
 -set(CPACK_COMPONENT_GROUP_TOOLS_DESCRIPTION "All GROMACS executable tools")
 -set(CPACK_COMPONENT_GROUP_MDRUN_DESCRIPTION "GROMACS executable for running simulations")
 -
  # CMake modules/macros are in a subdirectory to keep this file cleaner
  # This needs to be set before project() in order to pick up toolchain files
  list(APPEND CMAKE_MODULE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/cmake ${CMAKE_CURRENT_SOURCE_DIR}/cmake/Platform)
  
 -project(Gromacs C)
 +project(Gromacs)
  include(Dart)
  mark_as_advanced(DART_ROOT)
  
 +set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib)
 +set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib)
 +set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin)
 +
  # PROJECT_VERSION should have the following structure: 
  # VERSION-dev[-SUFFIX] where the VERSION should have the for: vMajor.vMinor.vPatch
  #
@@@ -22,7 -56,7 +22,7 @@@
  # machine with no git. 
  #
  # NOTE: when releasing the "-dev" suffix needs to be stripped off!
 -set(PROJECT_VERSION "4.6.2-dev")
 +set(PROJECT_VERSION "5.0-dev")
  # The version number of the regressiontest tarball against which this
  # git branch can be tested. Normally, this will be the version of the
  # last patch release. Comment the next line out for branches leading
@@@ -36,9 -70,9 +36,9 @@@ if (CUSTOM_VERSION_STRING
  endif (CUSTOM_VERSION_STRING)
  set(SOVERSION 8)
  # It is a bit irritating, but this has to be set separately for now!
 -SET(CPACK_PACKAGE_VERSION_MAJOR "4")
 -SET(CPACK_PACKAGE_VERSION_MINOR "6")
 -SET(CPACK_PACKAGE_VERSION_PATCH "1")
 +SET(CPACK_PACKAGE_VERSION_MAJOR "5")
 +SET(CPACK_PACKAGE_VERSION_MINOR "0")
 +#SET(CPACK_PACKAGE_VERSION_PATCH "0")
  
  # The numerical gromacs version. It is 40600 for 4.6.0.
  # The #define GMX_VERSION in gmx_header_config_h is set to this value.
@@@ -57,12 -91,9 +57,12 @@@ endif(
  set(API_VERSION ${NUM_VERSION})
  
  if(CMAKE_INSTALL_PREFIX_INITIALIZED_TO_DEFAULT AND UNIX)
 -set(CMAKE_INSTALL_PREFIX "/usr/local/gromacs" CACHE STRING "Installation prefix (installation will need write permissions here)" FORCE)
 +    set(CMAKE_INSTALL_PREFIX "/usr/local/gromacs" CACHE STRING "Installation prefix (installation will need write permissions here)" FORCE)
  endif()
  
 +set(GMX_INSTALL_PREFIX "" CACHE STRING "Prefix gets appended to CMAKE_INSTALL_PREFIX. For cpack it sets the root folder of the archive.")
 +mark_as_advanced(GMX_INSTALL_PREFIX)
 +
  include(gmxBuildTypeReference)
  
  if(NOT CMAKE_BUILD_TYPE)
  endif(NOT CMAKE_BUILD_TYPE)
  
  enable_language(C)
 -
 -set(GMX_USE_RELATIVE_INSTALL_PATH OFF CACHE STRING "Use relative paths not absolute paths for cmake install. Has only an effect on cpack.")
 -mark_as_advanced(GMX_USE_RELATIVE_INSTALL_PATH)
 +enable_language(CXX)
  
  set(CPACK_PACKAGE_VERSION ${PROJECT_VERSION})
  set(CPACK_PACKAGE_VENDOR "gromacs.org")
  set(CPACK_PACKAGE_DESCRIPTION_SUMMARY "Gromacs - a toolkit for high-performance molecular simulation")
 -if (NOT GMX_USE_RELATIVE_INSTALL_PATH)
 -    set(CPACK_SET_DESTDIR "ON")
 -endif()
  set(CPACK_RESOURCE_FILE_WELCOME "${CMAKE_SOURCE_DIR}/admin/InstallWelcome.txt")
  # Its GPL/LGPL, so they do not have to agree to a license for mere usage, but some installers require this...
  set(CPACK_RESOURCE_FILE_LICENSE "${CMAKE_SOURCE_DIR}/COPYING")
@@@ -87,11 -123,6 +87,11 @@@ set(CPACK_PACKAGE_CONTACT "gmx-users@gr
  #must come after all cpack settings!
  include(CPack)
  
 +set(SOURCE_IS_GIT_REPOSITORY OFF)
 +if(EXISTS "${CMAKE_SOURCE_DIR}/.git")
 +    set(SOURCE_IS_GIT_REPOSITORY ON)
 +endif()
 +
  ########################################################################
  # Check and warn if cache generated on a different host is being reused
  ########################################################################
@@@ -110,10 -141,24 +110,10 @@@ if(CMAKE_HOST_UNIX
  endif()
  
  ########################################################################
 -# User input options - enable C++ - before any CXX flags are changed   #
 -########################################################################
 -
 -# decide on GPU settings based on user-settings and GPU/CUDA detection
 -include(gmxManageGPU)
 -
 -option(GMX_FORCE_CXX "Enable C++ compilation even if not necessary" OFF)
 -mark_as_advanced(GMX_FORCE_CXX)
 -
  option(GMX_COOL_QUOTES "Enable Gromacs cool quotes" ON)
  mark_as_advanced(GMX_COOL_QUOTES)
  
 -if(GMX_GPU OR GMX_FORCE_CXX OR GMX_OPENMM)
 -    enable_language(CXX)
 -endif()
  set(CMAKE_PREFIX_PATH "" CACHE STRING "Extra locations to search for external libraries and tools (give directory without lib, bin, or include)")
 -
 -########################################################################
  # User input options                                                   #
  ########################################################################
  option(GMX_DOUBLE "Use double precision (much slower, use only if you really need it)" OFF)
@@@ -124,13 -169,6 +124,13 @@@ mark_as_advanced(GMX_SOFTWARE_INVSQRT
  option(GMX_FAHCORE "Build a library with mdrun functionality" OFF)
  mark_as_advanced(GMX_FAHCORE)
  
 +# decide on GPU settings based on user-settings and GPU/CUDA detection
 +include(gmxManageGPU)
 +
 +# TODO: move OpenMM to contrib
 +option(GMX_OPENMM "Accelerated execution on GPUs through the OpenMM library (rerun cmake after changing to see relevant options)" OFF)
 +mark_as_advanced(GMX_OPENMM)
 +
  include(gmxDetectAcceleration)
  if(NOT DEFINED GMX_CPU_ACCELERATION)
      if(CMAKE_CROSSCOMPILING)
  endif(NOT DEFINED GMX_CPU_ACCELERATION)
  
  set(GMX_CPU_ACCELERATION "@GMX_SUGGESTED_CPU_ACCELERATION@"
-     CACHE STRING "Accelerated CPU kernels. Pick one of: None, SSE2, SSE4.1, AVX_128_FMA, AVX_256, IBM_QPX")
+     CACHE STRING "Accelerated CPU kernels. Pick one of: None, SSE2, SSE4.1, AVX_128_FMA, AVX_256, IBM_QPX, Sparc64_HPC_ACE")
  
  set(GMX_FFT_LIBRARY "fftw3" 
      CACHE STRING "FFT library choices: fftw3,mkl,fftpack[built-in]")
@@@ -163,16 -201,12 +163,16 @@@ mark_as_advanced(GMX_MPI_IN_PLACE
  option(GMX_LOAD_PLUGINS "Compile with plugin support, needed to read VMD supported file formats" ON)
  mark_as_advanced(GMX_LOAD_PLUGINS)
  
 +option(GMX_GPU  "Enable GPU acceleration" ON)
  option(GMX_OPENMP "Enable OpenMP-based multithreading" ON)
  
 -option(USE_VERSION_H "Generate development version string/information" ON)
 -mark_as_advanced(USE_VERSION_H)
 +option(GMX_GIT_VERSION_INFO "Generate git version information" ${SOURCE_IS_GIT_REPOSITORY})
 +mark_as_advanced(GMX_GIT_VERSION_INFO)
  
  option(GMX_DEFAULT_SUFFIX "Use default suffixes for GROMACS binaries and libs (_d for double, _mpi for MPI; rerun cmake after changing to see relevant options)" ON)
 +if(UNIX)
 +    option(GMX_SYMLINK_OLD_BINARY_NAMES "Create symbolic links for pre-5.0 binary names" ON)
 +endif()
  
  if(UNIX)
      option(GMX_PREFER_STATIC_LIBS "When finding libraries prefer static archives (not available on non-*nix platforms and it will only work if static versions of external dependencies are available and found)!" OFF)
@@@ -196,7 -230,9 +196,7 @@@ mark_as_advanced(GMX_SKIP_DEFAULT_CFLAG
  # These files should be removed from the source tree when a CMake version that
  # includes the features in question becomes required for building GROMACS.
  include(CheckCCompilerFlag)
 -if(CMAKE_CXX_COMPILER_LOADED)
 -    include(CheckCXXCompilerFlag)
 -endif()
 +include(CheckCXXCompilerFlag)
  
  # Get compiler version information, needs to be done early as check that depend
  # on compiler verison follow below.
@@@ -220,7 -256,7 +220,7 @@@ if(GMX_OPENMP
          # CMake on Windows doesn't support linker flags passed to target_link_libraries
          # (i.e. it treats /openmp as \openmp library file). Also, no OpenMP linker flags are needed.
          if(NOT (WIN32 AND NOT CYGWIN))
 -            if(CMAKE_COMPILER_IS_GNUCC AND GMX_PREFER_STATIC_OPENMP)
 +            if(CMAKE_COMPILER_IS_GNUCC AND GMX_PREFER_STATIC_OPENMP AND NOT APPLE)
                  set(OpenMP_LINKER_FLAGS "-Wl,-static -lgomp -lrt -Wl,-Bdynamic -lpthread")
                  set(OpenMP_SHARED_LINKER_FLAGS "")
              else()
@@@ -316,26 -352,38 +316,26 @@@ endif(GMX_SOFTWARE_INVSQRT
  # Basic system tests (standard libraries, headers, functions, types)   #
  ########################################################################
  include(CheckIncludeFiles)
 -check_include_files(string.h     HAVE_STRING_H)
 -check_include_files(math.h       HAVE_MATH_H)
 -check_include_files(limits.h     HAVE_LIMITS_H)
 -check_include_files(memory.h     HAVE_MEMORY_H)
 +include(CheckIncludeFileCXX)
  check_include_files(unistd.h   HAVE_UNISTD_H)
 -check_include_files(direct.h   HAVE_DIRECT_H)
  check_include_files(pwd.h        HAVE_PWD_H)
 -check_include_files(stdint.h   HAVE_STDINT_H)
 -check_include_files(stdlib.h   HAVE_STDLIB_H)
  check_include_files(pthread.h    HAVE_PTHREAD_H)
  check_include_files(dirent.h     HAVE_DIRENT_H)
 -check_include_files(inttypes.h   HAVE_INTTYPES_H)
 -check_include_files(regex.h      HAVE_REGEX_H)
 -check_include_files(sys/types.h  HAVE_SYS_TYPES_H)
 -check_include_files(sys/stat.h   HAVE_SYS_STAT_H)
  check_include_files(sys/time.h   HAVE_SYS_TIME_H)
 -check_include_files(rpc/rpc.h    HAVE_RPC_RPC_H)
 -check_include_files("rpc/rpc.h;rpc/xdr.h"    HAVE_RPC_XDR_H)
  check_include_files(io.h               HAVE_IO_H)
  check_include_files(sched.h      HAVE_SCHED_H)
  
 +check_include_files(regex.h      HAVE_POSIX_REGEX)
 +check_include_file_cxx(regex     HAVE_CXX11_REGEX)
 +# TODO: It could be nice to inform the user if no regex support is found,
 +# as selections won't be fully functional.
 +
  include(CheckFunctionExists)
 -check_function_exists(strcasecmp        HAVE_STRCASECMP)
  check_function_exists(strdup            HAVE_STRDUP)
 -check_function_exists(vprintf           HAVE_VPRINTF)
 -check_function_exists(memcmp            HAVE_MEMCMP)
  check_function_exists(posix_memalign    HAVE_POSIX_MEMALIGN)
  check_function_exists(memalign          HAVE_MEMALIGN)
  check_function_exists(_aligned_malloc   HAVE__ALIGNED_MALLOC)
  check_function_exists(gettimeofday      HAVE_GETTIMEOFDAY)
 -check_function_exists(isnan             HAVE_ISNAN)
 -check_function_exists(_isnan            HAVE__ISNAN)
  check_function_exists(fsync             HAVE_FSYNC)
  check_function_exists(_fileno           HAVE__FILENO)
  check_function_exists(fileno            HAVE_FILENO)
@@@ -355,6 -403,7 +355,6 @@@ check_library_exists(m cbrt "" HAVE_CBR
  
  include(CheckTypeSize)
  
 -check_type_size("bool"          SIZEOF_BOOL) # will also set HAVE_BOOL
  check_type_size("int"           SIZEOF_INT) 
  check_type_size("long int"      SIZEOF_LONG_INT) 
  check_type_size("long long int" SIZEOF_LONG_LONG_INT) 
@@@ -367,6 -416,35 +367,6 @@@ else (CMAKE_C_SIZEOF_DATA_PTR EQUAL 8
      set(GMX_64_BIT FALSE)
  endif (CMAKE_C_SIZEOF_DATA_PTR EQUAL 8)
  
 -# Check for some basic types that we *need*, so set these to int if they are not present 
 -check_type_size(uid_t uid_t)
 -if(NOT uid_t)
 -  set(uid_t int)
 -else(NOT uid_t)
 -  set(uid_t 0)
 -endif(NOT uid_t)
 -
 -check_type_size(gid_t gid_t)
 -if(NOT gid_t)
 -  set(gid_t 1)
 -else(NOT gid_t)
 -  set(gid_t 0)
 -endif(NOT gid_t)
 -
 -check_type_size(size_t size_t)
 -if(NOT size_t)
 -  set(size_t int)
 -else(NOT size_t)
 -  set(size_t 0)
 -endif(NOT size_t)
 -
 -check_type_size(off_t off_t)
 -if(NOT off_t)
 -  set(off_t int)
 -else(NOT off_t)
 -  set(off_t 0)
 -endif(NOT off_t)
 -
  include(TestBigEndian)
  test_big_endian(GMX_INTEGER_BIG_ENDIAN)
  
@@@ -425,17 -503,12 +425,17 @@@ if(UNIX AND GMX_PREFER_STATIC_LIBS
  endif()
  
  IF( WIN32 AND NOT CYGWIN)
 +  # This makes windows.h not declare min/max as macros that would break
 +  # C++ code using std::min/std::max.
 +  add_definitions(-DNOMINMAX)
 +
    if (NOT BUILD_SHARED_LIBS)
        option(GMX_PREFER_STATIC_LIBS "When finding libraries prefer static system libraries (MT instead of MD)!" ON)
        if(NOT GMX_PREFER_STATIC_LIBS)
            message(WARNING "Shared system libraries requested, and static Gromacs libraries requested.")
        endif()
    else()
 +      message(FATAL_ERROR "BUILD_SHARED_LIBS not yet working for Windows in the master branch")
        option(GMX_PREFER_STATIC_LIBS "When finding libraries prefer static system libraries (MT instead of MD)!" OFF)
        if(GMX_PREFER_STATIC_LIBS)
            #this combination segfaults (illigal passing of file handles)
    ENDIF()
  ENDIF()
  
 +# Unconditionally find the package, as it is also required for unit tests
 +find_package(LibXml2)
 +option(GMX_XML "Use libxml2 to parse xml files (currently has no effect)" ${LIBXML2_FOUND})
 +set(PKG_XML "")
 +mark_as_advanced(GMX_XML)
 +# Don't actually do anything, since libxml2 is currently not used by libgromacs
 +#if(GMX_XML AND NOT LIBXML2_FOUND)
 +#    message(FATAL_ERROR "libxml2 not found. Set GMX_XML=OFF to compile without XML support")
 +#endif()
 +#if(GMX_XML)
 +#    include_directories(${LIBXML2_INCLUDE_DIR})
 +#    set(PKG_XML libxml-2.0)
 +#    set(XML_LIBRARIES ${LIBXML2_LIBRARIES})
 +#endif(GMX_XML)
 +
  option(GMX_GSL "Add support for gsl" OFF)
  if (GMX_GSL)
    find_package(GSL)
@@@ -500,12 -558,11 +500,14 @@@ endif(GMX_X11
  include(ThreadMPI)
  set(THREAD_MPI_LIB thread_mpi)
  if(GMX_THREAD_MPI)
-     tmpi_get_source_list(THREAD_MPI_SRC CXX)
 -    tmpi_enable()
++    tmpi_enable(CXX)
      set(PKG_CFLAGS "${PKG_CFLAGS} -DGMX_THREAD_MPI")
      set(GMX_MPI 1)
-     tmpi_get_source_list(THREAD_MPI_SRC CXX NOMPI)
++    tmpi_get_source_list(THREAD_MPI_SRC)
 +else(GMX_THREAD_MPI)
++    tmpi_enable(CXX NOMPI)
++    tmpi_get_source_list(THREAD_MPI_SRC)
  endif(GMX_THREAD_MPI)
 -tmpi_get_source_list(THREAD_MPI_SRC)
  
  if(GMX_GPU)
      # now that we have detected the dependencies, do the second configure pass
@@@ -526,65 -583,30 +528,65 @@@ if(WIN32 AND NOT CYGWIN
  endif()
  
  # only bother with finding git and using version.h if the source is a git repo
 -if(EXISTS "${CMAKE_SOURCE_DIR}/.git")
 -    if(USE_VERSION_H)
 -        # We need at least git v1.5.3 be able to parse git's date output. If not
 -        # found or the version is too small, we can't generate version information.
 -        find_package(Git)
 -
 -      # Find out the git version
 -      if(GIT_FOUND AND NOT GIT_VERSION)
 -        execute_process(COMMAND ${GIT_EXECUTABLE} "--version"
 -            OUTPUT_VARIABLE _exec_out
 -            OUTPUT_STRIP_TRAILING_WHITESPACE)
 -        string(REGEX REPLACE "git version (.*)" "\\1" GIT_VERSION ${_exec_out})
 -        set(GIT_VERSION ${GIT_VERSION} CACHE STRING "Git version")
 -        mark_as_advanced(GIT_VERSION)
 -      endif()
 -
 -        if(NOT GIT_FOUND OR GIT_VERSION VERSION_LESS "1.5.3")
 -          message("No compatible git version found, won't be able to generate proper development version information.")
 -          set(USE_VERSION_H OFF)
 -        endif()
 +if(GMX_GIT_VERSION_INFO)
 +    if (NOT SOURCE_IS_GIT_REPOSITORY)
 +        message(FATAL_ERROR
 +            "Cannot generate git version information from source tree not under git. "
 +            "Set GMX_GIT_VERSION_INFO=OFF to proceed.")
 +    endif ()
 +    # We need at least git v1.5.3 to be able to parse git's date output. If not
 +    # found or the version is too small, we can't generate version information.
 +    find_package(Git)
 +
 +    # Find out the git version
 +    if(GIT_FOUND AND NOT GIT_VERSION)
 +      execute_process(COMMAND ${GIT_EXECUTABLE} "--version"
 +        OUTPUT_VARIABLE _exec_out
 +        OUTPUT_STRIP_TRAILING_WHITESPACE)
 +      string(REGEX REPLACE "git version (.*)" "\\1" GIT_VERSION ${_exec_out})
 +      set(GIT_VERSION ${GIT_VERSION} CACHE STRING "Git version")
 +      mark_as_advanced(GIT_VERSION)
 +    endif()
 +
 +    if(NOT GIT_FOUND OR GIT_VERSION VERSION_LESS "1.5.3")
 +        message(FATAL_ERROR
 +            "No compatible git version found (>= 1.5.3 required). "
 +            "Won't be able to generate development version information. "
 +            "Set GMX_GIT_VERSION_INFO=OFF to proceed.")
 +    endif()
 +endif()
 +
 +# Detect boost unless GMX_EXTERNAL_BOOST is explicitly OFF
 +# Used for default if GMX_EXTERNAL_BOOST is not defined (first CMake pass)
 +if(NOT DEFINED GMX_EXTERNAL_BOOST OR GMX_EXTERNAL_BOOST)
 +    find_package(Boost 1.44.0)
 +    if(Boost_FOUND AND Boost_VERSION VERSION_LESS "104400")
 +        set(Boost_FOUND FALSE)
 +    endif()
 +    # Print the notification only on first run, when determining the default
 +    if(NOT DEFINED GMX_EXTERNAL_BOOST AND NOT Boost_FOUND)
 +        message("Boost >= 1.44 not found. Using minimal internal version. "
 +                "This may cause trouble if you plan on compiling/linking other "
 +                "software that uses Boost against Gromacs.")
      endif()
 -else()
 -    set(USE_VERSION_H OFF)
  endif()
 +option(GMX_EXTERNAL_BOOST "Use external Boost instead of minimal built-in version"
 +       ${Boost_FOUND})
 +if(GMX_EXTERNAL_BOOST AND NOT Boost_FOUND)
 +    message(FATAL_ERROR
 +        "Boost >= 1.44 not found. "
 +        "You can set GMX_EXTERNAL_BOOST=OFF to compile against minimal "
 +        "version of Boost included with Gromacs.")
 +endif()
 +
 +option(GMX_BUILD_UNITTESTS "Build unit tests with BUILD_TESTING (uses Google C++ Testing and Mocking Frameworks, requires libxml2)" ${LIBXML2_FOUND})
 +mark_as_advanced(GMX_BUILD_UNITTESTS)
 +if (GMX_BUILD_UNITTESTS AND NOT LIBXML2_FOUND)
 +    message(FATAL_ERROR
 +        "Cannot build unit tests without libxml2. "
 +        "Either set GMX_BUILD_UNITTESTS=OFF or tell CMake how to find libxml2.")
 +endif()
 +set(MEMORYCHECK_SUPPRESSIONS_FILE ${CMAKE_SOURCE_DIR}/cmake/legacy_and_external.supp)
  
  ########################################################################
  # Generate development version info for cache
  ########################################################################
  
  add_definitions( -DHAVE_CONFIG_H )
 +include_directories(${CMAKE_SOURCE_DIR}/src)
 +# Required for config.h, maybe should only be set in src/CMakeLists.txt
  include_directories(${CMAKE_BINARY_DIR}/src)
 -include_directories(${CMAKE_BINARY_DIR}/include)
 -include_directories(${CMAKE_SOURCE_DIR}/include)
 +# Required for gmx_header_config_gen.h to be found before installation
 +include_directories(${CMAKE_BINARY_DIR}/src/gromacs/utility)
 +# Required for now to make old code compile
 +include_directories(${CMAKE_SOURCE_DIR}/src/gromacs/legacyheaders)
  
  include(gmxTestInlineASM)
  gmx_test_inline_asm_gcc_x86(GMX_X86_GCC_INLINE_ASM)
@@@ -640,14 -658,6 +642,14 @@@ gmx_test_isfinite(HAVE_ISFINITE
  gmx_test__isfinite(HAVE__ISFINITE)
  gmx_test__finite(HAVE__FINITE)
  
 +include(gmxTestCXX11)
 +gmx_test_cxx11(GMX_CXX11 GMX_CXX11_FLAGS)
 +if(CXX11_FLAG AND GMX_GPU)
 +    #FIXME: add a proper solution to propagate all but the cxx11 flag
 +    set(CUDA_PROPAGATE_HOST_FLAGS no)
 +    message(WARNING "Please manually add compiler flags to CUDA_NVCC_FLAGS. Automatic propogation temporary not working.")
 +endif()
 +
  include(gmxTestXDR)
  gmx_test_xdr(GMX_SYSTEM_XDR)
  if(NOT GMX_SYSTEM_XDR)
@@@ -677,10 -687,12 +679,10 @@@ elseif(${GMX_CPU_ACCELERATION} STREQUA
          GMX_TEST_CFLAG(MSVC_SSE2_CFLAG "/arch:SSE2" ACCELERATION_C_FLAGS)
      endif(NOT GNU_SSE2_CFLAG AND GMX_NATIVE_WINDOWS)
  
 -    if (CMAKE_CXX_COMPILER_LOADED)
 -        GMX_TEST_CXXFLAG(GNU_SSE2_CXXFLAG "-msse2" ACCELERATION_CXX_FLAGS)
 -        if(NOT GNU_SSE2_CXXFLAG AND GMX_NATIVE_WINDOWS)
 -            GMX_TEST_CXXFLAG(MSVC_SSE2_CXXFLAG "/arch:SSE2" ACCELERATION_CXX_FLAGS)
 -        endif(NOT GNU_SSE2_CXXFLAG AND GMX_NATIVE_WINDOWS)
 -    endif()
 +    GMX_TEST_CXXFLAG(GNU_SSE2_CXXFLAG "-msse2" ACCELERATION_CXX_FLAGS)
 +    if(NOT GNU_SSE2_CXXFLAG AND GMX_NATIVE_WINDOWS)
 +        GMX_TEST_CXXFLAG(MSVC_SSE2_CXXFLAG "/arch:SSE2" ACCELERATION_CXX_FLAGS)
 +    endif(NOT GNU_SSE2_CXXFLAG AND GMX_NATIVE_WINDOWS)
  
      # We don't warn for lacking SSE2 flag support, since that is probably standard today.
  
@@@ -715,18 -727,20 +717,18 @@@ elseif(${GMX_CPU_ACCELERATION} STREQUA
          endif()
      endif(NOT GNU_SSE4_CFLAG AND NOT MSVC_SSE4_CFLAG)
  
 -    if (CMAKE_CXX_COMPILER_LOADED)
 -        GMX_TEST_CXXFLAG(GNU_SSE4_CXXFLAG "-msse4.1" ACCELERATION_CXX_FLAGS)
 -        if (NOT GNU_SSE4_CXXFLAG AND GMX_NATIVE_WINDOWS)
 -            GMX_TEST_CXXFLAG(MSVC_SSE4_CXXFLAG "/arch:SSE4.1" ACCELERATION_CXX_FLAGS)
 -        endif(NOT GNU_SSE4_CXXFLAG AND GMX_NATIVE_WINDOWS)
 -        if (NOT GNU_SSE4_CXXFLAG AND NOT MSVC_SSE4_CXXFLAG) 
 -            message(WARNING "No C++ SSE4.1 flag found. Consider a newer compiler, or use SSE2 for slightly lower performance.")
 -            # Not surprising if we end up here! MSVC current does not support the SSE4.1 flag. However, it appears to accept SSE4.1
 -            # intrinsics when SSE2 support is enabled, so we try that instead.
 -            if (GMX_NATIVE_WINDOWS)
 -                GMX_TEST_CXXFLAG(MSVC_SSE2_CXXFLAG "/arch:SSE2" ACCELERATION_CXX_FLAGS)
 -            endif()
 -        endif(NOT GNU_SSE4_CXXFLAG AND NOT MSVC_SSE4_CXXFLAG)
 -    endif()
 +    GMX_TEST_CXXFLAG(GNU_SSE4_CXXFLAG "-msse4.1" ACCELERATION_CXX_FLAGS)
 +    if (NOT GNU_SSE4_CXXFLAG AND GMX_NATIVE_WINDOWS)
 +        GMX_TEST_CXXFLAG(MSVC_SSE4_CXXFLAG "/arch:SSE4.1" ACCELERATION_CXX_FLAGS)
 +    endif(NOT GNU_SSE4_CXXFLAG AND GMX_NATIVE_WINDOWS)
 +    if (NOT GNU_SSE4_CXXFLAG AND NOT MSVC_SSE4_CXXFLAG)
 +        message(WARNING "No C++ SSE4.1 flag found. Consider a newer compiler, or use SSE2 for slightly lower performance.")
 +        # Not surprising if we end up here! MSVC currently does not support the SSE4.1 flag. However, it appears to accept SSE4.1
 +        # intrinsics when SSE2 support is enabled, so we try that instead.
 +        if (GMX_NATIVE_WINDOWS)
 +            GMX_TEST_CXXFLAG(MSVC_SSE2_CXXFLAG "/arch:SSE2" ACCELERATION_CXX_FLAGS)
 +        endif()
 +    endif(NOT GNU_SSE4_CXXFLAG AND NOT MSVC_SSE4_CXXFLAG)
  
      # This must come after we have added the -msse4.1 flag on some platforms.
      check_include_file(smmintrin.h  HAVE_SMMINTRIN_H ${ACCELERATION_C_FLAGS})
@@@ -758,13 -772,15 +760,13 @@@ elseif(${GMX_CPU_ACCELERATION} STREQUA
          message(WARNING "No C AVX flag found. Consider a newer compiler, or try SSE4.1 (lower performance).")
      endif (NOT GNU_AVX_CFLAG AND NOT MSVC_AVX_CFLAG)
  
 -    if (CMAKE_CXX_COMPILER_LOADED)
 -        GMX_TEST_CXXFLAG(GNU_AVX_CXXFLAG "-mavx" ACCELERATION_CXX_FLAGS)
 -        if (NOT GNU_AVX_CXXFLAG AND GMX_NATIVE_WINDOWS)
 -            GMX_TEST_CXXFLAG(MSVC_AVX_CXXFLAG "/arch:AVX" ACCELERATION_CXX_FLAGS)
 -        endif (NOT GNU_AVX_CXXFLAG AND GMX_NATIVE_WINDOWS)
 -        if (NOT GNU_AVX_CXXFLAG AND NOT MSVC_AVX_CXXFLAG)
 -            message(WARNING "No C++ AVX flag found. Consider a newer compiler, or try SSE4.1 (lower performance).")
 -        endif (NOT GNU_AVX_CXXFLAG AND NOT MSVC_AVX_CXXFLAG)
 -    endif()
 +    GMX_TEST_CXXFLAG(GNU_AVX_CXXFLAG "-mavx" ACCELERATION_CXX_FLAGS)
 +    if (NOT GNU_AVX_CXXFLAG AND GMX_NATIVE_WINDOWS)
 +        GMX_TEST_CXXFLAG(MSVC_AVX_CXXFLAG "/arch:AVX" ACCELERATION_CXX_FLAGS)
 +    endif (NOT GNU_AVX_CXXFLAG AND GMX_NATIVE_WINDOWS)
 +    if (NOT GNU_AVX_CXXFLAG AND NOT MSVC_AVX_CXXFLAG)
 +       message(WARNING "No C++ AVX flag found. Consider a newer compiler, or try SSE4.1 (lower performance).")
 +    endif (NOT GNU_AVX_CXXFLAG AND NOT MSVC_AVX_CXXFLAG)
  
      # Set the FMA4 flags (MSVC doesn't require any)
      if(${GMX_CPU_ACCELERATION} STREQUAL "AVX_128_FMA" AND NOT MSVC)
@@@ -858,9 -874,10 +860,10 @@@ elseif(${GMX_CPU_ACCELERATION} STREQUA
      else()
          message(FATAL_ERROR "Cannot compile IBM QPX intrinsics without the XL compiler. If you are compiling for BlueGene/Q, use 'cmake .. -DCMAKE_TOOLCHAIN_FILE=BlueGeneQ-static-XL-C' to set up the tool chain.")
      endif()
+ elseif(${GMX_CPU_ACCELERATION} STREQUAL "SPARC64_HPC_ACE")
+     set(GMX_CPU_ACCELERATION_SPARC64_HPC_ACE 1)
  else(${GMX_CPU_ACCELERATION} STREQUAL "NONE")
-     MESSAGE(FATAL_ERROR "Unrecognized option for accelerated kernels: ${GMX_CPU_ACCELERATION}. Pick one of None, SSE2, SSE4.1, AVX_128_FMA, AVX_256, IBM_QPX")
+     MESSAGE(FATAL_ERROR "Unrecognized option for accelerated kernels: ${GMX_CPU_ACCELERATION}. Pick one of None, SSE2, SSE4.1, AVX_128_FMA, AVX_256, IBM_QPX, Sparc64_HPC_ACE")
  endif(${GMX_CPU_ACCELERATION} STREQUAL "NONE")
  set(ACCELERATION_QUIETLY TRUE CACHE INTERNAL "")
  
@@@ -912,11 -929,11 +915,11 @@@ if(${GMX_FFT_LIBRARY} STREQUAL "FFTW3"
  
      set(GMX_FFT_FFTW3 1)
  
-     if (NOT ${GMX_CPU_ACCELERATION} STREQUAL "NONE" AND NOT ${FFTW}_HAVE_SIMD) 
+     if ((${GMX_CPU_ACCELERATION} MATCHES "SSE" OR ${GMX_CPU_ACCELERATION} MATCHES "AVX") AND NOT ${FFTW}_HAVE_SIMD)
        message(WARNING "The fftw library found is compiled without SIMD support, which makes it slow. Consider recompiling it or contact your admin")
      endif()
  
-     if(NOT ${GMX_CPU_ACCELERATION} STREQUAL "NONE" AND ${FFTW}_HAVE_AVX)
+     if((${GMX_CPU_ACCELERATION} MATCHES "SSE" OR ${GMX_CPU_ACCELERATION} MATCHES "AVX") AND ${FFTW}_HAVE_AVX)
          # If we're not doing CPU acceleration, we don't care about FFTW performance on x86 either
          message(WARNING "The FFTW library was compiled with --enable-avx to enable AVX SIMD instructions. That might sound like a good idea for your processor, but for FFTW versions up to 3.3.3, these are slower than the SSE/SSE2 SIMD instructions for the way GROMACS uses FFTs. Limitations in the way FFTW allows GROMACS to measure performance make it awkward for either GROMACS or FFTW to make the decision for you based on runtime performance. You should compile a different FFTW library with --enable-sse or --enable-sse2. If you have a more recent FFTW, you may like to compare the performance of GROMACS with FFTW libraries compiled with and without --enable-avx. However, the GROMACS developers do not really expect the FFTW AVX optimization to help, because the performance is limited by memory access, not computation.")
      endif()
@@@ -1087,7 -1104,7 +1090,7 @@@ endif(GMX_FAHCORE
  # these are set after everything else
  if (NOT GMX_SKIP_DEFAULT_CFLAGS)
      set(CMAKE_C_FLAGS "${ACCELERATION_C_FLAGS} ${MPI_COMPILE_FLAGS} ${CMAKE_C_FLAGS}")
 -    set(CMAKE_CXX_FLAGS "${ACCELERATION_CXX_FLAGS} ${MPI_COMPILE_FLAGS} ${CMAKE_CXX_FLAGS}")
 +    set(CMAKE_CXX_FLAGS "${ACCELERATION_CXX_FLAGS} ${MPI_COMPILE_FLAGS} ${GMX_CXX11_FLAGS} ${CMAKE_CXX_FLAGS}")
      set(CMAKE_EXE_LINKER_FLAGS "${FFT_LINKER_FLAGS} ${MPI_LINKER_FLAGS} ${CMAKE_EXE_LINKER_FLAGS}")
      set(CMAKE_SHARED_LINKER_FLAGS "${MPI_LINKER_FLAGS} ${CMAKE_SHARED_LINKER_FLAGS}")
  else()
      message("CMAKE_C_FLAGS: ${ACCELERATION_C_FLAGS} ${MPI_COMPILE_FLAGS} ${GMXC_CFLAGS}")
      message("CMAKE_C_FLAGS_RELEASE: ${GMXC_CFLAGS_RELEASE}")
      message("CMAKE_C_FLAGS_DEBUG: ${GMXC_CFLAGS_DEBUG}")
 -    if(CMAKE_CXX_COMPILER_LOADED)
 -        message("CMAKE_CXX_FLAGS: ${ACCELERATION_CXX_FLAGS} ${MPI_COMPILE_FLAGS} ${GMXC_CXXFLAGS}")
 -        message("CMAKE_CXX_FLAGS_RELEASE: ${GMXC_CXXFLAGS_RELEASE}")
 -        message("CMAKE_CXX_FLAGS_DEBUG: ${GMXC_CXXFLAGS_DEBUG}")
 -    endif()
 +    message("CMAKE_CXX_FLAGS: ${ACCELERATION_CXX_FLAGS} ${MPI_COMPILE_FLAGS} ${GMX_CXX11_FLAGS} ${GMXC_CXXFLAGS}")
 +    message("CMAKE_CXX_FLAGS_RELEASE: ${GMXC_CXXFLAGS_RELEASE}")
 +    message("CMAKE_CXX_FLAGS_DEBUG: ${GMXC_CXXFLAGS_DEBUG}")
      message("CMAKE_EXE_LINKER_FLAGS: ${FFT_LINKER_FLAGS} ${MPI_LINKER_FLAGS}")
      message("CMAKE_SHARED_LINKER_FLAGS: ${MPI_LINKER_FLAGS}")
  endif()
@@@ -1107,9 -1126,8 +1110,9 @@@ if(NOT GMX_OPENMP
      #or because it was only partially detected (e.g. only for C but not C++ compiler)
      unset(OpenMP_C_FLAGS CACHE) 
      unset(OpenMP_CXX_FLAGS CACHE)
 -    unset(OpenMP_LINKER_FLAGS CACHE)
 -    unset(OpenMP_SHARED_LINKER_FLAGS)
 +else()
 +    set(GMX_EXE_LINKER_FLAGS ${GMX_EXE_LINKER_FLAGS} ${OpenMP_LINKER_FLAGS})
 +    set(GMX_SHARED_LINKER_FLAGS ${GMX_SHARED_LINKER_FLAGS} ${OpenMP_SHARED_LINKER_FLAGS})
  endif()
  set(PKG_CFLAGS "${PKG_CFLAGS} ${OpenMP_C_FLAGS}")
  
@@@ -1122,8 -1140,15 +1125,8 @@@ if (CMAKE_CXX_COMPILER_LOADED
  endif ()
  
  ########################################################################
 -# Specify install locations and which subdirectories to process        #
 +# Specify install locations
  ########################################################################
 -if (GMX_USE_RELATIVE_INSTALL_PATH)
 -    set(GMX_INSTALL_PREFIX "" CACHE STRING "Prefix gets appended to CMAKE_INSTALL_PREFIX. For cpack it sets the root folder of the archive.")
 -    mark_as_advanced(GMX_INSTALL_PREFIX)
 -else()
 -    set(GMX_INSTALL_PREFIX "${CMAKE_INSTALL_PREFIX}/")
 -endif()
 -
  if ( NOT DEFINED GMXLIB )
      set(GMXLIB lib)
  endif()
@@@ -1136,43 -1161,21 +1139,43 @@@ set(INCL_INSTALL_DIR ${GMX_INSTALL_PREF
  set(GMXLIBDIR        ${DATA_INSTALL_DIR}/top)
  
  ##################################################################
 -# Shared library settings - Darwin uses INSTALL_NAME_DIR instead!
 +# Shared library settings
  ##################################################################
  if(NOT CMAKE_SYSTEM_NAME STREQUAL "Darwin")
      set(CMAKE_SKIP_BUILD_RPATH  FALSE)
      set(CMAKE_BUILD_WITH_INSTALL_RPATH FALSE)
      set(CMAKE_INSTALL_RPATH "\\\$ORIGIN/../${GMXLIB}")
      set(CMAKE_INSTALL_RPATH_USE_LINK_PATH TRUE)
 +else()
 +    if(CMAKE_SYSTEM_VERSION VERSION_GREATER 8.0) #rpath supported for >10.4
 +        set(CMAKE_INSTALL_NAME_DIR "@rpath")
 +        set(GMX_EXE_LINKER_FLAGS ${GMX_EXE_LINKER_FLAGS} "-Wl,-rpath,@executable_path/../lib")
 +    else()
 +        set(CMAKE_INSTALL_NAME_DIR "${CMAKE_INSTALL_PREFIX}/${LIB_INSTALL_DIR}")
 +    endif()
  endif()
  
  #COPYING file: Only necessary for binary distributions.
  #Simpler to always install.
  install(FILES COPYING DESTINATION ${DATA_INSTALL_DIR} COMPONENT data)
  
 +if(GMX_EXTERNAL_BOOST)
 +    include_directories(${Boost_INCLUDE_DIRS})
 +else()
 +    include_directories(${CMAKE_SOURCE_DIR}/src/external/boost)
 +    # typeid not supported for minimal internal version
 +    # (would add significant amount of code)
 +    add_definitions(-DBOOST_NO_TYPEID)
 +    # TODO: Propagate the above settings to the installed CMakeLists.txt template
 +    # (from share/template/)
 +    set(PKG_CFLAGS "${PKG_CFLAGS} -DBOOST_NO_TYPEID -I${INCL_INSTALL_DIR}/gromacs/external/boost")
 +    install(DIRECTORY ${CMAKE_SOURCE_DIR}/src/external/boost/boost
 +            DESTINATION ${INCL_INSTALL_DIR}/gromacs/external/boost
 +            COMPONENT development)
 +endif()
 +
 +add_subdirectory(doxygen)
  add_subdirectory(share)
 -add_subdirectory(include)
  add_subdirectory(src)
  add_subdirectory(scripts)
  
@@@ -1205,13 -1208,16 +1208,13 @@@ ADD_CUSTOM_TARGET(uninstal
  include(CTest)
  mark_as_advanced(BUILD_TESTING)
  #gmxtests target builds all binaries required for running gmxtest
 -add_custom_target(gmxtests DEPENDS grompp mdrun pdb2gmx gmxcheck editconf)
 +add_custom_target(gmxtests DEPENDS grompp mdrun pdb2gmx gmxcheck gmx links)
  IF(BUILD_TESTING)
      enable_testing()
      add_subdirectory(tests)
 -    if(REGRESSIONTEST_PATH)
 -        #check target builds all to run tests and the runs tests
 -        add_custom_target(check COMMAND ${CMAKE_CTEST_COMMAND} --output-on-failure)
 -        add_dependencies(check gmxtests)
 -    else()
 -        add_custom_target(check COMMAND ${CMAKE_COMMAND} -E echo "WARNING: No tests are run. Running the tests requires either of the cmake variables REGRESSIONTEST_PATH or REGRESSIONTEST_DOWNLOAD to be set.")
 -    endif()
 +    #"check" target builds and runs all tests
 +    add_custom_target(check COMMAND ${CMAKE_CTEST_COMMAND} --output-on-failure)
 +    add_dependencies(check gmxtests tests)
 +    #TODO: Add warning if NOT REGRESSIONTEST_PATH OR NOT GMX_XML that regression/unit tests are not run.
  ENDIF()
  
diff --combined cmake/ThreadMPI.cmake
index fc9183afc2f1a78240caa8b1fbe34496b9cee505,cc6822c74839af8c31e22af3115cba5816b8b0c5..bcd280ae08d48b7d9d2e31d4a1c8c9a740d73231
 -# This source code file is part of thread_mpi.
 -# Written by Sander Pronk, Erik Lindahl, and possibly others.
 -#
 -# Copyright (c) 2009, Sander Pronk, Erik Lindahl.
 -# All rights reserved.
 -#
 -# Redistribution and use in source and binary forms, with or without
 -# modification, are permitted provided that the following conditions are met:
 -# 1) Redistributions of source code must retain the above copyright
 -# notice, this list of conditions and the following disclaimer.
 -# 2) Redistributions in binary form must reproduce the above copyright
 -# notice, this list of conditions and the following disclaimer in the
 -# documentation and/or other materials provided with the distribution.
 -# 3) Neither the name of the copyright holders nor the
 -# names of its contributors may be used to endorse or promote products
 -# derived from this software without specific prior written permission.
 -#
 -# THIS SOFTWARE IS PROVIDED BY US ''AS IS'' AND ANY
 -# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 -# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 -# DISCLAIMED. IN NO EVENT SHALL WE BE LIABLE FOR ANY
 -# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 -# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 -# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 -# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 -# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 -# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 -#
 -# If you want to redistribute modifications, please consider that
 -# scientific software is very special. Version control is crucial -
 -# bugs must be traceable. We will be happy to consider code for
 -# inclusion in the official distribution, but derived work should not
 -# be called official thread_mpi. Details are found in the README & COPYING
 -# files.
  
  include(CheckIncludeFiles)
  include(CheckFunctionExists)
- #include(CheckCSourceCompiles)
- #option(THREAD_PTHREADS "Use posix threads" ON)
+ include(CheckCSourceCompiles)
  
- MACRO(TEST_TMPI_ATOMICS VARIABLE)
+ # sets TMPI_ATOMICS to 1 if atomic operations are found, 0 otherwise
+ MACRO(TMPI_TEST_ATOMICS)
      if (NOT DEFINED TMPI_ATOMICS)
          try_compile(TEST_ATOMICS "${CMAKE_BINARY_DIR}"
                  "${CMAKE_SOURCE_DIR}/cmake/TestAtomics.c"
 -                COMPILE_DEFINITIONS "-I${CMAKE_SOURCE_DIR}/include" )
 +                COMPILE_DEFINITIONS "-I${CMAKE_SOURCE_DIR}/src/gromacs/legacyheaders" )
 +
          if (TEST_ATOMICS)
-             message(STATUS "Atomics found")
-             set(${VARIABLE} TRUE CACHE INTERNAL "Whether atomic operations for thread-MPI were found")
+             message(STATUS "Atomic operations found")
          else (TEST_ATOMICS)
-             message(WARNING "Atomic operations not found for this CPU+compiler combination. Thread support will be unbearably slow: disable threads. Atomic operations should work on all but the most obscure CPU+compiler combinations; if your system is not obscure -- like, for example, x86 with gcc --  please contact the developers.")
-             set(${VARIABLE} FALSE CACHE INTERNAL "Whether atomic operations for thread-MPI were found")
+             message(STATUS "Atomic operations not found")
          endif(TEST_ATOMICS)
+         set(TMPI_ATOMICS ${TEST_ATOMICS} CACHE INTERNAL "Whether atomic operations are found")
      endif(NOT DEFINED TMPI_ATOMICS)
- ENDMACRO(TEST_TMPI_ATOMICS VARIABLE)
+ ENDMACRO(TMPI_TEST_ATOMICS VARIABLE)
  
- MACRO(TMPI_MAKE_CXX_LIB)
-     set(TMPI_CXX_LIB 1)
- ENDMACRO(TMPI_MAKE_CXX_LIB)
+ TMPI_TEST_ATOMICS()
  
- MACRO(TMPI_GET_SOURCE_LIST SRC_VARIABLE)
+ include(FindThreads)
+ if (CMAKE_USE_PTHREADS_INIT)
+     check_include_files(pthread.h    HAVE_PTHREAD_H)
+     set(THREAD_PTHREADS 1)
+     set(THREAD_LIB ${CMAKE_THREAD_LIBS_INIT})
+ elseif (CMAKE_USE_WIN32_THREADS_INIT)
+     set(THREAD_WINDOWS 1)
+     set(THREAD_LIB)
+ else ()
+     message(FATAL_ERROR "Thread support required")
+ endif (CMAKE_USE_PTHREADS_INIT)
+ # Turns on thread_mpi.
+ # options are:
+ # CXX: enable C++ library build.
+ MACRO(TMPI_ENABLE)
+     # first check whether threads and atomics are available.
+     if(NOT TMPI_ATOMICS)
+         # check again, to allow the user to fix this.
+         unset(TMPI_ATOMICS CACHE)
+         TMPI_TEST_ATOMICS()
+     endif(NOT TMPI_ATOMICS)
+     if(NOT TMPI_ATOMICS)
+         message(WARNING "Atomic operations not found for this CPU+compiler combination. Thread support will be unbearably slow: disable threads. Atomic operations should work on all but the most obscure CPU+compiler combinations; if your system is not obscure -- like, for example, x86 with gcc --  please contact the developers.")
+     endif(NOT TMPI_ATOMICS)
+     set(TMPI_ENABLED 1)
      foreach (_option IN ITEMS ${ARGN})
          if (_option STREQUAL "CXX")
              set(TMPI_CXX_LIB 1)
 -        #elseif (_option STREQUAL "NOMPI")
 -        #            set(TMPI_NO_MPI_LIB 1)
 +        elseif (_option STREQUAL "NOMPI")
 +            set(TMPI_NO_MPI_LIB 1)
          else ()
              message(FATAL_ERROR "Unknown thread_mpi option '${_option}'")
          endif ()
      endforeach ()
+     #tmpi_test_atomics(TMPI_ATOMICS)
+ # the spin-waiting option
+     option(THREAD_MPI_WAIT_FOR_NO_ONE "Use busy waits without yielding to the OS scheduler. Turning this on might improve performance (very) slightly at the cost of very poor performance if the threads are competing for CPU time." OFF)
+     mark_as_advanced(THREAD_MPI_WAIT_FOR_NO_ONE)
+     if (THREAD_MPI_WAIT_FOR_NO_ONE)
+         set(TMPI_WAIT_FOR_NO_ONE 1)
+     else (THREAD_MPI_WAIT_FOR_NO_ONE)
+         set(TMPI_WAIT_FOR_NO_ONE 0)
+     endif (THREAD_MPI_WAIT_FOR_NO_ONE)
+ # the copy buffer option
+     option(THREAD_MPI_COPY_BUFFER "Use an intermediate copy buffer for small message sizes, to allow blocking sends to return quickly. Only useful in programs with relatively uncoupled threads (infrequent MPI communication)" OFF)
+     mark_as_advanced(THREAD_MPI_COPY_BUFFER)
+     if (THREAD_MPI_COPY_BUFFER)
+         set(TMPI_COPY_BUFFER 1)
+     else (THREAD_MPI_COPY_BUFFER)
+         set(TMPI_COPY_BUFFER 0)
+     endif (THREAD_MPI_COPY_BUFFER)
+ # the profiling option
+     option(THREAD_MPI_PROFILING "Turn on simple MPI profiling." OFF)
+     mark_as_advanced(THREAD_MPI_PROFILING)
+     if (THREAD_MPI_PROFILING)
+         set(TMPI_PROFILE 1)
+     else (THREAD_MPI_PROFILING)
+         set(TMPI_PROFILE 0)
+     endif (THREAD_MPI_PROFILING)
+ # tmpi warnings for testing
+     option(THREAD_MPI_WARNINGS "Turn thread_mpi warnings for testing." OFF)
+     mark_as_advanced(THREAD_MPI_WARNINGS)
+     if (THREAD_MPI_WARNINGS)
+         set(TMPI_WARNINGS 1)
+     else (THREAD_MPI_WARNINGS)
+         set(TMPI_WARNINGS 0)
+     endif (THREAD_MPI_WARNINGS)
+     include(CheckCSourceCompiles)
+ # affinity checks
+     include(CheckFunctionExists)
+     if (THREAD_PTHREADS)
+         set(CMAKE_REQUIRED_LIBRARIES ${CMAKE_THREAD_LIBS_INIT})
+         # check for pthread_setaffinity_np
+         check_c_source_compiles(
+             "#define _GNU_SOURCE
+ #include <pthread.h>
+ #include <stdlib.h>
+ #include <stdio.h>
+ #include <errno.h>
+     int main(void) { cpu_set_t set;
+         CPU_ZERO(&set);
+         CPU_SET(0, &set);
+         pthread_setaffinity_np(pthread_self(), sizeof(set), &set);
+         return 0;
+     }"
+             PTHREAD_SETAFFINITY
+         )
+         if (PTHREAD_SETAFFINITY)
+             set(HAVE_PTHREAD_SETAFFINITY 1)
+         endif (PTHREAD_SETAFFINITY)
+         set(CMAKE_REQUIRED_LIBRARIES)
+     endif (THREAD_PTHREADS)
+ # this runs on POSIX systems
+     check_include_files(unistd.h        HAVE_UNISTD_H)
+     check_include_files(sched.h         HAVE_SCHED_H)
+     check_include_files(sys/time.h      HAVE_SYS_TIME_H)
+     check_function_exists(sysconf       HAVE_SYSCONF)
+ # this runs on windows
+ #check_include_files(windows.h                HAVE_WINDOWS_H)
+ ENDMACRO(TMPI_ENABLE)
+ MACRO(TMPI_GET_SOURCE_LIST SRC_VARIABLE)
      set(${SRC_VARIABLE}
          thread_mpi/errhandler.c
-         thread_mpi/tmpi_malloc.c)
+         thread_mpi/tmpi_malloc.c
+         thread_mpi/atomic.c)
      if (THREAD_PTHREADS)
          list(APPEND ${SRC_VARIABLE} thread_mpi/pthreads.c)
      elseif (THREAD_WINDOWS)
      if (TMPI_CXX_LIB)
          list(APPEND ${SRC_VARIABLE} thread_mpi/system_error.cpp)
      endif (TMPI_CXX_LIB)
-     if (NOT TMPI_NO_MPI_LIB)
+     if (TMPI_ENABLED)
          list(APPEND ${SRC_VARIABLE}
               thread_mpi/alltoall.c      thread_mpi/p2p_protocol.c
               thread_mpi/barrier.c       thread_mpi/p2p_send_recv.c
      endif()
  ENDMACRO(TMPI_GET_SOURCE_LIST)
  
- test_tmpi_atomics(TMPI_ATOMICS)
- include(FindThreads)
- if (CMAKE_USE_PTHREADS_INIT)
-     check_include_files(pthread.h    HAVE_PTHREAD_H)
-     set(THREAD_PTHREADS 1)
-     #add_definitions(-DTHREAD_PTHREADS)
-     set(THREAD_LIB ${CMAKE_THREAD_LIBS_INIT})
- elseif (CMAKE_USE_WIN32_THREADS_INIT)
-     set(THREAD_WINDOWS 1)
-     #add_definitions(-DTHREAD_WINDOWS)
-     set(THREAD_LIB)
- else ()
-     message(FATAL_ERROR "Thread support required")
- endif (CMAKE_USE_PTHREADS_INIT)
- # the spin-waiting option
- option(THREAD_MPI_WAIT_FOR_NO_ONE "Use busy waits without yielding to the OS scheduler. Turning this on might improve performance (very) slightly at the cost of very poor performance if the threads are competing for CPU time." OFF)
- mark_as_advanced(THREAD_MPI_WAIT_FOR_NO_ONE)
- if (THREAD_MPI_WAIT_FOR_NO_ONE)
-     add_definitions(-DTMPI_WAIT_FOR_NO_ONE)
- else (THREAD_MPI_WAIT_FOR_NO_ONE)
-     add_definitions()
- endif (THREAD_MPI_WAIT_FOR_NO_ONE)
- # the copy buffer option
- option(THREAD_MPI_COPY_BUFFER "Use an intermediate copy buffer for small message sizes, to allow blocking sends to return quickly." ON)
- mark_as_advanced(THREAD_MPI_COPY_BUFFER)
- if (THREAD_MPI_COPY_BUFFER)
-     add_definitions()
- else (THREAD_MPI_COPY_BUFFER)
-     add_definitions(-DTMPI_NO_COPY_BUFFER)
- endif (THREAD_MPI_COPY_BUFFER)
- # the profiling option
- option(THREAD_MPI_PROFILING "Turn on simple MPI profiling." OFF)
- mark_as_advanced(THREAD_MPI_PROFILING)
- if (THREAD_MPI_PROFILING)
-     add_definitions(-DTMPI_PROFILE)
- else (THREAD_MPI_PROFILING)
-     add_definitions()
- endif (THREAD_MPI_PROFILING)
- include(CheckCSourceCompiles)
- # option to set affinity 
- option(THREAD_MPI_SET_AFFINITY "Set thread affinity to a core if number of threads equal to number of hardware threads." ON)
- mark_as_advanced(THREAD_MPI_SET_AFFINITY)
- if (THREAD_MPI_SET_AFFINITY)
-     add_definitions(-DTMPI_SET_AFFINITY)
- else (THREAD_MPI_SET_AFFINITY)
-     add_definitions()
- endif (THREAD_MPI_SET_AFFINITY)
- include(CheckFunctionExists)
- if (THREAD_PTHREADS)
-     set(CMAKE_REQUIRED_LIBRARIES ${CMAKE_THREAD_LIBS_INIT})
-     # check for sched_setaffinity
-     check_c_source_compiles(
-         "#define _GNU_SOURCE
- #include <pthread.h>
- #include <stdlib.h>
- #include <stdio.h>
- #include <errno.h>
- int main(void) { cpu_set_t set;
-     CPU_ZERO(&set);
-     CPU_SET(0, &set);
-     pthread_setaffinity_np(pthread_self(), sizeof(set), &set);
-     return 0;
- }"
-         PTHREAD_SETAFFINITY
-     )
-     if (PTHREAD_SETAFFINITY)
-         set(HAVE_PTHREAD_SETAFFINITY 1)
-     endif (PTHREAD_SETAFFINITY)
-     set(CMAKE_REQUIRED_LIBRARIES)
- endif (THREAD_PTHREADS)
- # this runs on POSIX systems
- check_include_files(unistd.h        HAVE_UNISTD_H)
- check_include_files(sched.h         HAVE_SCHED_H)
- check_include_files(sys/time.h      HAVE_SYS_TIME_H)
- check_function_exists(sysconf       HAVE_SYSCONF)
- # this runs on windows
- #check_include_files(windows.h                HAVE_WINDOWS_H)
diff --combined src/config.h.cmakein
index 2c003ba0245230b628ef7f216721b50d88d83f15,ffa22a1fab7bae5ea8f31b33d3ce167d8c5ad964..619ee23fba54238f3b2b29bee9c70ba72cea7761
@@@ -1,16 -1,38 +1,16 @@@
 -/*
 - * This file is part of the GROMACS molecular simulation package.
 +/*! \libinternal \file
 + * \brief
 + * Include file for configuration macros from the build system.
   *
 - * Copyright (c) 2012, by the GROMACS development team, led by
 - * David van der Spoel, Berk Hess, Erik Lindahl, and including many
 - * others, as listed in the AUTHORS file in the top-level source
 - * directory and at http://www.gromacs.org.
 + * This header is not installed, so headers must not reference macros defined
 + * here.
   *
 - * GROMACS is free software; you can redistribute it and/or
 - * modify it under the terms of the GNU Lesser General Public License
 - * as published by the Free Software Foundation; either version 2.1
 - * of the License, or (at your option) any later version.
 - *
 - * GROMACS is distributed in the hope that it will be useful,
 - * but WITHOUT ANY WARRANTY; without even the implied warranty of
 - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 - * Lesser General Public License for more details.
 - *
 - * You should have received a copy of the GNU Lesser General Public
 - * License along with GROMACS; if not, see
 - * http://www.gnu.org/licenses, or write to the Free Software Foundation,
 - * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
 - *
 - * If you want to redistribute modifications to GROMACS, please
 - * consider that scientific software is very special. Version
 - * control is crucial - bugs must be traceable. We will be happy to
 - * consider code for inclusion in the official distribution, but
 - * derived work must not be called official GROMACS. Details are found
 - * in the README & COPYING files - if they are missing, get the
 - * official version at http://www.gromacs.org.
 - *
 - * To help us fund GROMACS development, we humbly ask that you cite
 - * the research papers on the package. Check out http://www.gromacs.org.
 + * \inlibraryapi
   */
 -/* Manually created from config.h.in to work with cmake */
 +#include "gromacs/utility/gmx_header_config.h"
 +
 +/* TODO: For now, disable Doxygen warnings from here */
 +/*! \cond */
  
  /* Disable warnings about double-to-float conversion accuracy loss on MSVC */
  #ifdef _MSC_VER
  #pragma warning (disable : 4090)
  #endif
  
 -#include "gmx_header_config.h"
 -
  /* Name of package (translate from cmake to autoconf macro name) */
  #define PACKAGE  "@PROJECT_NAME@"
  
  /* Version number of package (translate from cmake to autoconf macro name) */
  #define VERSION  "@PROJECT_VERSION@"
  
 -/* Use the version string from generated version.h */
 -#cmakedefine USE_VERSION_H
 +/* Use extra version information generated with git */
 +#cmakedefine GMX_GIT_VERSION_INFO
  
  /* Default location of data files */
  #define GMXLIBDIR "@GMXLIBDIR@"
  
 +/* Binary suffix for the created binaries */
 +#define GMX_BINARY_SUFFIX "@GMX_BINARY_SUFFIX@"
 +
 +/* Source directory for the build */
 +#cmakedefine CMAKE_SOURCE_DIR "@CMAKE_SOURCE_DIR@"
 +
 +/* Binary directory for the build */
 +#cmakedefine CMAKE_BINARY_DIR "@CMAKE_BINARY_DIR@"
 +
  /* Turn off water-water neighborlist optimization only - not used right now */
  #cmakedefine DISABLE_WATERWATER_NLIST
  
  /* IBM QPX was selected as CPU acceleration type (e.g. BlueGene/Q) */
  #cmakedefine GMX_CPU_ACCELERATION_IBM_QPX
  
+ /* Fujitsu Sparc64 HPC-ACE SIMD acceleration */
+ #cmakedefine GMX_CPU_ACCELERATION_SPARC64_HPC_ACE
  /* String for CPU acceleration choice (for writing to log files and stdout) */
  #define GMX_CPU_ACCELERATION_STRING "@GMX_CPU_ACCELERATION@"
  
  /* Define when Windows threads are used */
  #cmakedefine THREAD_WINDOWS
  
- /* Define when thread-MPI atomic operations are available */
+ /* Define when there is a pthread.h */
+ #cmakedefine HAVE_PTHREAD_H
+ /* Define when native atomic operations are found */
  #cmakedefine TMPI_ATOMICS
  
  /* Define for busy wait option  */
  /* Define for copy buffer option */
  #cmakedefine TMPI_COPY_BUFFER
  
+ /* Define for tmpi warnings option */
+ #cmakedefine TMPI_WARNINGS
  /* Define for profiling option */
  #cmakedefine TMPI_PROFILE
  
- /* Define for Linux pthread_setaffinity */
+ /* Define for Linux pthread_setaffinity_np */
  #cmakedefine HAVE_PTHREAD_SETAFFINITY
  
- /* Define for sysconf() */
- #cmakedefine HAVE_SYSCONF
+ /* Define for Windows NUMA-aware allocator functions*/
+ #cmakedefine TMPI_WINDOWS_NUMA_API
+ /* Define for GetSystemInfo() */
+ #cmakedefine HAVE_SYSTEM_INFO
  
  /* Enable x86 gcc inline assembly */
  #cmakedefine GMX_X86_GCC_INLINE_ASM
  /* Use GPU native acceleration */
  #cmakedefine GMX_GPU
  
 -/* Define to 1 if the system has the type gmx_bool. */
 -#cmakedefine HAVE_BOOL
 -
  /* Define to 1 if fseeko (and presumably ftello) exists and is declared. */
  #cmakedefine HAVE_FSEEKO
  
  /* Define to 1 if _fseeki64 (and presumably _ftelli64) exists and is declared. */
  #cmakedefine HAVE__FSEEKI64
  
 -/* Define to 1 if you have the m library (-lm). */
 -#cmakedefine HAVE_LIBM
 -
 -/* Define to 1 if you have the mkl library (-lmkl). */
 -#cmakedefine HAVE_LIBMKL
 -
  /* Define to 1 if you have the gsl library (-lgsl). */
  #cmakedefine HAVE_LIBGSL
  
 -/* Define to 1 if you have the dl library (-ldl). */
 -#cmakedefine HAVE_LIBDL
 -
  /* Have io.h (windows)*/
  #cmakedefine HAVE_IO_H
  
 -/* Define to 1 if you have the strcasecmp() function. */
 -#cmakedefine HAVE_STRCASECMP
 -
  /* Define to 1 if you have the strdup() function. */
  #cmakedefine HAVE_STRDUP
  
 -/* Define to 1 if you have the vfprintf() function. */
 -#cmakedefine HAVE_VFPRINTF
 -
 -/* Define to 1 if you have the memcmp() function. */
 -#cmakedefine HAVE_MEMCMP
 -
  /* Define to 1 if you have the posix_memalign() function. */
  #cmakedefine HAVE_POSIX_MEMALIGN
  
  /* Define to 1 if you have the cbrt() function. */
  #cmakedefine HAVE_CBRT
  
 -/* Define to 1 if you have the isnan() function. */
 -#cmakedefine HAVE_ISNAN
 -
 -/* Define to 1 if you have the _isnan() function. */
 -#cmakedefine HAVE__ISNAN
 -
  /* Define to 1 if you have the isfinite() function. */
  #cmakedefine HAVE_ISFINITE
  
  /* Define to 1 if you have the sqrtf() function. */
  #cmakedefine HAVE_SQRTF
  
 -/* Define to 1 if you have the <string.h> header file. */
 -#cmakedefine HAVE_STRING_H
 -
 -/* Define to 1 if yo have the <math.h> header file. */
 -#cmakedefine HAVE_MATH_H
 -
 -/* Define to 1 if yo have the <limits.h> header file. */
 -#cmakedefine HAVE_LIMITS_H
 -
 -/* Define to 1 if yo have the <memory.h> header file. */
 -#cmakedefine HAVE_MEMORY_H
 -
  /* Define to 1 if you have the <unistd.h> header file. */
  #cmakedefine HAVE_UNISTD_H
  
 -/* Define to 1 if yo have the <direct.h> header file. */
 -#cmakedefine HAVE_DIRECT_H
 -
  /* Define to 1 if you have the <pwd.h> header file. */
  #cmakedefine HAVE_PWD_H
  
 -/* Define to 1 if yo have the <stdint.h> header file. */
 -#cmakedefine HAVE_STDINT_H
 -
 -/* Define to 1 if yo have the <stdlib.h> header file. */
 -#cmakedefine HAVE_STDLIB_H
 +/* Define to 1 if you have the <pthread.h> header file. */
 +#cmakedefine HAVE_PTHREAD_H
  
  /* Define to 1 if you have the <dirent.h> header file. */
  #cmakedefine HAVE_DIRENT_H
  
 -/* Define to 1 if yo have the <inttypes.h> header file. */
 -#cmakedefine HAVE_INTTYPES_H
 -
 -/* Define to 1 if yo have the <regex.h> header file. */
 -#cmakedefine HAVE_REGEX_H
 -
 -/* Define to 1 if you have the <sys/types.h> header file. */
 -#cmakedefine HAVE_SYS_TYPES_H
 -
 -/* Define to 1 if you have the <sys/stat.h> header file. */
 -#cmakedefine HAVE_SYS_STAT_H
 -
  /* Define to 1 if you have the <sys/time.h> header file. */
  #cmakedefine HAVE_SYS_TIME_H
  
 -/* Define to 1 if you have the <rpc/rpc.h> header file. */
 -#cmakedefine HAVE_RPC_RPC_H
 -
 -/* Define to 1 if you have the <rpc/xdr.h> header file. */
 -#cmakedefine HAVE_RPC_XDR_H
 -
  /* Define to 1 if you have the <x86intrin.h> header file */
  #cmakedefine HAVE_X86INTRIN_H
  
  /* Define to 1 if you have the <sched.h> header */
  #cmakedefine HAVE_SCHED_H
  
 -/* Define to 1 if you have the vprintf() function. */
 -#cmakedefine HAVE_VPRINTF
 +/* Define to 1 if you have the POSIX <regex.h> header file. */
 +#cmakedefine HAVE_POSIX_REGEX
 +
 +/* Define to 1 if you have the C++11 <regex> header file. */
 +#cmakedefine HAVE_CXX11_REGEX
  
  /* Define to 1 if you have the sysconf() function */
  #cmakedefine HAVE_SYSCONF
  /* Some systems require this to be set to 64 for large file support */
  #cmakedefine _FILE_OFFSET_BITS @_FILE_OFFSET_BITS@
  
 -/* Gromacs shortcut define for fseeko & ftello being present with 64-bit support */
 -#cmakedefine GMX_LARGEFILES
 -
 -/* Define to int if <sys/types.h> does not define. */
 -#cmakedefine gid_t int
 -
  /* Define to __inline__ or __inline if that is what the C compiler
     calls it, or to nothing if inline is not supported under any name.
     Please do NOT remove the gmx_inline keyword from here. The classical
     to identify the language standard level. If it is not supported, it
     is still defined to an empty string here. */
  #define gmx_restrict ${RESTRICT_KEYWORD}
 -
 -#ifndef CPLUSPLUS
 -#ifdef __cplusplus
 -#define CPLUSPLUS
 -#endif
 -#endif  
 -
 -/* Define to long int if <sys/types.h> does not define. */                    
 -#cmakedefine off_t int
 -
 -/* Define to unsigned int if <sys/types.h> does not define. */
 -#cmakedefine size_t int
 -
 -/* Define to int if <sys/types.h> does not define. */
 -#cmakedefine uid_t int
 -
  /* Build special-purpose mdrun library */
  #cmakedefine GMX_FAHCORE   
  
  /* Define if we have pipes */
  #cmakedefine HAVE_PIPES
  
 -
  /* Catch stupid CMake problems on OS X */
  #ifdef __APPLE__
  #  if ((defined(__LP64__) && __LP64__ && defined(SIZEOF_VOIDP) && SIZEOF_VOIDP<8) || ( (!defined(__LP64__) || __LP64__==0) && (defined(SIZEOF_VOIDP) && SIZEOF_VOIDP>4)))
  #    error "or create a new such entry with your choice in the GUI _before_ hitting 'configure'."
  #  endif
  #endif
 +
 +/*! \endcond */
index 122d3d65933c9c7f410d5b74131a37fe3633681c,0000000000000000000000000000000000000000..4a24ea203801a73703171f44fb694171a036a3bf
mode 100644,000000..100644
--- /dev/null
@@@ -1,644 -1,0 +1,661 @@@
-         { "-neutral", FALSE, etBOOL, {&bNeutral},
-           "This option will add enough ions to neutralize the system. In combination with the concentration option a neutral system at a given salt concentration will be generated." }
 +/*
 + *
 + *                This source code is part of
 + *
 + *                 G   R   O   M   A   C   S
 + *
 + *          GROningen MAchine for Chemical Simulations
 + *
 + *                        VERSION 3.2.0
 + * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
 + * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
 + * Copyright (c) 2001-2004, The GROMACS development team,
 + * check out http://www.gromacs.org for more information.
 +
 + * This program is free software; you can redistribute it and/or
 + * modify it under the terms of the GNU General Public License
 + * as published by the Free Software Foundation; either version 2
 + * of the License, or (at your option) any later version.
 + *
 + * If you want to redistribute modifications, please consider that
 + * scientific software is very special. Version control is crucial -
 + * bugs must be traceable. We will be happy to consider code for
 + * inclusion in the official distribution, but derived work must not
 + * be called official GROMACS. Details are found in the README & COPYING
 + * files - if they are missing, get the official version at www.gromacs.org.
 + *
 + * To help us fund GROMACS development, we humbly ask that you cite
 + * the papers on the package - you can find them in the top README file.
 + *
 + * For more info, check our website at http://www.gromacs.org
 + *
 + * And Hey:
 + * Green Red Orange Magenta Azure Cyan Skyblue
 + */
 +#ifdef HAVE_CONFIG_H
 +#include <config.h>
 +#endif
 +
 +#include <ctype.h>
 +#include "copyrite.h"
 +#include "string2.h"
 +#include "smalloc.h"
 +#include "sysstuff.h"
 +#include "confio.h"
 +#include "statutil.h"
 +#include "pbc.h"
 +#include "force.h"
 +#include "gmx_fatal.h"
 +#include "futil.h"
 +#include "maths.h"
 +#include "macros.h"
 +#include "physics.h"
 +#include "vec.h"
 +#include "tpxio.h"
 +#include "mdrun.h"
 +#include "calcpot.h"
 +#include "main.h"
 +#include "random.h"
 +#include "index.h"
 +#include "mtop_util.h"
 +#include "gmx_ana.h"
 +
++/* Return the greatest common divisor of p and q (Euclid's algorithm).
++ *
++ * The result is non-negative, also when one or both arguments are negative.
++ * If both arguments are zero the result is zero, so a caller must check for
++ * that before using the result as a divisor.
++ */
++static int greatest_common_divisor(int p, int q)
++{
++    int tmp;
++    while (q != 0)
++    {
++        tmp = q;
++        q = p % q;
++        p = tmp;
++    }
++    /* The sign of a % result follows the dividend, so p can end up negative */
++    return (p < 0) ? -p : p;
++}
++
 +static void insert_ion(int nsa, int *nwater,
 +                       gmx_bool bSet[], int repl[], atom_id index[],
 +                       real pot[], rvec x[], t_pbc *pbc,
 +                       int sign, int q, const char *ionname,
 +                       t_mdatoms *mdatoms,
 +                       real rmin, gmx_bool bRandom, int *seed)
 +{
 +    int             i, ii, ei, owater, wlast, m, nw;
 +    real            extr_e, poti, rmin2;
 +    rvec            xei, dx;
 +    gmx_bool        bSub = FALSE;
 +    gmx_large_int_t maxrand;
 +
 +    ei       = -1;
 +    nw       = *nwater;
 +    maxrand  = nw;
 +    maxrand *= 1000;
 +    if (bRandom)
 +    {
 +        do
 +        {
 +            ei = nw*rando(seed);
 +            maxrand--;
 +        }
 +        while (bSet[ei] && (maxrand > 0));
 +        if (bSet[ei])
 +        {
 +            gmx_fatal(FARGS, "No more replaceable solvent!");
 +        }
 +    }
 +    else
 +    {
 +        extr_e = 0;
 +        for (i = 0; (i < nw); i++)
 +        {
 +            if (!bSet[i])
 +            {
 +                ii   = index[nsa*i];
 +                poti = pot[ii];
 +                if (q > 0)
 +                {
 +                    if ((poti <= extr_e) || !bSub)
 +                    {
 +                        extr_e = poti;
 +                        ei     = i;
 +                        bSub   = TRUE;
 +                    }
 +                }
 +                else
 +                {
 +                    if ((poti >= extr_e) || !bSub)
 +                    {
 +                        extr_e = poti;
 +                        ei     = i;
 +                        bSub   = TRUE;
 +                    }
 +                }
 +            }
 +        }
 +        if (ei == -1)
 +        {
 +            gmx_fatal(FARGS, "No more replaceable solvent!");
 +        }
 +    }
 +    fprintf(stderr, "Replacing solvent molecule %d (atom %d) with %s\n",
 +            ei, index[nsa*ei], ionname);
 +
 +    /* Replace solvent molecule charges with ion charge */
 +    bSet[ei] = TRUE;
 +    repl[ei] = sign;
 +    mdatoms->chargeA[index[nsa*ei]] = q;
 +    for (i = 1; i < nsa; i++)
 +    {
 +        mdatoms->chargeA[index[nsa*ei+i]] = 0;
 +    }
 +
 +    /* Mark all solvent molecules within rmin as unavailable for substitution */
 +    if (rmin > 0)
 +    {
 +        rmin2 = rmin*rmin;
 +        for (i = 0; (i < nw); i++)
 +        {
 +            if (!bSet[i])
 +            {
 +                pbc_dx(pbc, x[index[nsa*ei]], x[index[nsa*i]], dx);
 +                if (iprod(dx, dx) < rmin2)
 +                {
 +                    bSet[i] = TRUE;
 +                }
 +            }
 +        }
 +    }
 +}
 +
 +static char *aname(const char *mname)
 +{
 +    char *str;
 +    int   i;
 +
 +    str = strdup(mname);
 +    i   = strlen(str)-1;
 +    while (i > 1 && (isdigit(str[i]) || (str[i] == '+') || (str[i] == '-')))
 +    {
 +        str[i] = '\0';
 +        i--;
 +    }
 +
 +    return str;
 +}
 +
 +void sort_ions(int nsa, int nw, int repl[], atom_id index[],
 +               t_atoms *atoms, rvec x[],
 +               const char *p_name, const char *n_name)
 +{
 +    int    i, j, k, r, np, nn, starta, startr, npi, nni;
 +    rvec  *xt;
 +    char **pptr = NULL, **nptr = NULL, **paptr = NULL, **naptr = NULL;
 +
 +    snew(xt, atoms->nr);
 +
 +    /* Put all the solvent in front and count the added ions */
 +    np = 0;
 +    nn = 0;
 +    j  = index[0];
 +    for (i = 0; i < nw; i++)
 +    {
 +        r = repl[i];
 +        if (r == 0)
 +        {
 +            for (k = 0; k < nsa; k++)
 +            {
 +                copy_rvec(x[index[nsa*i+k]], xt[j++]);
 +            }
 +        }
 +        else if (r > 0)
 +        {
 +            np++;
 +        }
 +        else if (r < 0)
 +        {
 +            nn++;
 +        }
 +    }
 +
 +    if (np+nn > 0)
 +    {
 +        /* Put the positive and negative ions at the end */
 +        starta = index[nsa*(nw - np - nn)];
 +        startr = atoms->atom[starta].resind;
 +
 +        if (np)
 +        {
 +            snew(pptr, 1);
 +            pptr[0] = strdup(p_name);
 +            snew(paptr, 1);
 +            paptr[0] = aname(p_name);
 +        }
 +        if (nn)
 +        {
 +            snew(nptr, 1);
 +            nptr[0] = strdup(n_name);
 +            snew(naptr, 1);
 +            naptr[0] = aname(n_name);
 +        }
 +        npi = 0;
 +        nni = 0;
 +        for (i = 0; i < nw; i++)
 +        {
 +            r = repl[i];
 +            if (r > 0)
 +            {
 +                j = starta+npi;
 +                k = startr+npi;
 +                copy_rvec(x[index[nsa*i]], xt[j]);
 +                atoms->atomname[j]     = paptr;
 +                atoms->atom[j].resind  = k;
 +                atoms->resinfo[k].name = pptr;
 +                npi++;
 +            }
 +            else if (r < 0)
 +            {
 +                j = starta+np+nni;
 +                k = startr+np+nni;
 +                copy_rvec(x[index[nsa*i]], xt[j]);
 +                atoms->atomname[j]     = naptr;
 +                atoms->atom[j].resind  = k;
 +                atoms->resinfo[k].name = nptr;
 +                nni++;
 +            }
 +        }
 +        for (i = index[nsa*nw-1]+1; i < atoms->nr; i++)
 +        {
 +            j                  = i-(nsa-1)*(np+nn);
 +            atoms->atomname[j] = atoms->atomname[i];
 +            atoms->atom[j]     = atoms->atom[i];
 +            copy_rvec(x[i], xt[j]);
 +        }
 +        atoms->nr -= (nsa-1)*(np+nn);
 +
 +        /* Copy the new positions back */
 +        for (i = index[0]; i < atoms->nr; i++)
 +        {
 +            copy_rvec(xt[i], x[i]);
 +        }
 +        sfree(xt);
 +    }
 +}
 +
 +static void update_topol(const char *topinout, int p_num, int n_num,
 +                         const char *p_name, const char *n_name, char *grpname)
 +{
 +#define TEMP_FILENM "temp.top"
 +    FILE    *fpin, *fpout;
 +    char     buf[STRLEN], buf2[STRLEN], *temp, **mol_line = NULL;
 +    int      line, i, nsol, nmol_line, sol_line, nsol_last;
 +    gmx_bool bMolecules;
 +
 +    printf("\nProcessing topology\n");
 +    fpin  = ffopen(topinout, "r");
 +    fpout = ffopen(TEMP_FILENM, "w");
 +
 +    line       = 0;
 +    bMolecules = FALSE;
 +    nmol_line  = 0;
 +    sol_line   = -1;
 +    nsol_last  = -1;
 +    while (fgets(buf, STRLEN, fpin))
 +    {
 +        line++;
 +        strcpy(buf2, buf);
 +        if ((temp = strchr(buf2, '\n')) != NULL)
 +        {
 +            temp[0] = '\0';
 +        }
 +        ltrim(buf2);
 +        if (buf2[0] == '[')
 +        {
 +            buf2[0] = ' ';
 +            if ((temp = strchr(buf2, '\n')) != NULL)
 +            {
 +                temp[0] = '\0';
 +            }
 +            rtrim(buf2);
 +            if (buf2[strlen(buf2)-1] == ']')
 +            {
 +                buf2[strlen(buf2)-1] = '\0';
 +                ltrim(buf2);
 +                rtrim(buf2);
 +                bMolecules = (gmx_strcasecmp(buf2, "molecules") == 0);
 +            }
 +            fprintf(fpout, "%s", buf);
 +        }
 +        else if (!bMolecules)
 +        {
 +            fprintf(fpout, "%s", buf);
 +        }
 +        else
 +        {
 +            /* Check if this is a line with solvent molecules */
 +            sscanf(buf, "%s", buf2);
 +            if (gmx_strcasecmp(buf2, grpname) == 0)
 +            {
 +                sol_line = nmol_line;
 +                sscanf(buf, "%*s %d", &nsol_last);
 +            }
 +            /* Store this molecules section line */
 +            srenew(mol_line, nmol_line+1);
 +            mol_line[nmol_line] = strdup(buf);
 +            nmol_line++;
 +        }
 +    }
 +    ffclose(fpin);
 +
 +    if (sol_line == -1)
 +    {
 +        ffclose(fpout);
 +        gmx_fatal(FARGS, "No line with moleculetype '%s' found the [ molecules ] section of file '%s'", grpname, topinout);
 +    }
 +    if (nsol_last < p_num+n_num)
 +    {
 +        ffclose(fpout);
 +        gmx_fatal(FARGS, "The last entry for moleculetype '%s' in the [ molecules ] section of file '%s' has less solvent molecules (%d) than were replaced (%d)", grpname, topinout, nsol_last, p_num+n_num);
 +    }
 +
 +    /* Print all the molecule entries */
 +    for (i = 0; i < nmol_line; i++)
 +    {
 +        if (i != sol_line)
 +        {
 +            fprintf(fpout, "%s", mol_line[i]);
 +        }
 +        else
 +        {
 +            printf("Replacing %d solute molecules in topology file (%s) "
 +                   " by %d %s and %d %s ions.\n",
 +                   p_num+n_num, topinout, p_num, p_name, n_num, n_name);
 +            nsol_last -= p_num + n_num;
 +            if (nsol_last > 0)
 +            {
 +                fprintf(fpout, "%-10s  %d\n", grpname, nsol_last);
 +            }
 +            if (p_num > 0)
 +            {
 +                fprintf(fpout, "%-15s  %d\n", p_name, p_num);
 +            }
 +            if (n_num > 0)
 +            {
 +                fprintf(fpout, "%-15s  %d\n", n_name, n_num);
 +            }
 +        }
 +    }
 +    ffclose(fpout);
 +    /* use ffopen to generate backup of topinout */
 +    fpout = ffopen(topinout, "w");
 +    ffclose(fpout);
 +    rename(TEMP_FILENM, topinout);
 +#undef TEMP_FILENM
 +}
 +
 +/* Entry point of the genion tool.
 + *
 + * Parses the command line, determines how many positive and negative ions
 + * to add (from -np/-nn, -conc and/or -neutral), asks for a continuous
 + * solvent group, optionally updates the topology given with -p, replaces
 + * solvent molecules by ions one at a time with insert_ion(), reorders the
 + * result with sort_ions() and writes the final configuration.
 + * Returns 0; fatal conditions are reported through gmx_fatal().
 + */
 +int gmx_genion(int argc, char *argv[])
 +{
 +    const char        *desc[] = {
 +        "[TT]genion[tt] replaces solvent molecules by monoatomic ions at",
 +        "the position of the first atoms with the most favorable electrostatic",
 +        "potential or at random. The potential is calculated on all atoms, using",
 +        "normal GROMACS particle-based methods (in contrast to other methods",
 +        "based on solving the Poisson-Boltzmann equation).",
 +        "The potential is recalculated after every ion insertion.",
 +        "If specified in the run input file, a reaction field, shift function",
 +        "or user function can be used. For the user function a table file",
 +        "can be specified with the option [TT]-table[tt].",
 +        "The group of solvent molecules should be continuous and all molecules",
 +        "should have the same number of atoms.",
 +        "The user should add the ion molecules to the topology file or use",
 +        "the [TT]-p[tt] option to automatically modify the topology.[PAR]",
 +        "The ion molecule type, residue and atom names in all force fields",
 +        "are the capitalized element names without sign. This molecule name",
 +        "should be given with [TT]-pname[tt] or [TT]-nname[tt], and the",
 +        "[TT][molecules][tt] section of your topology updated accordingly,",
 +        "either by hand or with [TT]-p[tt]. Do not use an atom name instead!",
 +        "[PAR]Ions which can have multiple charge states get the multiplicity",
 +        "added, without sign, for the uncommon states only.[PAR]",
 +        "With the option [TT]-pot[tt] the potential can be written as B-factors",
 +        "in a [TT].pdb[tt] file (for visualisation using e.g. Rasmol).",
 +        "The unit of the potential is 1000 kJ/(mol e), the scaling be changed",
 +        "with the [TT]-scale[tt] option.[PAR]",
 +        "For larger ions, e.g. sulfate we recommended using [TT]genbox[tt]."
 +    };
 +    const char        *bugs[] = {
 +        "Calculation of the potential is not reliable, therefore the [TT]-random[tt] option is now turned on by default.",
 +        "If you specify a salt concentration existing ions are not taken into account. In effect you therefore specify the amount of salt to be added."
 +    };
 +    static int         p_num   = 0, n_num = 0, p_q = 1, n_q = -1;
 +    static const char *p_name  = "NA", *n_name = "CL";
 +    static real        rmin    = 0.6, scale = 0.001, conc = 0;
 +    static int         seed    = 1993;
 +    static gmx_bool    bRandom = TRUE, bNeutral = FALSE;
 +    static t_pargs     pa[]    = {
 +        { "-np",    FALSE, etINT,  {&p_num}, "Number of positive ions"       },
 +        { "-pname", FALSE, etSTR,  {&p_name}, "Name of the positive ion"      },
 +        { "-pq",    FALSE, etINT,  {&p_q},   "Charge of the positive ion"    },
 +        { "-nn",    FALSE, etINT,  {&n_num}, "Number of negative ions"       },
 +        { "-nname", FALSE, etSTR,  {&n_name}, "Name of the negative ion"      },
 +        { "-nq",    FALSE, etINT,  {&n_q},   "Charge of the negative ion"    },
 +        { "-rmin",  FALSE, etREAL, {&rmin},  "Minimum distance between ions" },
 +        { "-random", FALSE, etBOOL, {&bRandom}, "Use random placement of ions instead of based on potential. The rmin option should still work" },
 +        { "-seed",  FALSE, etINT,  {&seed},  "Seed for random number generator" },
 +        { "-scale", FALSE, etREAL, {&scale}, "Scaling factor for the potential for [TT]-pot[tt]" },
 +        { "-conc",  FALSE, etREAL, {&conc},
 +          "Specify salt concentration (mol/liter). This will add sufficient ions to reach up to the specified concentration as computed from the volume of the cell in the input [TT].tpr[tt] file. Overrides the [TT]-np[tt] and [TT]-nn[tt] options." },
-     if ((conc > 0) || bNeutral)
++        { "-neutral", FALSE, etBOOL, {&bNeutral}, "This option will add enough ions to neutralize the system. These ions are added on top of those specified with [TT]-np[tt]/[TT]-nn[tt] or [TT]-conc[tt]. "}
 +    };
 +    gmx_mtop_t        *mtop;
 +    gmx_localtop_t    *top;
 +    t_inputrec         inputrec;
 +    t_commrec         *cr;
 +    t_mdatoms         *mdatoms;
 +    gmx_enerdata_t     enerd;
 +    t_graph           *graph;
 +    t_forcerec        *fr;
 +    rvec              *x, *v;
 +    real              *pot, vol, qtot;
 +    matrix             box;
 +    t_atoms            atoms;
 +    t_pbc              pbc;
 +    int               *repl;
 +    atom_id           *index;
 +    char              *grpname;
 +    gmx_bool          *bSet, bPDB;
 +    int                i, nw, nwa, nsa, nsalt, iqtot;
 +    FILE              *fplog;
 +    output_env_t       oenv;
 +    t_filenm           fnm[] = {
 +        { efTPX, NULL,  NULL,      ffREAD  },
 +        { efXVG, "-table", "table", ffOPTRD },
 +        { efNDX, NULL,  NULL,      ffOPTRD },
 +        { efSTO, "-o",  NULL,      ffWRITE },
 +        { efLOG, "-g",  "genion",  ffWRITE },
 +        { efPDB, "-pot", "pot",    ffOPTWR },
 +        { efTOP, "-p",  "topol",   ffOPTRW }
 +    };
 +#define NFILE asize(fnm)
 +
 +    parse_common_args(&argc, argv, PCA_BE_NICE, NFILE, fnm, asize(pa), pa,
 +                      asize(desc), desc, asize(bugs), bugs, &oenv);
 +    /* The potential is only written when it is computed, i.e. not with -random */
 +    bPDB = ftp2bSet(efPDB, NFILE, fnm);
 +    if (bRandom && bPDB)
 +    {
 +        fprintf(stderr, "Not computing potential with random option!\n");
 +        bPDB = FALSE;
 +    }
 +
 +    /* Check input for something sensible */
 +    if ((p_num < 0) || (n_num < 0))
 +    {
 +        gmx_fatal(FARGS, "Negative number of ions to add?");
 +    }
 +
 +    snew(mtop, 1);
 +    snew(top, 1);
 +    fplog = init_calcpot(ftp2fn(efLOG, NFILE, fnm), ftp2fn(efTPX, NFILE, fnm),
 +                         opt2fn("-table", NFILE, fnm), mtop, top, &inputrec, &cr,
 +                         &graph, &mdatoms, &fr, &enerd, &pot, box, &x, oenv);
 +
 +    atoms = gmx_mtop_global_atoms(mtop);
 +
 +    /* Total charge of the system, rounded to the nearest integer */
 +    qtot = 0;
 +    for (i = 0; (i < atoms.nr); i++)
 +    {
 +        qtot += atoms.atom[i].q;
 +    }
 +    iqtot = gmx_nint(qtot);
 +
-         if (conc > 0)
++    
++    if (conc > 0)
 +    {
 +        /* Compute number of ions to be added */
 +        vol = det(box);
-             nsalt = gmx_nint(conc*vol*AVOGADRO/1e24);
-             p_num = abs(nsalt*n_q);
-             n_num = abs(nsalt*p_q);
-             if (bNeutral)
++        nsalt = gmx_nint(conc*vol*AVOGADRO/1e24);
++        p_num = abs(nsalt*n_q);
++        n_num = abs(nsalt*p_q);
++    }
++    if (bNeutral)
++    {
++        int qdelta = p_num*p_q + n_num*n_q + iqtot;
++
++        /* Check if the system is neutralizable
++         * is (qdelta == p_q*p_num + n_q*n_num) solvable for p_num and n_num? */
++        int gcd = greatest_common_divisor(n_q, p_q);
++        if ((qdelta % gcd) != 0)
++        {
++            gmx_fatal(FARGS, "Can't neutralize this system using -nq %d and"
++                    " -pq %d.\n", n_q, p_q);
++        }
++        
++        /* NOTE(review): the loops below only terminate when p_q > 0 and
++         * n_q < 0; neither is validated in this function -- confirm. */
++        while (qdelta != 0)
 +        {
-                 int qdelta = 0;
-                 do
-                 {
-                     qdelta = (p_num*p_q + n_num*n_q + iqtot);
-                     if (qdelta < 0)
-                     {
-                         p_num  += abs(qdelta/p_q);
-                         qdelta  = (p_num*p_q + n_num*n_q + iqtot);
-                     }
-                     if (qdelta > 0)
-                     {
-                         n_num  += abs(qdelta/n_q);
-                         qdelta  = (p_num*p_q + n_num*n_q + iqtot);
-                     }
-                 }
-                 while (qdelta != 0);
++            while (qdelta < 0)
 +            {
++                p_num++;
++                qdelta += p_q;
++            }
++            while (qdelta > 0)
++            {
++                n_num++;
++                qdelta += n_q;
 +            }
 +        }
 +    }
 +
 +    if ((p_num == 0) && (n_num == 0))
 +    {
 +        if (!bPDB)
 +        {
 +            fprintf(stderr, "No ions to add and no potential to calculate.\n");
 +            exit(0);
 +        }
 +        nw  = 0;
 +        nsa = 0; /* to keep gcc happy */
 +    }
 +    else
 +    {
 +        printf("Will try to add %d %s ions and %d %s ions.\n",
 +               p_num, p_name, n_num, n_name);
 +        printf("Select a continuous group of solvent molecules\n");
 +        get_index(&atoms, ftp2fn_null(efNDX, NFILE, fnm), 1, &nwa, &index, &grpname);
 +        for (i = 1; i < nwa; i++)
 +        {
 +            if (index[i] != index[i-1]+1)
 +            {
 +                gmx_fatal(FARGS, "The solvent group %s is not continuous: "
 +                          "index[%d]=%d, index[%d]=%d",
 +                          grpname, i, index[i-1]+1, i+1, index[i]+1);
 +            }
 +        }
 +        /* The number of atoms per solvent molecule is taken from the number
 +         * of leading atoms in the group that share one residue index */
 +        nsa = 1;
 +        while ((nsa < nwa) &&
 +               (atoms.atom[index[nsa]].resind ==
 +                atoms.atom[index[nsa-1]].resind))
 +        {
 +            nsa++;
 +        }
 +        if (nwa % nsa)
 +        {
 +            gmx_fatal(FARGS, "Your solvent group size (%d) is not a multiple of %d",
 +                      nwa, nsa);
 +        }
 +        nw = nwa/nsa;
 +        fprintf(stderr, "Number of (%d-atomic) solvent molecules: %d\n", nsa, nw);
 +        if (p_num+n_num > nw)
 +        {
 +            gmx_fatal(FARGS, "Not enough solvent for adding ions");
 +        }
 +    }
 +
 +    /* NOTE(review): grpname is only assigned by get_index() in the branch
 +     * above; in the potential-only case (no ions) it is passed to
 +     * update_topol() uninitialized -- confirm -p is never used then. */
 +    if (opt2bSet("-p", NFILE, fnm))
 +    {
 +        update_topol(opt2fn("-p", NFILE, fnm), p_num, n_num, p_name, n_name, grpname);
 +    }
 +
 +    snew(bSet, nw);
 +    snew(repl, nw);
 +
 +    snew(v, atoms.nr);
 +    snew(atoms.pdbinfo, atoms.nr);
 +
 +    set_pbc(&pbc, inputrec.ePBC, box);
 +
 +    /* Now loop over the ions that have to be placed */
 +    /* The body runs at least once, so the potential can still be written
 +     * when there are no ions to place */
 +    do
 +    {
 +        if (!bRandom)
 +        {
 +            calc_pot(fplog, cr, mtop, &inputrec, top, x, fr, &enerd, mdatoms, pot, box, graph);
 +            if (bPDB || debug)
 +            {
 +                char buf[STRLEN];
 +
 +                if (debug)
 +                {
 +                    sprintf(buf, "%d_%s", p_num+n_num, ftp2fn(efPDB, NFILE, fnm));
 +                }
 +                else
 +                {
 +                    strcpy(buf, ftp2fn(efPDB, NFILE, fnm));
 +                }
 +                for (i = 0; (i < atoms.nr); i++)
 +                {
 +                    atoms.pdbinfo[i].bfac = pot[i]*scale;
 +                }
 +                write_sto_conf(buf, "Potential calculated by genion",
 +                               &atoms, x, v, inputrec.ePBC, box);
 +                /* Without debug output the potential is written only once */
 +                bPDB = FALSE;
 +            }
 +        }
 +        /* Place a positive ion while at least as many positive as negative
 +         * ions remain, otherwise place a negative ion */
 +        if ((p_num > 0) && (p_num >= n_num))
 +        {
 +            insert_ion(nsa, &nw, bSet, repl, index, pot, x, &pbc,
 +                       1, p_q, p_name, mdatoms, rmin, bRandom, &seed);
 +            p_num--;
 +        }
 +        else if (n_num > 0)
 +        {
 +            insert_ion(nsa, &nw, bSet, repl, index, pot, x, &pbc,
 +                       -1, n_q, n_name, mdatoms, rmin, bRandom, &seed);
 +            n_num--;
 +        }
 +    }
 +    while (p_num+n_num > 0);
 +    fprintf(stderr, "\n");
 +
 +    if (nw)
 +    {
 +        sort_ions(nsa, nw, repl, index, &atoms, x, p_name, n_name);
 +    }
 +
 +    sfree(atoms.pdbinfo);
 +    atoms.pdbinfo = NULL;
 +    write_sto_conf(ftp2fn(efSTO, NFILE, fnm), *mtop->name, &atoms, x, NULL,
 +                   inputrec.ePBC, box);
 +
 +    thanx(stderr);
 +
 +    gmx_log_close(fplog);
 +
 +    return 0;
 +}
index 15b8c561bfe5b91ef31cf4e4dd9dfba33a5b07c3,0000000000000000000000000000000000000000..93a2e2718157133a09065f6a6da3eb32ec08993c
mode 100644,000000..100644
--- /dev/null
@@@ -1,1069 -1,0 +1,1172 @@@
-     "AuthenticAMD"
 +/* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
 + *
 + *
 + * This file is part of GROMACS.
 + * Copyright (c) 2012-
 + *
 + * Written by the Gromacs development team under coordination of
 + * David van der Spoel, Berk Hess, and Erik Lindahl.
 + *
 + * This library is free software; you can redistribute it and/or
 + * modify it under the terms of the GNU Lesser General Public License
 + * as published by the Free Software Foundation; either version 2
 + * of the License, or (at your option) any later version.
 + *
 + * To help us fund GROMACS development, we humbly ask that you cite
 + * the research papers on the package. Check out http://www.gromacs.org
 + *
 + * And Hey:
 + * Gnomes, ROck Monsters And Chili Sauce
 + */
 +#ifdef HAVE_CONFIG_H
 +#include <config.h>
 +#endif
 +
 +#ifdef HAVE_SCHED_H
 +#define _GNU_SOURCE
 +#include <sched.h>
 +#endif
 +
 +#include <stdio.h>
 +#include <stdlib.h>
 +#include <string.h>
 +#include <ctype.h>
 +#ifdef _MSC_VER
 +/* MSVC definition for __cpuid() */
 +#include <intrin.h>
 +/* sysinfo functions */
 +#include <windows.h>
 +#endif
 +#ifdef HAVE_UNISTD_H
 +/* sysconf() definition */
 +#include <unistd.h>
 +#endif
 +
 +#include "gmx_cpuid.h"
 +
 +
 +
 +/* For convenience, and to enable configure-time invocation, we keep all architectures
 + * in a single file, but to avoid repeated ifdefs we set the overall architecture here.
 + */
 +#if defined (__i386__) || defined (__x86_64__) || defined (_M_IX86) || defined (_M_X64)
++/* OK, it is x86, but can we execute cpuid? */
++#if defined(GMX_X86_GCC_INLINE_ASM) || ( defined(_MSC_VER) && ( (_MSC_VER > 1500) || (_MSC_VER==1500 & _MSC_FULL_VER >= 150030729)))
 +#    define GMX_CPUID_X86
 +#endif
++#endif
 +
 +/* Global constant character strings corresponding to our enumerated types */
 +/* Vendor names, one entry per enum gmx_cpuid_vendor value. Besides being
 + * printed by gmx_cpuid_formatstring(), these strings are what
 + * cpuid_check_vendor() compares against the x86 CPUID vendor id or the
 + * vendor line of /proc/cpuinfo, so they must match the spelling reported
 + * by the hardware/OS.
 + */
 +const char *
 +gmx_cpuid_vendor_string[GMX_CPUID_NVENDORS] =
 +{
 +    "CannotDetect",
 +    "Unknown",
 +    "GenuineIntel",
-     "AVX_256"
++    "AuthenticAMD",
++    "Fujitsu",
++    "IBM"
 +};
 +
 +/* Feature names, indexed by enum gmx_cpuid_feature.
 + * gmx_cpuid_formatstring() prints the name of every feature whose flag is set.
 + */
 +const char *
 +gmx_cpuid_feature_string[GMX_CPUID_NFEATURES] =
 +{
 +    "CannotDetect",
 +    "aes",
 +    "apic",
 +    "avx",
 +    "avx2",
 +    "clfsh",
 +    "cmov",
 +    "cx8",
 +    "cx16",
 +    "f16c",
 +    "fma",
 +    "fma4",
 +    "htt",
 +    "lahf_lm",
 +    "misalignsse",
 +    "mmx",
 +    "msr",
 +    "nonstop_tsc",
 +    "pcid",
 +    "pclmuldq",
 +    "pdcm",
 +    "pdpe1gb",
 +    "popcnt",
 +    "pse",
 +    "rdrnd",
 +    "rdtscp",
 +    "sse2",
 +    "sse3",
 +    "sse4a",
 +    "sse4.1",
 +    "sse4.2",
 +    "ssse3",
 +    "tdt",
 +    "x2apic",
 +    "xop"
 +};
 +
 +/* Names of the acceleration flavours; the array has GMX_CPUID_NACCELERATIONS
 + * entries and is presumably indexed by enum gmx_cpuid_acceleration
 + * (NOTE(review): the enum is declared in gmx_cpuid.h - confirm the order).
 + */
 +const char *
 +gmx_cpuid_acceleration_string[GMX_CPUID_NACCELERATIONS] =
 +{
 +    "CannotDetect",
 +    "None",
 +    "SSE2",
 +    "SSE4.1",
 +    "AVX_128_FMA",
-             /* Could not find vendor */
-             strncpy(cpuid->brand, "Unknown CPU brand", GMX_CPUID_BRAND_MAXLEN);
++    "AVX_256",
++    "Sparc64 HPC-ACE"
 +};
 +
 +/* Max length of brand string */
 +#define GMX_CPUID_BRAND_MAXLEN 256
 +
 +
 +/* Contents of the abstract datatype.
 + * Allocated and filled by gmx_cpuid_init(), released by gmx_cpuid_done().
 + */
 +struct gmx_cpuid
 +{
 +    enum gmx_cpuid_vendor      vendor;              /* result of cpuid_check_vendor() */
 +    char                       brand[GMX_CPUID_BRAND_MAXLEN]; /* brand string, or "Unknown CPU brand" */
 +    int                        family;              /* -1 if x86 CPUID level 1 is unavailable, 0 for undetected vendors */
 +    int                        model;               /* same conventions as family */
 +    int                        stepping;            /* same conventions as family */
 +    /* Not using gmx_bool here, since this file must be possible to compile without simple.h */
 +    char                       feature[GMX_CPUID_NFEATURES]; /* nonzero = feature present, indexed by enum gmx_cpuid_feature */
 +
 +    /* Basic CPU topology information. For x86 this is a bit complicated since the topology differs between
 +     * operating systems and sometimes even settings. For most other architectures you can likely just check
 +     * the documentation and then write static information to these arrays rather than detecting on-the-fly.
 +     */
 +    int                        have_cpu_topology;   /* nonzero once the fields below have been filled */
 +    int                        nproc;               /* total number of logical processors from OS */
 +    int                        npackages;           /* number of distinct package ids */
 +    int                        ncores_per_package;  /* number of distinct core ids */
 +    int                        nhwthreads_per_core; /* number of distinct hwthread ids */
 +    int *                      package_id;          /* nproc entries, renumbered from 0 */
 +    int *                      core_id;             /* Local core id in each package */
 +    int *                      hwthread_id;         /* Local hwthread id in each core */
 +    int *                      locality_order;      /* Processor indices sorted in locality order */
 +};
 +
 +
 +/* Simple routines to access the data structure. The initialization routine is
 + * further down since that needs to call other static routines in this file.
 + */
 +/* Return the vendor stored in cpuid. cpuid must be a valid (non-NULL) handle. */
 +enum gmx_cpuid_vendor
 +gmx_cpuid_vendor            (gmx_cpuid_t                cpuid)
 +{
 +    return cpuid->vendor;
 +}
 +
 +
 +/* Return a pointer to the brand string stored inside cpuid. The storage
 + * belongs to cpuid; the caller must not free or modify it.
 + */
 +const char *
 +gmx_cpuid_brand             (gmx_cpuid_t                cpuid)
 +{
 +    return cpuid->brand;
 +}
 +
 +/* Return the CPU family stored in cpuid (-1 or 0 when it could not be detected). */
 +int
 +gmx_cpuid_family            (gmx_cpuid_t                cpuid)
 +{
 +    return cpuid->family;
 +}
 +
 +/* Return the CPU model stored in cpuid (-1 or 0 when it could not be detected). */
 +int
 +gmx_cpuid_model             (gmx_cpuid_t                cpuid)
 +{
 +    return cpuid->model;
 +}
 +
 +/* Return the CPU stepping stored in cpuid (-1 or 0 when it could not be detected). */
 +int
 +gmx_cpuid_stepping          (gmx_cpuid_t                cpuid)
 +{
 +    return cpuid->stepping;
 +}
 +
 +/* Return 1 if the given feature flag is set in cpuid, 0 otherwise.
 + * feature is used directly as an array index and is not range-checked.
 + */
 +int
 +gmx_cpuid_feature           (gmx_cpuid_t                cpuid,
 +                             enum gmx_cpuid_feature     feature)
 +{
 +    return (cpuid->feature[feature] != 0);
 +}
 +
 +
 +
 +
 +/* What type of acceleration was compiled in, if any?
 + * This is set from Cmake. Note that the SSE2 and SSE4_1 macros are set for
 + * AVX too, so it is important that they appear last in the list.
 + *
 + * The first macro that is defined wins; when none of them is defined,
 + * compiled_acc is GMX_CPUID_ACCELERATION_NONE.
 + */
 +#ifdef GMX_X86_AVX_256
 +static const
 +enum gmx_cpuid_acceleration
 +    compiled_acc = GMX_CPUID_ACCELERATION_X86_AVX_256;
 +#elif defined GMX_X86_AVX_128_FMA
 +static const
 +enum gmx_cpuid_acceleration
 +    compiled_acc = GMX_CPUID_ACCELERATION_X86_AVX_128_FMA;
 +#elif defined GMX_X86_SSE4_1
 +static const
 +enum gmx_cpuid_acceleration
 +    compiled_acc = GMX_CPUID_ACCELERATION_X86_SSE4_1;
 +#elif defined GMX_X86_SSE2
 +static const
 +enum gmx_cpuid_acceleration
 +    compiled_acc = GMX_CPUID_ACCELERATION_X86_SSE2;
++#elif defined GMX_CPU_ACCELERATION_SPARC64_HPC_ACE
++static const
++enum gmx_cpuid_acceleration
++    compiled_acc = GMX_CPUID_ACCELERATION_SPARC64_HPC_ACE;
 +#else
 +static const
 +enum gmx_cpuid_acceleration
 +    compiled_acc = GMX_CPUID_ACCELERATION_NONE;
 +#endif
 +
 +
 +#ifdef GMX_CPUID_X86
 +
 +/* Execute CPUID on x86 class CPUs. level sets function to exec, and the
 + * contents of register output is returned. See Intel/AMD docs for details.
 + *
 + * This version supports extended information where we can also have an input
 + * value in the ecx register. This is ignored for most levels, but some of them
 + * (e.g. level 0xB on Intel) use it.
 + */
 +static int
 +execute_x86cpuid(unsigned int   level,
 +                 unsigned int   ecxval,
 +                 unsigned int * eax,
 +                 unsigned int * ebx,
 +                 unsigned int * ecx,
 +                 unsigned int * edx)
 +{
 +    int rc = 0;
 +
 +    /* Currently CPUID is only supported (1) if we can use an instruction on MSVC, or (2)
 +     * if the compiler handles GNU-style inline assembly.
 +     */
 +
 +#if (defined _MSC_VER)
 +    int CPUInfo[4];
 +
 +#if (_MSC_VER > 1500) || (_MSC_VER == 1500 & _MSC_FULL_VER >= 150030729)
 +    /* MSVC 9.0 SP1 or later */
 +    __cpuidex(CPUInfo, level, ecxval);
 +    rc = 0;
 +#else
 +    __cpuid(CPUInfo, level);
 +    /* Set an error code if the user wanted a non-zero ecxval, since we did not have cpuidex */
 +    rc = (ecxval > 0) ? -1 : 0;
 +#endif
 +    *eax = CPUInfo[0];
 +    *ebx = CPUInfo[1];
 +    *ecx = CPUInfo[2];
 +    *edx = CPUInfo[3];
 +
 +#elif (defined GMX_X86_GCC_INLINE_ASM)
 +    /* for now this means GMX_X86_GCC_INLINE_ASM should be defined,
 +     * but there might be more options added in the future.
 +     */
 +    *eax = level;
 +    *ecx = ecxval;
 +    *ebx = 0;
 +    *edx = 0;
 +#if defined(__i386__) && defined(__PIC__)
 +    /* Avoid clobbering the global offset table in 32-bit pic code (ebx register) */
 +    __asm__ __volatile__ ("xchgl %%ebx, %1  \n\t"
 +                          "cpuid            \n\t"
 +                          "xchgl %%ebx, %1  \n\t"
 +                          : "+a" (*eax), "+r" (*ebx), "+c" (*ecx), "+d" (*edx));
 +#else
 +    /* i386 without PIC, or x86-64. Things are easy and we can clobber any reg we want :-) */
 +    __asm__ __volatile__ ("cpuid            \n\t"
 +                          : "+a" (*eax), "+b" (*ebx), "+c" (*ecx), "+d" (*edx));
 +#endif
 +    rc = 0;
 +#else
 +    /* Death and horror!
 +     * Apparently this is an x86 platform where we don't know how to call cpuid.
 +     *
 +     * This is REALLY bad, since we will lose all Gromacs acceleration.
 +     */
 +    *eax = 0;
 +    *ebx = 0;
 +    *ecx = 0;
 +    *edx = 0;
 +
 +    rc = -1;
 +#endif
 +    return rc;
 +}
 +
 +
 +/* Identify CPU features common to Intel & AMD - mainly brand string,
 + * version and some features. Vendor has already been detected outside this.
 + */
 +static int
 +cpuid_check_common_x86(gmx_cpuid_t                cpuid)
 +{
 +    int                       fn, max_stdfn, max_extfn;
 +    unsigned int              eax, ebx, ecx, edx;
 +    char                      str[GMX_CPUID_BRAND_MAXLEN];
 +    char *                    p;
 +
 +    /* Find largest standard/extended function input value */
 +    execute_x86cpuid(0x0, 0, &eax, &ebx, &ecx, &edx);
 +    max_stdfn = eax;
 +    execute_x86cpuid(0x80000000, 0, &eax, &ebx, &ecx, &edx);
 +    max_extfn = eax;
 +
 +    p = str;
 +    if (max_extfn >= 0x80000005)
 +    {
 +        /* Get CPU brand string */
 +        for (fn = 0x80000002; fn < 0x80000005; fn++)
 +        {
 +            execute_x86cpuid(fn, 0, &eax, &ebx, &ecx, &edx);
 +            memcpy(p, &eax, 4);
 +            memcpy(p+4, &ebx, 4);
 +            memcpy(p+8, &ecx, 4);
 +            memcpy(p+12, &edx, 4);
 +            p += 16;
 +        }
 +        *p = '\0';
 +
 +        /* Remove empty initial space */
 +        p = str;
 +        while (isspace(*(p)))
 +        {
 +            p++;
 +        }
 +        strncpy(cpuid->brand, p, GMX_CPUID_BRAND_MAXLEN);
 +    }
 +    else
 +    {
 +        strncpy(cpuid->brand, "Unknown CPU brand", GMX_CPUID_BRAND_MAXLEN);
 +    }
 +
 +    /* Find basic CPU properties */
 +    if (max_stdfn >= 1)
 +    {
 +        execute_x86cpuid(0x1, 0, &eax, &ebx, &ecx, &edx);
 +
 +        cpuid->family   = ((eax & 0x0FF00000) >> 20) + ((eax & 0x00000F00) >> 8);
 +        /* Note that extended model should be shifted left 4, so only shift right 12 iso 16. */
 +        cpuid->model    = ((eax & 0x000F0000) >> 12) + ((eax & 0x000000F0) >> 4);
 +        cpuid->stepping = (eax & 0x0000000F);
 +
 +        /* Feature flags common to AMD and intel */
 +        cpuid->feature[GMX_CPUID_FEATURE_X86_SSE3]     = (ecx & (1 << 0))  != 0;
 +        cpuid->feature[GMX_CPUID_FEATURE_X86_PCLMULDQ] = (ecx & (1 << 1))  != 0;
 +        cpuid->feature[GMX_CPUID_FEATURE_X86_SSSE3]    = (ecx & (1 << 9))  != 0;
 +        cpuid->feature[GMX_CPUID_FEATURE_X86_FMA]      = (ecx & (1 << 12)) != 0;
 +        cpuid->feature[GMX_CPUID_FEATURE_X86_CX16]     = (ecx & (1 << 13)) != 0;
 +        cpuid->feature[GMX_CPUID_FEATURE_X86_SSE4_1]   = (ecx & (1 << 19)) != 0;
 +        cpuid->feature[GMX_CPUID_FEATURE_X86_SSE4_2]   = (ecx & (1 << 20)) != 0;
 +        cpuid->feature[GMX_CPUID_FEATURE_X86_POPCNT]   = (ecx & (1 << 23)) != 0;
 +        cpuid->feature[GMX_CPUID_FEATURE_X86_AES]      = (ecx & (1 << 25)) != 0;
 +        cpuid->feature[GMX_CPUID_FEATURE_X86_AVX]      = (ecx & (1 << 28)) != 0;
 +        cpuid->feature[GMX_CPUID_FEATURE_X86_F16C]     = (ecx & (1 << 29)) != 0;
 +        cpuid->feature[GMX_CPUID_FEATURE_X86_RDRND]    = (ecx & (1 << 30)) != 0;
 +
 +        cpuid->feature[GMX_CPUID_FEATURE_X86_PSE]      = (edx & (1 << 3))  != 0;
 +        cpuid->feature[GMX_CPUID_FEATURE_X86_MSR]      = (edx & (1 << 5))  != 0;
 +        cpuid->feature[GMX_CPUID_FEATURE_X86_CX8]      = (edx & (1 << 8))  != 0;
 +        cpuid->feature[GMX_CPUID_FEATURE_X86_APIC]     = (edx & (1 << 9))  != 0;
 +        cpuid->feature[GMX_CPUID_FEATURE_X86_CMOV]     = (edx & (1 << 15)) != 0;
 +        cpuid->feature[GMX_CPUID_FEATURE_X86_CLFSH]    = (edx & (1 << 19)) != 0;
 +        cpuid->feature[GMX_CPUID_FEATURE_X86_MMX]      = (edx & (1 << 23)) != 0;
 +        cpuid->feature[GMX_CPUID_FEATURE_X86_SSE2]     = (edx & (1 << 26)) != 0;
 +        cpuid->feature[GMX_CPUID_FEATURE_X86_HTT]      = (edx & (1 << 28)) != 0;
 +    }
 +    else
 +    {
 +        cpuid->family   = -1;
 +        cpuid->model    = -1;
 +        cpuid->stepping = -1;
 +    }
 +
 +    if (max_extfn >= 0x80000001)
 +    {
 +        execute_x86cpuid(0x80000001, 0, &eax, &ebx, &ecx, &edx);
 +        cpuid->feature[GMX_CPUID_FEATURE_X86_LAHF_LM] = (ecx & (1 << 0))  != 0;
 +        cpuid->feature[GMX_CPUID_FEATURE_X86_PDPE1GB] = (edx & (1 << 26)) != 0;
 +        cpuid->feature[GMX_CPUID_FEATURE_X86_RDTSCP]  = (edx & (1 << 27)) != 0;
 +    }
 +
 +    if (max_extfn >= 0x80000007)
 +    {
 +        execute_x86cpuid(0x80000007, 0, &eax, &ebx, &ecx, &edx);
 +        cpuid->feature[GMX_CPUID_FEATURE_X86_NONSTOP_TSC]  = (edx & (1 << 8))  != 0;
 +    }
 +    return 0;
 +}
 +
 +/* This routine returns the number of unique different elements found in the array,
 + * and renumbers these starting from 0. For example, the array {0,1,2,8,9,10,8,9,10,0,1,2}
 + * will be rewritten to {0,1,2,3,4,5,3,4,5,0,1,2}, and it returns 6 for the
 + * number of unique elements.
 + */
 +static int
 +cpuid_renumber_elements(int *data, int n)
 +{
 +    int *unique;
 +    int  i, j, nunique, found;
 +
 +    unique = malloc(sizeof(int)*n);
 +
 +    nunique = 0;
 +    for (i = 0; i < n; i++)
 +    {
 +        for (j = 0, found = 0; j < nunique && !found; j++)
 +        {
 +            found = (data[i] == unique[j]);
 +        }
 +        if (!found)
 +        {
 +            /* Insert in sorted order! */
 +            for (j = nunique++; j > 0 && unique[j-1] > data[i]; j--)
 +            {
 +                unique[j] = unique[j-1];
 +            }
 +            unique[j] = data[i];
 +        }
 +    }
 +    /* renumber */
 +    for (i = 0; i < n; i++)
 +    {
 +        for (j = 0; j < nunique; j++)
 +        {
 +            if (data[i] == unique[j])
 +            {
 +                data[i] = j;
 +            }
 +        }
 +    }
 +    return nunique;
 +}
 +
 +/* APIC IDs, or everything you wanted to know about your x86 cores but were afraid to ask...
 + *
 + * Raw APIC IDs are unfortunately somewhat dirty. For technical reasons they are assigned
 + * in power-of-2 chunks, and even then there are no guarantees about specific numbers - all
 + * we know is that the part for each thread/core/package is unique, and how many bits are
 + * reserved for that part.
 + * This routine does internal renumbering so we get continuous indices, and also
 + * decodes the actual number of packages,cores-per-package and hwthreads-per-core.
 + */
 +static void
 +cpuid_x86_decode_apic_id(gmx_cpuid_t cpuid, int *apic_id, int core_bits, int hwthread_bits)
 +{
 +    int i, idx;
 +    int hwthread_mask, core_mask_after_shift;
 +
 +    cpuid->hwthread_id     = malloc(sizeof(int)*cpuid->nproc);
 +    cpuid->core_id         = malloc(sizeof(int)*cpuid->nproc);
 +    cpuid->package_id      = malloc(sizeof(int)*cpuid->nproc);
 +    cpuid->locality_order  = malloc(sizeof(int)*cpuid->nproc);
 +
 +    hwthread_mask         = (1 << hwthread_bits) - 1;
 +    core_mask_after_shift = (1 << core_bits) - 1;
 +
 +    for (i = 0; i < cpuid->nproc; i++)
 +    {
 +        cpuid->hwthread_id[i] = apic_id[i] & hwthread_mask;
 +        cpuid->core_id[i]     = (apic_id[i] >> hwthread_bits) & core_mask_after_shift;
 +        cpuid->package_id[i]  = apic_id[i] >> (core_bits + hwthread_bits);
 +    }
 +
 +    cpuid->npackages            = cpuid_renumber_elements(cpuid->package_id, cpuid->nproc);
 +    cpuid->ncores_per_package   = cpuid_renumber_elements(cpuid->core_id, cpuid->nproc);
 +    cpuid->nhwthreads_per_core  = cpuid_renumber_elements(cpuid->hwthread_id, cpuid->nproc);
 +
 +    /* Create a locality order array, i.e. first all resources in package0, which in turn
 +     * are sorted so we first have all resources in core0, where threads are sorted in order, etc.
 +     */
 +    for (i = 0; i < cpuid->nproc; i++)
 +    {
 +        idx = (cpuid->package_id[i]*cpuid->ncores_per_package + cpuid->core_id[i])*cpuid->nhwthreads_per_core + cpuid->hwthread_id[i];
 +        cpuid->locality_order[idx] = i;
 +    }
 +}
 +
 +
 +/* Detection of AMD-specific CPU features */
 +static int
 +cpuid_check_amd_x86(gmx_cpuid_t                cpuid)
 +{
 +    int                       max_stdfn, max_extfn;
 +    unsigned int              eax, ebx, ecx, edx;
 +    int                       hwthread_bits, core_bits;
 +    int *                     apic_id;
 +
 +    cpuid_check_common_x86(cpuid);
 +
 +    execute_x86cpuid(0x0, 0, &eax, &ebx, &ecx, &edx);
 +    max_stdfn = eax;
 +
 +    execute_x86cpuid(0x80000000, 0, &eax, &ebx, &ecx, &edx);
 +    max_extfn = eax;
 +
 +    if (max_extfn >= 0x80000001)
 +    {
 +        execute_x86cpuid(0x80000001, 0, &eax, &ebx, &ecx, &edx);
 +
 +        cpuid->feature[GMX_CPUID_FEATURE_X86_SSE4A]       = (ecx & (1 << 6))  != 0;
 +        cpuid->feature[GMX_CPUID_FEATURE_X86_MISALIGNSSE] = (ecx & (1 << 7))  != 0;
 +        cpuid->feature[GMX_CPUID_FEATURE_X86_XOP]         = (ecx & (1 << 11)) != 0;
 +        cpuid->feature[GMX_CPUID_FEATURE_X86_FMA4]        = (ecx & (1 << 16)) != 0;
 +    }
 +
 +    /* Query APIC information on AMD */
 +    if (max_extfn >= 0x80000008)
 +    {
 +#if (defined HAVE_SCHED_H && defined HAVE_SCHED_SETAFFINITY && defined HAVE_SYSCONF && defined __linux__)
 +        /* Linux */
 +        unsigned int   i;
 +        cpu_set_t      cpuset, save_cpuset;
 +        cpuid->nproc = sysconf(_SC_NPROCESSORS_ONLN);
 +        apic_id      = malloc(sizeof(int)*cpuid->nproc);
 +        sched_getaffinity(0, sizeof(cpu_set_t), &save_cpuset);
 +        /* Get APIC id from each core */
 +        CPU_ZERO(&cpuset);
 +        for (i = 0; i < cpuid->nproc; i++)
 +        {
 +            CPU_SET(i, &cpuset);
 +            sched_setaffinity(0, sizeof(cpu_set_t), &cpuset);
 +            execute_x86cpuid(0x1, 0, &eax, &ebx, &ecx, &edx);
 +            apic_id[i] = ebx >> 24;
 +            CPU_CLR(i, &cpuset);
 +        }
 +        /* Reset affinity to the value it had when calling this routine */
 +        sched_setaffinity(0, sizeof(cpu_set_t), &save_cpuset);
 +#define CPUID_HAVE_APIC
 +#elif defined GMX_NATIVE_WINDOWS
 +        /* Windows */
 +        DWORD_PTR     i;
 +        SYSTEM_INFO   sysinfo;
 +        unsigned int  save_affinity, affinity;
 +        GetSystemInfo( &sysinfo );
 +        cpuid->nproc  = sysinfo.dwNumberOfProcessors;
 +        apic_id       = malloc(sizeof(int)*cpuid->nproc);
 +        /* Get previous affinity mask */
 +        save_affinity = SetThreadAffinityMask(GetCurrentThread(), 1);
 +        for (i = 0; i < cpuid->nproc; i++)
 +        {
 +            SetThreadAffinityMask(GetCurrentThread(), (((DWORD_PTR)1)<<i));
 +            Sleep(0);
 +            execute_x86cpuid(0x1, 0, &eax, &ebx, &ecx, &edx);
 +            apic_id[i] = ebx >> 24;
 +        }
 +        SetThreadAffinityMask(GetCurrentThread(), save_affinity);
 +#define CPUID_HAVE_APIC
 +#endif
 +#ifdef CPUID_HAVE_APIC
 +        /* AMD does not support SMT yet - there are no hwthread bits in apic ID */
 +        hwthread_bits = 0;
 +        /* Get number of core bits in apic ID - try modern extended method first */
 +        execute_x86cpuid(0x80000008, 0, &eax, &ebx, &ecx, &edx);
 +        core_bits = (ecx >> 12) & 0xf;
 +        if (core_bits == 0)
 +        {
 +            /* Legacy method for old single/dual core AMD CPUs */
 +            int i = ecx & 0xF;
 +            for (core_bits = 0; (i>>core_bits) > 0; core_bits++)
 +            {
 +                ;
 +            }
 +        }
 +        cpuid_x86_decode_apic_id(cpuid, apic_id, core_bits, hwthread_bits);
 +        cpuid->have_cpu_topology = 1;
 +#endif
 +    }
 +    return 0;
 +}
 +
 +/* Detection of Intel-specific CPU features */
 +static int
 +cpuid_check_intel_x86(gmx_cpuid_t                cpuid)
 +{
 +    unsigned int              max_stdfn, max_extfn;
 +    unsigned int              eax, ebx, ecx, edx;
 +    unsigned int              max_logical_cores, max_physical_cores;
 +    int                       hwthread_bits, core_bits;
 +    int *                     apic_id;
 +
 +    cpuid_check_common_x86(cpuid);
 +
 +    execute_x86cpuid(0x0, 0, &eax, &ebx, &ecx, &edx);
 +    max_stdfn = eax;
 +
 +    execute_x86cpuid(0x80000000, 0, &eax, &ebx, &ecx, &edx);
 +    max_extfn = eax;
 +
 +    if (max_stdfn >= 1)
 +    {
 +        execute_x86cpuid(0x1, 0, &eax, &ebx, &ecx, &edx);
 +        cpuid->feature[GMX_CPUID_FEATURE_X86_PDCM]    = (ecx & (1 << 15)) != 0;
 +        cpuid->feature[GMX_CPUID_FEATURE_X86_PCID]    = (ecx & (1 << 17)) != 0;
 +        cpuid->feature[GMX_CPUID_FEATURE_X86_X2APIC]  = (ecx & (1 << 21)) != 0;
 +        cpuid->feature[GMX_CPUID_FEATURE_X86_TDT]     = (ecx & (1 << 24)) != 0;
 +    }
 +
 +    if (max_stdfn >= 7)
 +    {
 +        execute_x86cpuid(0x7, 0, &eax, &ebx, &ecx, &edx);
 +        cpuid->feature[GMX_CPUID_FEATURE_X86_AVX2]    = (ebx & (1 << 5))  != 0;
 +    }
 +
 +    /* Check whether Hyper-Threading is enabled, not only supported */
 +    if (cpuid->feature[GMX_CPUID_FEATURE_X86_HTT] && max_stdfn >= 4)
 +    {
 +        execute_x86cpuid(0x1, 0, &eax, &ebx, &ecx, &edx);
 +        max_logical_cores  = (ebx >> 16) & 0x0FF;
 +        execute_x86cpuid(0x4, 0, &eax, &ebx, &ecx, &edx);
 +        max_physical_cores = ((eax >> 26) & 0x3F) + 1;
 +
 +        /* Clear HTT flag if we only have 1 logical core per physical */
 +        if (max_logical_cores/max_physical_cores < 2)
 +        {
 +            cpuid->feature[GMX_CPUID_FEATURE_X86_HTT] = 0;
 +        }
 +    }
 +
 +    if (max_stdfn >= 0xB)
 +    {
 +        /* Query x2 APIC information from cores */
 +#if (defined HAVE_SCHED_H && defined HAVE_SCHED_SETAFFINITY && defined HAVE_SYSCONF && defined __linux__)
 +        /* Linux */
 +        unsigned int   i;
 +        cpu_set_t      cpuset, save_cpuset;
 +        cpuid->nproc = sysconf(_SC_NPROCESSORS_ONLN);
 +        apic_id      = malloc(sizeof(int)*cpuid->nproc);
 +        sched_getaffinity(0, sizeof(cpu_set_t), &save_cpuset);
 +        /* Get x2APIC ID from each hardware thread */
 +        CPU_ZERO(&cpuset);
 +        for (i = 0; i < cpuid->nproc; i++)
 +        {
 +            CPU_SET(i, &cpuset);
 +            sched_setaffinity(0, sizeof(cpu_set_t), &cpuset);
 +            execute_x86cpuid(0xB, 0, &eax, &ebx, &ecx, &edx);
 +            apic_id[i] = edx;
 +            CPU_CLR(i, &cpuset);
 +        }
 +        /* Reset affinity to the value it had when calling this routine */
 +        sched_setaffinity(0, sizeof(cpu_set_t), &save_cpuset);
 +#define CPUID_HAVE_APIC
 +#elif defined GMX_NATIVE_WINDOWS
 +        /* Windows */
 +        DWORD_PTR     i;
 +        SYSTEM_INFO   sysinfo;
 +        unsigned int  save_affinity, affinity;
 +        GetSystemInfo( &sysinfo );
 +        cpuid->nproc  = sysinfo.dwNumberOfProcessors;
 +        apic_id       = malloc(sizeof(int)*cpuid->nproc);
 +        /* Get previous affinity mask */
 +        save_affinity = SetThreadAffinityMask(GetCurrentThread(), 1);
 +        for (i = 0; i < cpuid->nproc; i++)
 +        {
 +            SetThreadAffinityMask(GetCurrentThread(), (((DWORD_PTR)1)<<i));
 +            Sleep(0);
 +            execute_x86cpuid(0xB, 0, &eax, &ebx, &ecx, &edx);
 +            apic_id[i] = edx;
 +        }
 +        SetThreadAffinityMask(GetCurrentThread(), save_affinity);
 +#define CPUID_HAVE_APIC
 +#endif
 +#ifdef CPUID_HAVE_APIC
 +        execute_x86cpuid(0xB, 0, &eax, &ebx, &ecx, &edx);
 +        hwthread_bits    = eax & 0x1F;
 +        execute_x86cpuid(0xB, 1, &eax, &ebx, &ecx, &edx);
 +        core_bits        = (eax & 0x1F) - hwthread_bits;
 +        cpuid_x86_decode_apic_id(cpuid, apic_id, core_bits, hwthread_bits);
 +        cpuid->have_cpu_topology = 1;
 +#endif
 +    }
 +    return 0;
 +}
 +#endif /* GMX_CPUID_X86 */
 +
 +
 +
++
++/* Copy the part of 'in' that precedes the first ':' into 's', with trailing
++ * whitespace removed. 's' becomes the empty string when 'in' has no colon.
++ *
++ * in         NUL-terminated input line, e.g. one line of /proc/cpuinfo.
++ * s          Output buffer with room for maxlength chars; always
++ *            NUL-terminated on return. Input that does not fit is truncated.
++ * maxlength  Size of s in bytes; nothing is written when it is <= 0.
++ */
++static void
++chomp_substring_before_colon(const char *in, char *s, int maxlength)
++{
++    char *p;
++
++    if (s == NULL || maxlength <= 0)
++    {
++        return;
++    }
++    strncpy(s, in, maxlength);
++    /* strncpy() does not terminate on truncation */
++    s[maxlength-1] = '\0';
++    p = strchr(s, ':');
++    if (p != NULL)
++    {
++        *p = '\0';
++        /* Check the bound before looking at the previous char, so that a
++         * colon at the very start never makes us read s[-1].
++         */
++        while (p > s && isspace((unsigned char)p[-1]))
++        {
++            *(--p) = '\0';
++        }
++    }
++    else
++    {
++        *s = '\0';
++    }
++}
++
++/* Copy the part of 'in' that follows the first ':' into 's', with leading
++ * and trailing whitespace removed. 's' becomes the empty string when 'in'
++ * has no colon or nothing but whitespace after it.
++ *
++ * in         NUL-terminated input line, e.g. one line of /proc/cpuinfo.
++ * s          Output buffer with room for maxlength chars; always
++ *            NUL-terminated on return. A value that does not fit is truncated.
++ * maxlength  Size of s in bytes; nothing is written when it is <= 0.
++ */
++static void
++chomp_substring_after_colon(const char *in, char *s, int maxlength)
++{
++    const char *start;
++    char       *end;
++
++    if (s == NULL || maxlength <= 0)
++    {
++        return;
++    }
++    start = strchr(in, ':');
++    if (start != NULL)
++    {
++        start++;
++        while (isspace((unsigned char)*start))
++        {
++            start++;
++        }
++        strncpy(s, start, maxlength);
++        /* strncpy() does not terminate on truncation; strlen() below needs it */
++        s[maxlength-1] = '\0';
++        end = s + strlen(s);
++        /* Check the bound before looking at the previous char, so that an
++         * empty value never makes us read s[-1].
++         */
++        while (end > s && isspace((unsigned char)end[-1]))
++        {
++            *(--end) = '\0';
++        }
++    }
++    else
++    {
++        *s = '\0';
++    }
++}
++
 +/* Try to find the vendor of the current CPU, so we know what specific
 + * detection routine to call.
 + */
 +static enum gmx_cpuid_vendor
 +cpuid_check_vendor(void)
 +{
 +    enum gmx_cpuid_vendor      i, vendor;
 +    /* Register data used on x86 */
 +    unsigned int               eax, ebx, ecx, edx;
 +    char                       vendorstring[13];
++    FILE *                     fp;
++    char                       buffer[255],buffer2[255];
 +
 +    /* Set default first */
 +    vendor = GMX_CPUID_VENDOR_UNKNOWN;
 +
 +#ifdef GMX_CPUID_X86
 +    execute_x86cpuid(0x0, 0, &eax, &ebx, &ecx, &edx);
 +
 +    memcpy(vendorstring, &ebx, 4);
 +    memcpy(vendorstring+4, &edx, 4);
 +    memcpy(vendorstring+8, &ecx, 4);
 +
 +    vendorstring[12] = '\0';
 +
 +    for (i = GMX_CPUID_VENDOR_UNKNOWN; i < GMX_CPUID_NVENDORS; i++)
 +    {
 +        if (!strncmp(vendorstring, gmx_cpuid_vendor_string[i], 12))
 +        {
 +            vendor = i;
 +        }
 +    }
++#elif defined(__linux__) || defined(__linux)
++    /* General Linux. Try to get CPU vendor from /proc/cpuinfo */
++    if( (fp = fopen("/proc/cpuinfo","r")) != NULL)
++    {
++        while( (vendor == GMX_CPUID_VENDOR_UNKNOWN) && (fgets(buffer,sizeof(buffer),fp) != NULL))
++        {
++            chomp_substring_before_colon(buffer,buffer2,sizeof(buffer2));
++            /* Intel/AMD use "vendor_id", IBM "vendor". Fujitsu "manufacture". Add others if you have them! */
++            if( !strcmp(buffer2,"vendor_id") || !strcmp(buffer2,"vendor") || !strcmp(buffer2,"manufacture") )
++            {
++                chomp_substring_after_colon(buffer,buffer2,sizeof(buffer2));
++                for(i=GMX_CPUID_VENDOR_UNKNOWN; i<GMX_CPUID_NVENDORS; i++)
++                {
++                    /* Be liberal and accept if we find the vendor anywhere in string */
++                    if(strstr(buffer2,gmx_cpuid_vendor_string[i]))
++                    {
++                        vendor = i;
++                    }
++                }
++            }
++        }
++    }
++    fclose(fp);
 +#else
 +    vendor = GMX_CPUID_VENDOR_UNKNOWN;
 +#endif
 +
 +    return vendor;
 +}
 +
 +
 +
 +/* Hand out the CPU topology stored in cpuid.
 + *
 + * When topology information is available, every output parameter is set and
 + * 0 is returned. The four array outputs point at storage that belongs to
 + * cpuid; the caller must not free them. When no topology was detected, -1 is
 + * returned and none of the output parameters is written.
 + */
 +int
 +gmx_cpuid_topology(gmx_cpuid_t        cpuid,
 +                   int *              nprocessors,
 +                   int *              npackages,
 +                   int *              ncores_per_package,
 +                   int *              nhwthreads_per_core,
 +                   const int **       package_id,
 +                   const int **       core_id,
 +                   const int **       hwthread_id,
 +                   const int **       locality_order)
 +{
 +    int rc;
 +
 +    if (cpuid->have_cpu_topology)
 +    {
 +        *nprocessors          = cpuid->nproc;
 +        *npackages            = cpuid->npackages;
 +        *ncores_per_package   = cpuid->ncores_per_package;
 +        *nhwthreads_per_core  = cpuid->nhwthreads_per_core;
 +        *package_id           = cpuid->package_id;
 +        *core_id              = cpuid->core_id;
 +        *hwthread_id          = cpuid->hwthread_id;
 +        *locality_order       = cpuid->locality_order;
 +        rc                    = 0;
 +    }
 +    else
 +    {
 +        rc = -1;
 +    }
 +    return rc;
 +}
 +
 +
 +/* Report whether simultaneous multithreading is enabled.
 + *
 + * With topology information the answer follows from the number of hardware
 + * threads per core. Without it, SMT is reported as disabled for AMD CPUs and
 + * for CPUs without the HTT feature flag, and as undetectable otherwise.
 + */
 +enum gmx_cpuid_x86_smt
 +gmx_cpuid_x86_smt(gmx_cpuid_t cpuid)
 +{
 +    enum gmx_cpuid_x86_smt rc;
 +
 +    if (cpuid->have_cpu_topology)
 +    {
 +        rc = (cpuid->nhwthreads_per_core > 1) ? GMX_CPUID_X86_SMT_ENABLED : GMX_CPUID_X86_SMT_DISABLED;
 +    }
 +    else if (cpuid->vendor == GMX_CPUID_VENDOR_AMD || gmx_cpuid_feature(cpuid, GMX_CPUID_FEATURE_X86_HTT) == 0)
 +    {
 +        rc = GMX_CPUID_X86_SMT_DISABLED;
 +    }
 +    else
 +    {
 +        /* HTT is flagged but we could not count threads per core */
 +        rc = GMX_CPUID_X86_SMT_CANNOTDETECT;
 +    }
 +    return rc;
 +}
 +
 +
 +int
 +gmx_cpuid_init               (gmx_cpuid_t *              pcpuid)
 +{
 +    gmx_cpuid_t cpuid;
 +    int         i;
++    FILE *      fp;
++    char        buffer[255],buffer2[255];
++    int         found_brand;
 +
 +    cpuid = malloc(sizeof(*cpuid));
 +
 +    *pcpuid = cpuid;
 +
 +    for (i = 0; i < GMX_CPUID_NFEATURES; i++)
 +    {
 +        cpuid->feature[i] = 0;
 +    }
++
 +    cpuid->have_cpu_topology   = 0;
 +    cpuid->nproc               = 0;
 +    cpuid->npackages           = 0;
 +    cpuid->ncores_per_package  = 0;
 +    cpuid->nhwthreads_per_core = 0;
 +    cpuid->package_id          = NULL;
 +    cpuid->core_id             = NULL;
 +    cpuid->hwthread_id         = NULL;
 +    cpuid->locality_order      = NULL;
 +
 +    cpuid->vendor = cpuid_check_vendor();
 +
 +    switch (cpuid->vendor)
 +    {
 +#ifdef GMX_CPUID_X86
 +        case GMX_CPUID_VENDOR_INTEL:
 +            cpuid_check_intel_x86(cpuid);
 +            break;
 +        case GMX_CPUID_VENDOR_AMD:
 +            cpuid_check_amd_x86(cpuid);
 +            break;
 +#endif
 +        default:
-             for (i = 0; i < GMX_CPUID_NFEATURES; i++)
++            /* Default value */
++            strncpy(cpuid->brand,"Unknown CPU brand",GMX_CPUID_BRAND_MAXLEN);
++#if defined(__linux__) || defined(__linux)
++            /* General Linux. Try to get CPU type from /proc/cpuinfo */
++            if( (fp = fopen("/proc/cpuinfo","r")) != NULL)
++            {
++                found_brand = 0;
++                while( (found_brand==0) && (fgets(buffer,sizeof(buffer),fp) !=NULL))
++                {
++                    chomp_substring_before_colon(buffer,buffer2,sizeof(buffer2));
++                    /* Intel uses "model name", Fujitsu and IBM "cpu". */
++                    if( !strcmp(buffer2,"model name") || !strcmp(buffer2,"cpu"))
++                    {
++                        chomp_substring_after_colon(buffer,cpuid->brand,GMX_CPUID_BRAND_MAXLEN);
++                        found_brand = 1;
++                    }
++                }
++            }
++            fclose(fp);
++#endif
 +            cpuid->family         = 0;
 +            cpuid->model          = 0;
 +            cpuid->stepping       = 0;
-                 cpuid->feature[i] = 0;
++            
++            for(i=0; i<GMX_CPUID_NFEATURES; i++)
 +            {
++                cpuid->feature[i]=0;
 +            }
 +            cpuid->feature[GMX_CPUID_FEATURE_CANNOTDETECT] = 1;
 +            break;
 +    }
 +    return 0;
 +}
 +
 +
 +
 +void
 +gmx_cpuid_done               (gmx_cpuid_t              cpuid)
 +{
 +    free(cpuid);
 +}
 +
 +
 +int
 +gmx_cpuid_formatstring       (gmx_cpuid_t              cpuid,
 +                              char *                   str,
 +                              int                      n)
 +{
 +    int                     c;
 +    int                     i;
 +    enum gmx_cpuid_feature  feature;
 +
 +#ifdef _MSC_VER
 +    _snprintf(str, n,
 +              "Vendor: %s\n"
 +              "Brand:  %s\n"
 +              "Family: %2d  Model: %2d  Stepping: %2d\n"
 +              "Features:",
 +              gmx_cpuid_vendor_string[gmx_cpuid_vendor(cpuid)],
 +              gmx_cpuid_brand(cpuid),
 +              gmx_cpuid_family(cpuid), gmx_cpuid_model(cpuid), gmx_cpuid_stepping(cpuid));
 +#else
 +    snprintf(str, n,
 +             "Vendor: %s\n"
 +             "Brand:  %s\n"
 +             "Family: %2d  Model: %2d  Stepping: %2d\n"
 +             "Features:",
 +             gmx_cpuid_vendor_string[gmx_cpuid_vendor(cpuid)],
 +             gmx_cpuid_brand(cpuid),
 +             gmx_cpuid_family(cpuid), gmx_cpuid_model(cpuid), gmx_cpuid_stepping(cpuid));
 +#endif
 +
 +    str[n-1] = '\0';
 +    c        = strlen(str);
 +    n       -= c;
 +    str     += c;
 +
 +    for (feature = GMX_CPUID_FEATURE_CANNOTDETECT; feature < GMX_CPUID_NFEATURES; feature++)
 +    {
 +        if (gmx_cpuid_feature(cpuid, feature) == 1)
 +        {
 +#ifdef _MSC_VER
 +            _snprintf(str, n, " %s", gmx_cpuid_feature_string[feature]);
 +#else
 +            snprintf(str, n, " %s", gmx_cpuid_feature_string[feature]);
 +#endif
 +            str[n-1] = '\0';
 +            c        = strlen(str);
 +            n       -= c;
 +            str     += c;
 +        }
 +    }
 +#ifdef _MSC_VER
 +    _snprintf(str, n, "\n");
 +#else
 +    snprintf(str, n, "\n");
 +#endif
 +    str[n-1] = '\0';
 +
 +    return 0;
 +}
 +
 +
 +
 +enum gmx_cpuid_acceleration
 +gmx_cpuid_acceleration_suggest  (gmx_cpuid_t                 cpuid)
 +{
 +    enum gmx_cpuid_acceleration  tmpacc;
 +
 +    tmpacc = GMX_CPUID_ACCELERATION_NONE;
 +
 +    if (gmx_cpuid_vendor(cpuid) == GMX_CPUID_VENDOR_INTEL)
 +    {
 +        if (gmx_cpuid_feature(cpuid, GMX_CPUID_FEATURE_X86_AVX))
 +        {
 +            tmpacc = GMX_CPUID_ACCELERATION_X86_AVX_256;
 +        }
 +        else if (gmx_cpuid_feature(cpuid, GMX_CPUID_FEATURE_X86_SSE4_1))
 +        {
 +            tmpacc = GMX_CPUID_ACCELERATION_X86_SSE4_1;
 +        }
 +        else if (gmx_cpuid_feature(cpuid, GMX_CPUID_FEATURE_X86_SSE2))
 +        {
 +            tmpacc = GMX_CPUID_ACCELERATION_X86_SSE2;
 +        }
 +    }
 +    else if (gmx_cpuid_vendor(cpuid) == GMX_CPUID_VENDOR_AMD)
 +    {
 +        if (gmx_cpuid_feature(cpuid, GMX_CPUID_FEATURE_X86_AVX))
 +        {
 +            tmpacc = GMX_CPUID_ACCELERATION_X86_AVX_128_FMA;
 +        }
 +        else if (gmx_cpuid_feature(cpuid, GMX_CPUID_FEATURE_X86_SSE4_1))
 +        {
 +            tmpacc = GMX_CPUID_ACCELERATION_X86_SSE4_1;
 +        }
 +        else if (gmx_cpuid_feature(cpuid, GMX_CPUID_FEATURE_X86_SSE2))
 +        {
 +            tmpacc = GMX_CPUID_ACCELERATION_X86_SSE2;
 +        }
 +    }
++    else if(gmx_cpuid_vendor(cpuid)==GMX_CPUID_VENDOR_FUJITSU)
++    {
++        if(strstr(gmx_cpuid_brand(cpuid),"SPARC64"))
++        {
++            tmpacc = GMX_CPUID_ACCELERATION_SPARC64_HPC_ACE;
++        }
++    }
 +    return tmpacc;
 +}
 +
 +
 +
 +int
 +gmx_cpuid_acceleration_check(gmx_cpuid_t   cpuid,
 +                             FILE *        log)
 +{
 +    int                           rc;
 +    char                          str[1024];
 +    enum gmx_cpuid_acceleration   acc;
 +
 +    acc = gmx_cpuid_acceleration_suggest(cpuid);
 +
 +    rc = (acc != compiled_acc);
 +
 +    gmx_cpuid_formatstring(cpuid, str, 1023);
 +    str[1023] = '\0';
 +
 +    if (log != NULL)
 +    {
 +        fprintf(log,
 +                "\nDetecting CPU-specific acceleration.\nPresent hardware specification:\n"
 +                "%s"
 +                "Acceleration most likely to fit this hardware: %s\n"
 +                "Acceleration selected at GROMACS compile time: %s\n\n",
 +                str,
 +                gmx_cpuid_acceleration_string[acc],
 +                gmx_cpuid_acceleration_string[compiled_acc]);
 +    }
 +
 +    if (rc != 0)
 +    {
 +        if (log != NULL)
 +        {
 +            fprintf(log, "\nBinary not matching hardware - you might be losing performance.\n"
 +                    "Acceleration most likely to fit this hardware: %s\n"
 +                    "Acceleration selected at GROMACS compile time: %s\n\n",
 +                    gmx_cpuid_acceleration_string[acc],
 +                    gmx_cpuid_acceleration_string[compiled_acc]);
 +        }
 +        printf("Compiled acceleration: %s (Gromacs could use %s on this machine, which is better)\n",
 +               gmx_cpuid_acceleration_string[compiled_acc],
 +               gmx_cpuid_acceleration_string[acc]);
 +    }
 +    return rc;
 +}
 +
 +
 +#ifdef GMX_CPUID_STANDALONE
 +/* Stand-alone program to enable queries of CPU features from Cmake.
 + * Note that you need to check inline ASM capabilities before compiling and set
 + * -DGMX_X86_GCC_INLINE_ASM for the cpuid instruction to work...
 + */
 +int
 +main(int argc, char **argv)
 +{
 +    gmx_cpuid_t                   cpuid;
 +    enum gmx_cpuid_acceleration   acc;
 +    int                           i, cnt;
 +
 +    if (argc < 2)
 +    {
 +        fprintf(stdout,
 +                "Usage:\n\n%s [flags]\n\n"
 +                "Available flags:\n"
 +                "-vendor        Print CPU vendor.\n"
 +                "-brand         Print CPU brand string.\n"
 +                "-family        Print CPU family version.\n"
 +                "-model         Print CPU model version.\n"
 +                "-stepping      Print CPU stepping version.\n"
 +                "-features      Print CPU feature flags.\n"
 +                "-acceleration  Print suggested GROMACS acceleration.\n",
 +                argv[0]);
 +        exit(0);
 +    }
 +
 +    gmx_cpuid_init(&cpuid);
 +
 +    if (!strncmp(argv[1], "-vendor", 3))
 +    {
 +        printf("%s\n", gmx_cpuid_vendor_string[cpuid->vendor]);
 +    }
 +    else if (!strncmp(argv[1], "-brand", 3))
 +    {
 +        printf("%s\n", cpuid->brand);
 +    }
 +    else if (!strncmp(argv[1], "-family", 3))
 +    {
 +        printf("%d\n", cpuid->family);
 +    }
 +    else if (!strncmp(argv[1], "-model", 3))
 +    {
 +        printf("%d\n", cpuid->model);
 +    }
 +    else if (!strncmp(argv[1], "-stepping", 3))
 +    {
 +        printf("%d\n", cpuid->stepping);
 +    }
 +    else if (!strncmp(argv[1], "-features", 3))
 +    {
 +        cnt = 0;
 +        for (i = 0; i < GMX_CPUID_NFEATURES; i++)
 +        {
 +            if (cpuid->feature[i] == 1)
 +            {
 +                if (cnt++ > 0)
 +                {
 +                    printf(" ");
 +                }
 +                printf("%s", gmx_cpuid_feature_string[i]);
 +            }
 +        }
 +        printf("\n");
 +    }
 +    else if (!strncmp(argv[1], "-acceleration", 3))
 +    {
 +        acc = gmx_cpuid_acceleration_suggest(cpuid);
 +        fprintf(stdout, "%s\n", gmx_cpuid_acceleration_string[acc]);
 +    }
 +
 +    gmx_cpuid_done(cpuid);
 +
 +
 +    return 0;
 +}
 +
 +#endif
index 5fcbe2da7f71d7c759415118c4a757961f2671f3,0000000000000000000000000000000000000000..1787371a5f37fd122c0027ace84965034af85c4f
mode 100644,000000..100644
--- /dev/null
@@@ -1,40 -1,0 +1,45 @@@
- if(GMX_CPU_ACCELERATION STREQUAL "SSE2" AND NOT GMX_DOUBLE)
 +# Sources that should always be built
 +file(GLOB NONBONDED_SOURCES *.c nb_kernel_c/*.c)
 +
- if(GMX_CPU_ACCELERATION STREQUAL "SSE4.1" AND NOT GMX_DOUBLE)
++if("${GMX_CPU_ACCELERATION}" STREQUAL "SSE2" AND NOT GMX_DOUBLE)
 +    file(GLOB NONBONDED_SSE2_SINGLE_SOURCES nb_kernel_sse2_single/*.c)
 +endif()
 +
- if(GMX_CPU_ACCELERATION STREQUAL "AVX_128_FMA" AND NOT GMX_DOUBLE)
++if("${GMX_CPU_ACCELERATION}" STREQUAL "SSE4.1" AND NOT GMX_DOUBLE)
 +    file(GLOB NONBONDED_SSE4_1_SINGLE_SOURCES nb_kernel_sse4_1_single/*.c)
 +endif()
 +
- if(GMX_CPU_ACCELERATION STREQUAL "AVX_256" AND NOT GMX_DOUBLE)
++if("${GMX_CPU_ACCELERATION}" STREQUAL "AVX_128_FMA" AND NOT GMX_DOUBLE)
 +    file(GLOB NONBONDED_AVX_128_FMA_SINGLE_SOURCES nb_kernel_avx_128_fma_single/*.c)
 +endif()
 +
- if(GMX_CPU_ACCELERATION STREQUAL "SSE2" AND GMX_DOUBLE)
++if("${GMX_CPU_ACCELERATION}" STREQUAL "AVX_256" AND NOT GMX_DOUBLE)
 +    file(GLOB NONBONDED_AVX_256_SINGLE_SOURCES nb_kernel_avx_256_single/*.c)
 +endif()
 +
- if(GMX_CPU_ACCELERATION STREQUAL "SSE4.1" AND GMX_DOUBLE)
++if("${GMX_CPU_ACCELERATION}" STREQUAL "SSE2" AND GMX_DOUBLE)
 +    file(GLOB NONBONDED_SSE2_DOUBLE_SOURCES nb_kernel_sse2_double/*.c)
 +endif()
 +
- if(GMX_CPU_ACCELERATION STREQUAL "AVX_128_FMA" AND GMX_DOUBLE)
++if("${GMX_CPU_ACCELERATION}" STREQUAL "SSE4.1" AND GMX_DOUBLE)
 +    file(GLOB NONBONDED_SSE4_1_DOUBLE_SOURCES nb_kernel_sse4_1_double/*.c)
 +endif()
 +
- if(GMX_CPU_ACCELERATION STREQUAL "AVX_256" AND GMX_DOUBLE)
++if("${GMX_CPU_ACCELERATION}" STREQUAL "AVX_128_FMA" AND GMX_DOUBLE)
 +    file(GLOB NONBONDED_AVX_128_FMA_DOUBLE_SOURCES nb_kernel_avx_128_fma_double/*.c)
 +endif()
 +
- set(NONBONDED_SOURCES ${NONBONDED_SOURCES} ${NONBONDED_SSE2_SINGLE_SOURCES} ${NONBONDED_SSE4_1_SINGLE_SOURCES} ${NONBONDED_AVX_128_FMA_SINGLE_SOURCES} ${NONBONDED_AVX_256_SINGLE_SOURCES} ${NONBONDED_SSE2_DOUBLE_SOURCES} ${NONBONDED_SSE4_1_DOUBLE_SOURCES} ${NONBONDED_AVX_128_FMA_DOUBLE_SOURCES} ${NONBONDED_AVX_256_DOUBLE_SOURCES} PARENT_SCOPE)
++if("${GMX_CPU_ACCELERATION}" STREQUAL "AVX_256" AND GMX_DOUBLE)
 +    file(GLOB NONBONDED_AVX_256_DOUBLE_SOURCES nb_kernel_avx_256_double/*.c)
 +endif()
 +
++if("${GMX_CPU_ACCELERATION}" STREQUAL "Sparc64_HPC_ACE" AND GMX_DOUBLE)
++    file(GLOB NONBONDED_SPARC64_HPC_ACE_DOUBLE_SOURCES nb_kernel_sparc64_hpc_ace_double/*.c)
++endif()
++
++
 +# These sources will be used in the parent directory's CMakeLists.txt
++set(NONBONDED_SOURCES ${NONBONDED_SOURCES} ${NONBONDED_SSE2_SINGLE_SOURCES} ${NONBONDED_SSE4_1_SINGLE_SOURCES} ${NONBONDED_AVX_128_FMA_SINGLE_SOURCES} ${NONBONDED_AVX_256_SINGLE_SOURCES} ${NONBONDED_SSE2_DOUBLE_SOURCES} ${NONBONDED_SSE4_1_DOUBLE_SOURCES} ${NONBONDED_AVX_128_FMA_DOUBLE_SOURCES} ${NONBONDED_AVX_256_DOUBLE_SOURCES} ${NONBONDED_SPARC64_HPC_ACE_DOUBLE_SOURCES} PARENT_SCOPE)
 +
 +
 +
index 0000000000000000000000000000000000000000,dfd38394f3758f5a22985eb3020f321f57f5d907..dfd38394f3758f5a22985eb3020f321f57f5d907
mode 000000,100644..100644
--- /dev/null
@@@ -1,0 -1,945 +1,945 @@@
+ /*
+  *                This source code is part of
+  *
+  *                 G   R   O   M   A   C   S
+  *
+  * Copyright (c) 2011-2012, The GROMACS Development Team
+  *
+  * Gromacs is a library for molecular simulation and trajectory analysis,
+  * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
+  * a full list of developers and information, check out http://www.gromacs.org
+  *
+  * This program is free software; you can redistribute it and/or modify it under 
+  * the terms of the GNU Lesser General Public License as published by the Free 
+  * Software Foundation; either version 2 of the License, or (at your option) any 
+  * later version.
+  * As a special exception, you may use this file as part of a free software
+  * library without restriction.  Specifically, if other files instantiate
+  * templates or use macros or inline functions from this file, or you compile
+  * this file and link it with other files to produce an executable, this
+  * file does not by itself cause the resulting executable to be covered by
+  * the GNU Lesser General Public License.
+  *
+  * In plain-speak: do not worry about classes/macros/templates either - only
+  * changes to the library have to be LGPL, not an application linking with it.
+  *
+  * To help fund GROMACS development, we humbly ask that you cite
+  * the papers people have written on it - you can find them on the website!
+  */
+ #ifndef _kernelutil_sparc64_hpc_ace_double_h_
+ #define _kernelutil_sparc64_hpc_ace_double_h_
+ /* Fujitsu header borrows the name from SSE2, since some instructions have aliases */
+ #include "emmintrin.h"
+ /* Combine two selector values into the immediate that this file passes as
+  * the last argument of _fjsp_shuffle_v2r8(): x goes to bit 1, y to bit 0.
+  */
+ #define GMX_FJSP_SHUFFLE2(x,y) (((x)<<1) | (y))
+ /* In-place transpose of two v2r8 rows: row0 becomes
+  * _fjsp_unpacklo_v2r8(row0,row1) and row1 becomes
+  * _fjsp_unpackhi_v2r8(row0,row1), both computed from the original row0.
+  * Both arguments must be modifiable lvalues. The temporary deliberately
+  * avoids a double-underscore name, which is reserved for the implementation.
+  */
+ #define GMX_FJSP_TRANSPOSE2_V2R8(row0, row1) {                            \
+     _fjsp_v2r8 gmx_fjsp_transpose2_tmp_ = row0;                           \
+     row0           = _fjsp_unpacklo_v2r8(row0,row1);                      \
+     row1           = _fjsp_unpackhi_v2r8(gmx_fjsp_transpose2_tmp_,row1);  \
+ }
+ /* Debug helper: print the label s followed by the two doubles extracted
+  * from a with _fjsp_storel_v2r8() and _fjsp_storeh_v2r8(), in that order.
+  */
+ static void
+ gmx_fjsp_print_v2r8(const char *s, _fjsp_v2r8 a)
+ {
+     double elem[2];
+
+     _fjsp_storel_v2r8(&elem[0],a);
+     _fjsp_storeh_v2r8(&elem[1],a);
+     printf("%s: %g %g\n",s,elem[0],elem[1]);
+ }
+ /* Build a v2r8 value by passing d for both arguments of _fjsp_set_v2r8(). */
+ static _fjsp_v2r8
+ gmx_fjsp_set1_v2r8(double d)
+ {
+     const double value = d;
+
+     return _fjsp_set_v2r8(value,value);
+ }
+ /* Read the double at ptr and replicate it through gmx_fjsp_set1_v2r8(). */
+ static _fjsp_v2r8
+ gmx_fjsp_load1_v2r8(const double * gmx_restrict ptr)
+ {
+     const double value = *ptr;
+
+     return gmx_fjsp_set1_v2r8(value);
+ }
+ /* Return 1 when the _fjsp_cmplt_v2r8(a,b) comparison mask has any bit set
+  * in either element, 0 otherwise. The mask is OR-ed with its own
+  * unpackhi(mask,mask) so a single storel captures both elements, and the
+  * stored double is then inspected as an integer through a union.
+  */
+ static int
+ gmx_fjsp_any_lt_v2r8(_fjsp_v2r8 a, _fjsp_v2r8 b)
+ {
+     union
+     {
+         double           d;
+         long long int    i;
+     }
+     bits;
+     _fjsp_v2r8 mask,folded;
+
+     mask   = _fjsp_cmplt_v2r8(a,b);
+     folded = _fjsp_or_v2r8(mask, _fjsp_unpackhi_v2r8(mask,mask));
+     _fjsp_storel_v2r8(&(bits.d),folded);
+
+     return (bits.i != 0);
+ }
+ /* 1.0/sqrt(x)
+  *
+  * Starts from the _fjsp_rsqrta_v2r8() estimate and refines it with
+  * Newton-Raphson steps of the form
+  *     lu <- (0.5*lu) * nmsub(lu*lu, x, 3.0)
+  *
+  * The HPC-ACE instruction set is only available in double precision, while
+  * single precision is typically sufficient for Gromacs. If you define
+  * "GMX_RELAXED_DOUBLE_PRECISION" during compile, we stick to two
+  * Newton-Raphson iterations and accept 32bits of accuracy in 1.0/sqrt(x)
+  * and 1.0/x, rather than full double precision (53 bits). This is still
+  * clearly higher than single precision (24 bits). Otherwise three
+  * iterations are performed.
+  */
+ static gmx_inline _fjsp_v2r8
+ gmx_fjsp_invsqrt_v2r8(_fjsp_v2r8 x)
+ {
+     const _fjsp_v2r8 half  = gmx_fjsp_set1_v2r8(0.5);
+     const _fjsp_v2r8 three = gmx_fjsp_set1_v2r8(3.0);
+     _fjsp_v2r8       est,half_est,corr;
+
+     est      = _fjsp_rsqrta_v2r8(x);
+
+     /* Iteration 1 */
+     half_est = _fjsp_mul_v2r8(half,est);
+     corr     = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(est,est),x,three);
+     est      = _fjsp_mul_v2r8(half_est,corr);
+
+ #ifndef GMX_RELAXED_DOUBLE_PRECISION
+     /* Extra iteration, only for full double precision */
+     half_est = _fjsp_mul_v2r8(half,est);
+     corr     = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(est,est),x,three);
+     est      = _fjsp_mul_v2r8(half_est,corr);
+ #endif
+
+     /* Final iteration */
+     half_est = _fjsp_mul_v2r8(half,est);
+     corr     = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(est,est),x,three);
+     return _fjsp_mul_v2r8(half_est,corr);
+ }
+ /* 1.0/x
+  *
+  * Starts from the _fjsp_rcpa_v2r8() estimate and refines it with
+  * Newton-Raphson steps of the form
+  *     lu <- lu * nmsub(lu, x, 2.0)
+  *
+  * The HPC-ACE instruction set is only available in double precision, while
+  * single precision is typically sufficient for Gromacs. If you define
+  * "GMX_RELAXED_DOUBLE_PRECISION" during compile, we stick to two
+  * Newton-Raphson iterations and accept 32bits of accuracy in 1.0/sqrt(x)
+  * and 1.0/x, rather than full double precision (53 bits). This is still
+  * clearly higher than single precision (24 bits). Otherwise three
+  * iterations are performed.
+  */
+ static gmx_inline _fjsp_v2r8
+ gmx_fjsp_inv_v2r8(_fjsp_v2r8 x)
+ {
+     const _fjsp_v2r8 two  = gmx_fjsp_set1_v2r8(2.0);
+     /* Declared with the same vector type as the rest of this file */
+     _fjsp_v2r8       lu   = _fjsp_rcpa_v2r8(x);
+
+     /* Iteration 1 */
+     lu         = _fjsp_mul_v2r8(lu,_fjsp_nmsub_v2r8(lu,x,two));
+ #ifndef GMX_RELAXED_DOUBLE_PRECISION
+     /* Extra iteration, only for full double precision */
+     lu         = _fjsp_mul_v2r8(lu,_fjsp_nmsub_v2r8(lu,x,two));
+ #endif
+     /* Final iteration */
+     return _fjsp_mul_v2r8(lu,_fjsp_nmsub_v2r8(lu,x,two));
+ }
+ /* Squared length of (dx,dy,dz), accumulated as
+  * madd(dx,dx, madd(dy,dy, dz*dz)).
+  */
+ static gmx_inline _fjsp_v2r8
+ gmx_fjsp_calc_rsq_v2r8(_fjsp_v2r8 dx, _fjsp_v2r8 dy, _fjsp_v2r8 dz)
+ {
+     const _fjsp_v2r8 zz    = _fjsp_mul_v2r8(dz,dz);
+     const _fjsp_v2r8 yy_zz = _fjsp_madd_v2r8(dy,dy,zz);
+
+     return _fjsp_madd_v2r8(dx,dx,yy_zz);
+ }
+ /* Normal sum of four _fjsp_v2r8 values */
+ #define gmx_fjsp_sum4_v2r8(t0,t1,t2,t3)  _fjsp_add_v2r8(_fjsp_add_v2r8(t0,t1),_fjsp_add_v2r8(t2,t3))
+ /* Load one double from ptrA and one from ptrB (each with
+  * _fjsp_loadl_v2r8() onto a zeroed vector) and combine the two results
+  * with _fjsp_unpacklo_v2r8(), ptrA's value first.
+  */
+ static _fjsp_v2r8
+ gmx_fjsp_load_2real_swizzle_v2r8(const double * gmx_restrict ptrA,
+                                  const double * gmx_restrict ptrB)
+ {
+     const _fjsp_v2r8 fromA = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),ptrA);
+     const _fjsp_v2r8 fromB = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),ptrB);
+
+     return _fjsp_unpacklo_v2r8(fromA,fromB);
+ }
+ /* Load the double at ptrA with _fjsp_loadl_v2r8() onto a zeroed vector. */
+ static _fjsp_v2r8
+ gmx_fjsp_load_1real_v2r8(const double * gmx_restrict ptrA)
+ {
+     const _fjsp_v2r8 zero = _fjsp_setzero_v2r8();
+
+     return _fjsp_loadl_v2r8(zero,ptrA);
+ }
+ /* Scatter xmm1 to two locations: storel(xmm1) goes to ptrA and
+  * storel(unpackhi(xmm1,xmm1)) goes to ptrB.
+  */
+ static void
+ gmx_fjsp_store_2real_swizzle_v2r8(double * gmx_restrict ptrA,
+                                 double * gmx_restrict ptrB,
+                                 _fjsp_v2r8 xmm1)
+ {
+     const _fjsp_v2r8 upper = _fjsp_unpackhi_v2r8(xmm1,xmm1);
+
+     _fjsp_storel_v2r8(ptrA,xmm1);
+     _fjsp_storel_v2r8(ptrB,upper);
+ }
+ /* Write xmm1 to ptrA with a single _fjsp_storel_v2r8() call. */
+ static void
+ gmx_fjsp_store_1real_v2r8(double * gmx_restrict ptrA, _fjsp_v2r8 xmm1)
+ {
+     double * gmx_restrict dest = ptrA;
+
+     _fjsp_storel_v2r8(dest,xmm1);
+ }
+ /* Like gmx_fjsp_store_2real_swizzle_v2r8(), but adds to the values already
+  * in memory: xmm1 plus the double at ptrA is stored back to ptrA, and
+  * unpackhi(xmm1,xmm1) plus the double at ptrB is stored back to ptrB.
+  */
+ static void
+ gmx_fjsp_increment_2real_swizzle_v2r8(double * gmx_restrict ptrA,
+                                     double * gmx_restrict ptrB, _fjsp_v2r8 xmm1)
+ {
+     _fjsp_v2r8 sumA,sumB;
+
+     sumB = _fjsp_unpackhi_v2r8(xmm1,xmm1);
+     sumA = _fjsp_add_v2r8(xmm1,_fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),ptrA));
+     sumB = _fjsp_add_v2r8(sumB,_fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),ptrB));
+     _fjsp_storel_v2r8(ptrA,sumA);
+     _fjsp_storel_v2r8(ptrB,sumB);
+ }
+ /* Add xmm1 to the double loaded from ptrA and store the sum back to ptrA
+  * with _fjsp_storel_v2r8().
+  */
+ static void
+ gmx_fjsp_increment_1real_v2r8(double * gmx_restrict ptrA, _fjsp_v2r8 xmm1)
+ {
+     const _fjsp_v2r8 old = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),ptrA);
+
+     _fjsp_storel_v2r8(ptrA,_fjsp_add_v2r8(old,xmm1));
+ }
+ /* Load one (c6,c12) pair from p1 and one from p2 and regroup them:
+  * *c6 receives unpacklo(pair1,pair2) and *c12 receives
+  * unpackhi(pair1,pair2).
+  *
+  * p1, p2   Each points at two consecutive doubles, read with
+  *          _fjsp_load_v2r8(); the c6/c12 array should be aligned.
+  * c6, c12  Outputs.
+  */
+ static gmx_inline void
+ gmx_fjsp_load_2pair_swizzle_v2r8(const double * gmx_restrict p1,
+                              const double * gmx_restrict p2,
+                              _fjsp_v2r8 * gmx_restrict c6,
+                              _fjsp_v2r8 * gmx_restrict c12)
+ {
+     _fjsp_v2r8 t1,t2;
+
+     /* The c6/c12 array should be aligned */
+     t1   = _fjsp_load_v2r8(p1);
+     t2   = _fjsp_load_v2r8(p2);
+     *c6  = _fjsp_unpacklo_v2r8(t1,t2);
+     *c12 = _fjsp_unpackhi_v2r8(t1,t2);
+ }
+ /* Single-pair variant: *c6 is loaded from p1[0] and *c12 from p1[1], each
+  * with _fjsp_loadl_v2r8() onto a zeroed vector.
+  */
+ static gmx_inline void
+ gmx_fjsp_load_1pair_swizzle_v2r8(const double * gmx_restrict p1,
+                              _fjsp_v2r8 * gmx_restrict c6,
+                              _fjsp_v2r8 * gmx_restrict c12)
+ {
+     const _fjsp_v2r8 zero = _fjsp_setzero_v2r8();
+
+     *c6     = _fjsp_loadl_v2r8(zero,p1);
+     *c12    = _fjsp_loadl_v2r8(zero,p1+1);
+ }
+ /* Add the three doubles at xyz_shift to the three doubles at xyz and
+  * broadcast each sum: x1, y1 and z1 are produced by _fjsp_shuffle_v2r8()
+  * of the summed vector with itself, selecting (0,0), (1,1) and (0,0).
+  */
+ static gmx_inline void
+ gmx_fjsp_load_shift_and_1rvec_broadcast_v2r8(const double * gmx_restrict xyz_shift,
+                                          const double * gmx_restrict xyz,
+                                          _fjsp_v2r8 * gmx_restrict x1,
+                                          _fjsp_v2r8 * gmx_restrict y1,
+                                          _fjsp_v2r8 * gmx_restrict z1)
+ {
+     _fjsp_v2r8 xy,z,shift_xy,shift_z;
+
+     xy       = _fjsp_load_v2r8(xyz);
+     z        = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),xyz+2);
+     shift_xy = _fjsp_load_v2r8(xyz_shift);
+     shift_z  = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),xyz_shift+2);
+
+     xy       = _fjsp_add_v2r8(xy,shift_xy);
+     z        = _fjsp_add_v2r8(z,shift_z);
+
+     *x1  = _fjsp_shuffle_v2r8(xy,xy,GMX_FJSP_SHUFFLE2(0,0));
+     *y1  = _fjsp_shuffle_v2r8(xy,xy,GMX_FJSP_SHUFFLE2(1,1));
+     *z1  = _fjsp_shuffle_v2r8(z,z,GMX_FJSP_SHUFFLE2(0,0));
+ }
+ /* Three-vector version of the shift-and-broadcast load: reads nine
+  * consecutive doubles from xyz, adds the three doubles at xyz_shift to
+  * each group of three, and broadcasts every sum into its own output.
+  * The shift is rearranged into (xy), (zx) and (yz) pairings so it lines up
+  * with the two-element loads of xyz.
+  */
+ static gmx_inline void
+ gmx_fjsp_load_shift_and_3rvec_broadcast_v2r8(const double * gmx_restrict xyz_shift,
+                                          const double * gmx_restrict xyz,
+                                          _fjsp_v2r8 * gmx_restrict x1, _fjsp_v2r8 * gmx_restrict y1, _fjsp_v2r8 * gmx_restrict z1,
+                                          _fjsp_v2r8 * gmx_restrict x2, _fjsp_v2r8 * gmx_restrict y2, _fjsp_v2r8 * gmx_restrict z2,
+                                          _fjsp_v2r8 * gmx_restrict x3, _fjsp_v2r8 * gmx_restrict y3, _fjsp_v2r8 * gmx_restrict z3)
+ {
+     _fjsp_v2r8 x1y1,z1x2,y2z2,x3y3,z3only;
+     _fjsp_v2r8 sh_xy,sh_z,sh_zx,sh_yz;
+
+     x1y1   = _fjsp_load_v2r8(xyz);
+     z1x2   = _fjsp_load_v2r8(xyz+2);
+     y2z2   = _fjsp_load_v2r8(xyz+4);
+     x3y3   = _fjsp_load_v2r8(xyz+6);
+     z3only = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),xyz+8);
+
+     sh_xy  = _fjsp_load_v2r8(xyz_shift);
+     sh_z   = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),xyz_shift+2);
+     sh_zx  = _fjsp_shuffle_v2r8(sh_z,sh_xy,GMX_FJSP_SHUFFLE2(0,0));
+     sh_yz  = _fjsp_shuffle_v2r8(sh_xy,sh_z,GMX_FJSP_SHUFFLE2(0,1));
+
+     x1y1   = _fjsp_add_v2r8(x1y1,sh_xy);
+     z1x2   = _fjsp_add_v2r8(z1x2,sh_zx);
+     y2z2   = _fjsp_add_v2r8(y2z2,sh_yz);
+     x3y3   = _fjsp_add_v2r8(x3y3,sh_xy);
+     z3only = _fjsp_add_v2r8(z3only,sh_z);
+
+     *x1  = _fjsp_shuffle_v2r8(x1y1,x1y1,GMX_FJSP_SHUFFLE2(0,0));
+     *y1  = _fjsp_shuffle_v2r8(x1y1,x1y1,GMX_FJSP_SHUFFLE2(1,1));
+     *z1  = _fjsp_shuffle_v2r8(z1x2,z1x2,GMX_FJSP_SHUFFLE2(0,0));
+     *x2  = _fjsp_shuffle_v2r8(z1x2,z1x2,GMX_FJSP_SHUFFLE2(1,1));
+     *y2  = _fjsp_shuffle_v2r8(y2z2,y2z2,GMX_FJSP_SHUFFLE2(0,0));
+     *z2  = _fjsp_shuffle_v2r8(y2z2,y2z2,GMX_FJSP_SHUFFLE2(1,1));
+     *x3  = _fjsp_shuffle_v2r8(x3y3,x3y3,GMX_FJSP_SHUFFLE2(0,0));
+     *y3  = _fjsp_shuffle_v2r8(x3y3,x3y3,GMX_FJSP_SHUFFLE2(1,1));
+     *z3  = _fjsp_shuffle_v2r8(z3only,z3only,GMX_FJSP_SHUFFLE2(0,0));
+ }
+ /* Four-vector version of the shift-and-broadcast load: reads twelve
+  * consecutive doubles from xyz in six two-element loads, adds the three
+  * doubles at xyz_shift (rearranged into (xy), (zx) and (yz) pairings) and
+  * broadcasts every sum into its own output.
+  */
+ static gmx_inline void
+ gmx_fjsp_load_shift_and_4rvec_broadcast_v2r8(const double * gmx_restrict xyz_shift,
+                                          const double * gmx_restrict xyz,
+                                          _fjsp_v2r8 * gmx_restrict x1, _fjsp_v2r8 * gmx_restrict y1, _fjsp_v2r8 * gmx_restrict z1,
+                                          _fjsp_v2r8 * gmx_restrict x2, _fjsp_v2r8 * gmx_restrict y2, _fjsp_v2r8 * gmx_restrict z2,
+                                          _fjsp_v2r8 * gmx_restrict x3, _fjsp_v2r8 * gmx_restrict y3, _fjsp_v2r8 * gmx_restrict z3,
+                                          _fjsp_v2r8 * gmx_restrict x4, _fjsp_v2r8 * gmx_restrict y4, _fjsp_v2r8 * gmx_restrict z4)
+ {
+     _fjsp_v2r8 x1y1,z1x2,y2z2,x3y3,z3x4,y4z4;
+     _fjsp_v2r8 sh_xy,sh_z,sh_zx,sh_yz;
+
+     x1y1  = _fjsp_load_v2r8(xyz);
+     z1x2  = _fjsp_load_v2r8(xyz+2);
+     y2z2  = _fjsp_load_v2r8(xyz+4);
+     x3y3  = _fjsp_load_v2r8(xyz+6);
+     z3x4  = _fjsp_load_v2r8(xyz+8);
+     y4z4  = _fjsp_load_v2r8(xyz+10);
+
+     sh_xy = _fjsp_load_v2r8(xyz_shift);
+     sh_z  = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),xyz_shift+2);
+     sh_zx = _fjsp_shuffle_v2r8(sh_z,sh_xy,GMX_FJSP_SHUFFLE2(0,0));
+     sh_yz = _fjsp_shuffle_v2r8(sh_xy,sh_z,GMX_FJSP_SHUFFLE2(0,1));
+
+     x1y1  = _fjsp_add_v2r8(x1y1,sh_xy);
+     z1x2  = _fjsp_add_v2r8(z1x2,sh_zx);
+     y2z2  = _fjsp_add_v2r8(y2z2,sh_yz);
+     x3y3  = _fjsp_add_v2r8(x3y3,sh_xy);
+     z3x4  = _fjsp_add_v2r8(z3x4,sh_zx);
+     y4z4  = _fjsp_add_v2r8(y4z4,sh_yz);
+
+     *x1  = _fjsp_shuffle_v2r8(x1y1,x1y1,GMX_FJSP_SHUFFLE2(0,0));
+     *y1  = _fjsp_shuffle_v2r8(x1y1,x1y1,GMX_FJSP_SHUFFLE2(1,1));
+     *z1  = _fjsp_shuffle_v2r8(z1x2,z1x2,GMX_FJSP_SHUFFLE2(0,0));
+     *x2  = _fjsp_shuffle_v2r8(z1x2,z1x2,GMX_FJSP_SHUFFLE2(1,1));
+     *y2  = _fjsp_shuffle_v2r8(y2z2,y2z2,GMX_FJSP_SHUFFLE2(0,0));
+     *z2  = _fjsp_shuffle_v2r8(y2z2,y2z2,GMX_FJSP_SHUFFLE2(1,1));
+     *x3  = _fjsp_shuffle_v2r8(x3y3,x3y3,GMX_FJSP_SHUFFLE2(0,0));
+     *y3  = _fjsp_shuffle_v2r8(x3y3,x3y3,GMX_FJSP_SHUFFLE2(1,1));
+     *z3  = _fjsp_shuffle_v2r8(z3x4,z3x4,GMX_FJSP_SHUFFLE2(0,0));
+     *x4  = _fjsp_shuffle_v2r8(z3x4,z3x4,GMX_FJSP_SHUFFLE2(1,1));
+     *y4  = _fjsp_shuffle_v2r8(y4z4,y4z4,GMX_FJSP_SHUFFLE2(0,0));
+     *z4  = _fjsp_shuffle_v2r8(y4z4,y4z4,GMX_FJSP_SHUFFLE2(1,1));
+ }
+ /* Load p1[0], p1[1] and p1[2] into *x, *y and *z, each with
+  * _fjsp_loadl_v2r8() onto a zeroed vector.
+  */
+ static gmx_inline void
+ gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(const double * gmx_restrict p1,
+                                   _fjsp_v2r8 * gmx_restrict x, _fjsp_v2r8 * gmx_restrict y, _fjsp_v2r8 * gmx_restrict z)
+ {
+     const _fjsp_v2r8 zero = _fjsp_setzero_v2r8();
+
+     *x = _fjsp_loadl_v2r8(zero,p1);
+     *y = _fjsp_loadl_v2r8(zero,p1+1);
+     *z = _fjsp_loadl_v2r8(zero,p1+2);
+ }
+ /* Load nine consecutive doubles p1[0..8] into x1,y1,z1,x2,...,z3 in that
+  * order, each with _fjsp_loadl_v2r8() onto a zeroed vector.
+  */
+ static gmx_inline void
+ gmx_fjsp_load_3rvec_1ptr_swizzle_v2r8(const double * gmx_restrict p1,
+                                   _fjsp_v2r8 * gmx_restrict x1, _fjsp_v2r8 * gmx_restrict y1, _fjsp_v2r8 * gmx_restrict z1,
+                                   _fjsp_v2r8 * gmx_restrict x2, _fjsp_v2r8 * gmx_restrict y2, _fjsp_v2r8 * gmx_restrict z2,
+                                   _fjsp_v2r8 * gmx_restrict x3, _fjsp_v2r8 * gmx_restrict y3, _fjsp_v2r8 * gmx_restrict z3)
+ {
+     const _fjsp_v2r8 zero = _fjsp_setzero_v2r8();
+
+     *x1 = _fjsp_loadl_v2r8(zero,p1);
+     *y1 = _fjsp_loadl_v2r8(zero,p1+1);
+     *z1 = _fjsp_loadl_v2r8(zero,p1+2);
+     *x2 = _fjsp_loadl_v2r8(zero,p1+3);
+     *y2 = _fjsp_loadl_v2r8(zero,p1+4);
+     *z2 = _fjsp_loadl_v2r8(zero,p1+5);
+     *x3 = _fjsp_loadl_v2r8(zero,p1+6);
+     *y3 = _fjsp_loadl_v2r8(zero,p1+7);
+     *z3 = _fjsp_loadl_v2r8(zero,p1+8);
+ }
+ /* Load twelve consecutive doubles p1[0..11] into x1,y1,z1,x2,...,z4 in
+  * that order, each with _fjsp_loadl_v2r8() onto a zeroed vector.
+  */
+ static gmx_inline void
+ gmx_fjsp_load_4rvec_1ptr_swizzle_v2r8(const double * gmx_restrict p1,
+                                   _fjsp_v2r8 * gmx_restrict x1, _fjsp_v2r8 * gmx_restrict y1, _fjsp_v2r8 * gmx_restrict z1,
+                                   _fjsp_v2r8 * gmx_restrict x2, _fjsp_v2r8 * gmx_restrict y2, _fjsp_v2r8 * gmx_restrict z2,
+                                   _fjsp_v2r8 * gmx_restrict x3, _fjsp_v2r8 * gmx_restrict y3, _fjsp_v2r8 * gmx_restrict z3,
+                                   _fjsp_v2r8 * gmx_restrict x4, _fjsp_v2r8 * gmx_restrict y4, _fjsp_v2r8 * gmx_restrict z4)
+ {
+     const _fjsp_v2r8 zero = _fjsp_setzero_v2r8();
+
+     *x1 = _fjsp_loadl_v2r8(zero,p1);
+     *y1 = _fjsp_loadl_v2r8(zero,p1+1);
+     *z1 = _fjsp_loadl_v2r8(zero,p1+2);
+     *x2 = _fjsp_loadl_v2r8(zero,p1+3);
+     *y2 = _fjsp_loadl_v2r8(zero,p1+4);
+     *z2 = _fjsp_loadl_v2r8(zero,p1+5);
+     *x3 = _fjsp_loadl_v2r8(zero,p1+6);
+     *y3 = _fjsp_loadl_v2r8(zero,p1+7);
+     *z3 = _fjsp_loadl_v2r8(zero,p1+8);
+     *x4 = _fjsp_loadl_v2r8(zero,p1+9);
+     *y4 = _fjsp_loadl_v2r8(zero,p1+10);
+     *z4 = _fjsp_loadl_v2r8(zero,p1+11);
+ }
+ /* Load one three-double vector from ptrA and one from ptrB and regroup
+  * them by component: *x1 = unpacklo(A[0..1],B[0..1]),
+  * *y1 = unpackhi(A[0..1],B[0..1]) and *z1 = unpacklo(A[2],B[2]).
+  */
+ static gmx_inline void
+ gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(const double * gmx_restrict ptrA,
+                                   const double * gmx_restrict ptrB,
+                                   _fjsp_v2r8 * gmx_restrict x1, _fjsp_v2r8 * gmx_restrict y1, _fjsp_v2r8 * gmx_restrict z1)
+ {
+     const _fjsp_v2r8 xyA = _fjsp_load_v2r8(ptrA);
+     const _fjsp_v2r8 xyB = _fjsp_load_v2r8(ptrB);
+     const _fjsp_v2r8 zA  = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),ptrA+2);
+     const _fjsp_v2r8 zB  = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),ptrB+2);
+
+     /* The unpacklo/unpackhi pair is the two-row transpose written out */
+     *x1          = _fjsp_unpacklo_v2r8(xyA,xyB);
+     *y1          = _fjsp_unpackhi_v2r8(xyA,xyB);
+     *z1          = _fjsp_unpacklo_v2r8(zA,zB);
+ }
+ /* Load nine doubles from ptrA and nine from ptrB and regroup them by
+  * component. Each pair of two-element loads at the same offset yields two
+  * outputs (unpacklo, then unpackhi); the ninth double of each pointer is
+  * loaded on its own and combined with unpacklo into *z3.
+  */
+ static gmx_inline void
+ gmx_fjsp_load_3rvec_2ptr_swizzle_v2r8(const double * gmx_restrict ptrA, const double * gmx_restrict ptrB,
+                                   _fjsp_v2r8 * gmx_restrict x1, _fjsp_v2r8 * gmx_restrict y1, _fjsp_v2r8 * gmx_restrict z1,
+                                   _fjsp_v2r8 * gmx_restrict x2, _fjsp_v2r8 * gmx_restrict y2, _fjsp_v2r8 * gmx_restrict z2,
+                                   _fjsp_v2r8 * gmx_restrict x3, _fjsp_v2r8 * gmx_restrict y3, _fjsp_v2r8 * gmx_restrict z3)
+ {
+     const _fjsp_v2r8 a0 = _fjsp_load_v2r8(ptrA);
+     const _fjsp_v2r8 b0 = _fjsp_load_v2r8(ptrB);
+     const _fjsp_v2r8 a2 = _fjsp_load_v2r8(ptrA+2);
+     const _fjsp_v2r8 b2 = _fjsp_load_v2r8(ptrB+2);
+     const _fjsp_v2r8 a4 = _fjsp_load_v2r8(ptrA+4);
+     const _fjsp_v2r8 b4 = _fjsp_load_v2r8(ptrB+4);
+     const _fjsp_v2r8 a6 = _fjsp_load_v2r8(ptrA+6);
+     const _fjsp_v2r8 b6 = _fjsp_load_v2r8(ptrB+6);
+     const _fjsp_v2r8 a8 = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),ptrA+8);
+     const _fjsp_v2r8 b8 = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),ptrB+8);
+
+     /* Each unpacklo/unpackhi pair is one two-row transpose written out */
+     *x1          = _fjsp_unpacklo_v2r8(a0,b0);
+     *y1          = _fjsp_unpackhi_v2r8(a0,b0);
+     *z1          = _fjsp_unpacklo_v2r8(a2,b2);
+     *x2          = _fjsp_unpackhi_v2r8(a2,b2);
+     *y2          = _fjsp_unpacklo_v2r8(a4,b4);
+     *z2          = _fjsp_unpackhi_v2r8(a4,b4);
+     *x3          = _fjsp_unpacklo_v2r8(a6,b6);
+     *y3          = _fjsp_unpackhi_v2r8(a6,b6);
+     *z3          = _fjsp_unpacklo_v2r8(a8,b8);
+ }
+ /* Load twelve doubles from ptrA and twelve from ptrB and regroup them by
+  * component. The work is done in two halves of six doubles per pointer;
+  * within each half, every pair of two-element loads at the same offset
+  * yields two outputs (unpacklo, then unpackhi).
+  */
+ static gmx_inline void
+ gmx_fjsp_load_4rvec_2ptr_swizzle_v2r8(const double * gmx_restrict ptrA, const double * gmx_restrict ptrB,
+                                   _fjsp_v2r8 * gmx_restrict x1, _fjsp_v2r8 * gmx_restrict y1, _fjsp_v2r8 * gmx_restrict z1,
+                                   _fjsp_v2r8 * gmx_restrict x2, _fjsp_v2r8 * gmx_restrict y2, _fjsp_v2r8 * gmx_restrict z2,
+                                   _fjsp_v2r8 * gmx_restrict x3, _fjsp_v2r8 * gmx_restrict y3, _fjsp_v2r8 * gmx_restrict z3,
+                                   _fjsp_v2r8 * gmx_restrict x4, _fjsp_v2r8 * gmx_restrict y4, _fjsp_v2r8 * gmx_restrict z4)
+ {
+     _fjsp_v2r8 a0,b0,a1,b1,a2,b2;
+
+     /* First half: doubles 0..5 of each pointer */
+     a0           = _fjsp_load_v2r8(ptrA);
+     b0           = _fjsp_load_v2r8(ptrB);
+     a1           = _fjsp_load_v2r8(ptrA+2);
+     b1           = _fjsp_load_v2r8(ptrB+2);
+     a2           = _fjsp_load_v2r8(ptrA+4);
+     b2           = _fjsp_load_v2r8(ptrB+4);
+     *x1          = _fjsp_unpacklo_v2r8(a0,b0);
+     *y1          = _fjsp_unpackhi_v2r8(a0,b0);
+     *z1          = _fjsp_unpacklo_v2r8(a1,b1);
+     *x2          = _fjsp_unpackhi_v2r8(a1,b1);
+     *y2          = _fjsp_unpacklo_v2r8(a2,b2);
+     *z2          = _fjsp_unpackhi_v2r8(a2,b2);
+
+     /* Second half: doubles 6..11 of each pointer */
+     a0           = _fjsp_load_v2r8(ptrA+6);
+     b0           = _fjsp_load_v2r8(ptrB+6);
+     a1           = _fjsp_load_v2r8(ptrA+8);
+     b1           = _fjsp_load_v2r8(ptrB+8);
+     a2           = _fjsp_load_v2r8(ptrA+10);
+     b2           = _fjsp_load_v2r8(ptrB+10);
+     *x3          = _fjsp_unpacklo_v2r8(a0,b0);
+     *y3          = _fjsp_unpackhi_v2r8(a0,b0);
+     *z3          = _fjsp_unpacklo_v2r8(a1,b1);
+     *x4          = _fjsp_unpackhi_v2r8(a1,b1);
+     *y4          = _fjsp_unpacklo_v2r8(a2,b2);
+     *z4          = _fjsp_unpackhi_v2r8(a2,b2);
+ }
+ /* Subtract x1, y1 and z1 from the doubles at ptrA[0], ptrA[1] and ptrA[2].
+  * All three values are loaded before any result is stored; results are
+  * written with _fjsp_storel_v2r8().
+  */
+ static void
+ gmx_fjsp_decrement_1rvec_1ptr_swizzle_v2r8(double * gmx_restrict ptrA,
+                                        _fjsp_v2r8 x1, _fjsp_v2r8 y1, _fjsp_v2r8 z1)
+ {
+     _fjsp_v2r8 mx,my,mz;
+
+     mx           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),ptrA);
+     my           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),ptrA+1);
+     mz           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),ptrA+2);
+
+     mx           = _fjsp_sub_v2r8(mx,x1);
+     my           = _fjsp_sub_v2r8(my,y1);
+     mz           = _fjsp_sub_v2r8(mz,z1);
+
+     _fjsp_storel_v2r8(ptrA,mx);
+     _fjsp_storel_v2r8(ptrA+1,my);
+     _fjsp_storel_v2r8(ptrA+2,mz);
+ }
+ /* Fused variant of the one-vector decrement: each of ptrA[0..2] is
+  * replaced by _fjsp_nmsub_v2r8(fscal, d, old), with d being dx1, dy1 and
+  * dz1 respectively. All three values are loaded before any is stored.
+  */
+ static void
+ gmx_fjsp_decrement_fma_1rvec_1ptr_swizzle_v2r8(double * gmx_restrict ptrA, _fjsp_v2r8 fscal,
+                                          _fjsp_v2r8 dx1, _fjsp_v2r8 dy1, _fjsp_v2r8 dz1)
+ {
+     _fjsp_v2r8 mx,my,mz;
+
+     mx           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),ptrA);
+     my           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),ptrA+1);
+     mz           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),ptrA+2);
+
+     mx           = _fjsp_nmsub_v2r8(fscal,dx1,mx);
+     my           = _fjsp_nmsub_v2r8(fscal,dy1,my);
+     mz           = _fjsp_nmsub_v2r8(fscal,dz1,mz);
+
+     _fjsp_storel_v2r8(ptrA,mx);
+     _fjsp_storel_v2r8(ptrA+1,my);
+     _fjsp_storel_v2r8(ptrA+2,mz);
+ }
+ /* Subtract three vectors (x1..z3) from the nine consecutive doubles at
+  * ptrA. The nine inputs are packed pairwise with _fjsp_unpacklo_v2r8() so
+  * they line up with the two-element loads of ptrA; z3 is used as is for
+  * the ninth value. Results are written back one double at a time with
+  * storel/storeh.
+  */
+ static void
+ gmx_fjsp_decrement_3rvec_1ptr_swizzle_v2r8(double * gmx_restrict ptrA,
+                                        _fjsp_v2r8 x1, _fjsp_v2r8 y1, _fjsp_v2r8 z1,
+                                        _fjsp_v2r8 x2, _fjsp_v2r8 y2, _fjsp_v2r8 z2,
+                                        _fjsp_v2r8 x3, _fjsp_v2r8 y3, _fjsp_v2r8 z3)
+ {
+     _fjsp_v2r8 m01,m23,m45,m67,m8;
+     _fjsp_v2r8 d01,d23,d45,d67;
+
+     /* Current memory contents */
+     m01         = _fjsp_load_v2r8(ptrA);
+     m23         = _fjsp_load_v2r8(ptrA+2);
+     m45         = _fjsp_load_v2r8(ptrA+4);
+     m67         = _fjsp_load_v2r8(ptrA+6);
+     m8          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),ptrA+8);
+
+     /* Decrements packed in memory order; nothing to be done for z3 */
+     d01         = _fjsp_unpacklo_v2r8(x1,y1);
+     d23         = _fjsp_unpacklo_v2r8(z1,x2);
+     d45         = _fjsp_unpacklo_v2r8(y2,z2);
+     d67         = _fjsp_unpacklo_v2r8(x3,y3);
+
+     m01         = _fjsp_sub_v2r8(m01,d01);
+     m23         = _fjsp_sub_v2r8(m23,d23);
+     m45         = _fjsp_sub_v2r8(m45,d45);
+     m67         = _fjsp_sub_v2r8(m67,d67);
+     m8          = _fjsp_sub_v2r8(m8,z3);
+
+     _fjsp_storel_v2r8(ptrA,m01);
+     _fjsp_storeh_v2r8(ptrA+1,m01);
+     _fjsp_storel_v2r8(ptrA+2,m23);
+     _fjsp_storeh_v2r8(ptrA+3,m23);
+     _fjsp_storel_v2r8(ptrA+4,m45);
+     _fjsp_storeh_v2r8(ptrA+5,m45);
+     _fjsp_storel_v2r8(ptrA+6,m67);
+     _fjsp_storeh_v2r8(ptrA+7,m67);
+     _fjsp_storel_v2r8(ptrA+8,m8);
+ }
+ /* Decrement twelve consecutive doubles ptrA[0..11] (four rvecs) by the values
+  * x1,y1,z1,...,x4,y4,z4, in that memory order.
+  * Each difference is computed once and then stored element by element, the
+  * same way the 3-rvec variant of this routine does it.
+  * NOTE(review): assumes SSE2-like semantics, i.e. _fjsp_unpacklo_v2r8(a,b)
+  * packs the lower elements of a and b into one register, so only the lower
+  * element of each input is used here - confirm against the Fujitsu HPC-ACE
+  * intrinsics reference.
+  */
+ static void
+ gmx_fjsp_decrement_4rvec_1ptr_swizzle_v2r8(double * gmx_restrict ptrA,
+                                        _fjsp_v2r8 x1, _fjsp_v2r8 y1, _fjsp_v2r8 z1,
+                                        _fjsp_v2r8 x2, _fjsp_v2r8 y2, _fjsp_v2r8 z2,
+                                        _fjsp_v2r8 x3, _fjsp_v2r8 y3, _fjsp_v2r8 z3,
+                                        _fjsp_v2r8 x4, _fjsp_v2r8 y4, _fjsp_v2r8 z4)
+ {
+     _fjsp_v2r8 t1,t2,t3,t4,t5,t6;
+ 
+     t1          = _fjsp_load_v2r8(ptrA);
+     t2          = _fjsp_load_v2r8(ptrA+2);
+     t3          = _fjsp_load_v2r8(ptrA+4);
+     t4          = _fjsp_load_v2r8(ptrA+6);
+     t5          = _fjsp_load_v2r8(ptrA+8);
+     t6          = _fjsp_load_v2r8(ptrA+10);
+ 
+     /* Pair up the decrements to match the memory layout */
+     x1          = _fjsp_unpacklo_v2r8(x1,y1);
+     z1          = _fjsp_unpacklo_v2r8(z1,x2);
+     y2          = _fjsp_unpacklo_v2r8(y2,z2);
+     x3          = _fjsp_unpacklo_v2r8(x3,y3);
+     z3          = _fjsp_unpacklo_v2r8(z3,x4);
+     y4          = _fjsp_unpacklo_v2r8(y4,z4);
+ 
+     /* Subtract once per pair; the result feeds both the low and high store */
+     t1          = _fjsp_sub_v2r8(t1,x1);
+     t2          = _fjsp_sub_v2r8(t2,z1);
+     t3          = _fjsp_sub_v2r8(t3,y2);
+     t4          = _fjsp_sub_v2r8(t4,x3);
+     t5          = _fjsp_sub_v2r8(t5,z3);
+     t6          = _fjsp_sub_v2r8(t6,y4);
+ 
+     _fjsp_storel_v2r8(ptrA,    t1);
+     _fjsp_storeh_v2r8(ptrA+1,  t1);
+     _fjsp_storel_v2r8(ptrA+2,  t2);
+     _fjsp_storeh_v2r8(ptrA+3,  t2);
+     _fjsp_storel_v2r8(ptrA+4,  t3);
+     _fjsp_storeh_v2r8(ptrA+5,  t3);
+     _fjsp_storel_v2r8(ptrA+6,  t4);
+     _fjsp_storeh_v2r8(ptrA+7,  t4);
+     _fjsp_storel_v2r8(ptrA+8,  t5);
+     _fjsp_storeh_v2r8(ptrA+9,  t5);
+     _fjsp_storel_v2r8(ptrA+10, t6);
+     _fjsp_storeh_v2r8(ptrA+11, t6);
+ }
+ /* Decrement one rvec at ptrA[0..2] and one at ptrB[0..2] from a single set of
+  * SIMD registers.
+  * NOTE(review): assumes SSE2-like unpack semantics, i.e. the lower elements of
+  * x1/y1/z1 go to ptrA and the upper elements go to ptrB - confirm against the
+  * Fujitsu HPC-ACE intrinsics reference.
+  */
+ static void
+ gmx_fjsp_decrement_1rvec_2ptr_swizzle_v2r8(double * gmx_restrict ptrA, double * gmx_restrict ptrB,
+                                          _fjsp_v2r8 x1, _fjsp_v2r8 y1, _fjsp_v2r8 z1)
+ {
+     _fjsp_v2r8 fAxy,fAz,fBxy,fBz;
+     _fjsp_v2r8 dAxy,dBxy,dBz;
+ 
+     /* Current contents: an (x,y) pair plus a lone z for each pointer */
+     fAxy        = _fjsp_load_v2r8(ptrA);
+     fAz         = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),ptrA+2);
+     fBxy        = _fjsp_load_v2r8(ptrB);
+     fBz         = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),ptrB+2);
+ 
+     /* Split the decrements per pointer; z1 itself serves ptrA directly */
+     dAxy        = _fjsp_unpacklo_v2r8(x1,y1);
+     dBxy        = _fjsp_unpackhi_v2r8(x1,y1);
+     dBz         = _fjsp_unpackhi_v2r8(z1,z1);
+ 
+     fAxy        = _fjsp_sub_v2r8(fAxy,dAxy);
+     fAz         = _fjsp_sub_v2r8(fAz,z1);
+     fBxy        = _fjsp_sub_v2r8(fBxy,dBxy);
+     fBz         = _fjsp_sub_v2r8(fBz,dBz);
+ 
+     _fjsp_storel_v2r8(ptrA,fAxy);
+     _fjsp_storeh_v2r8(ptrA+1,fAxy);
+     _fjsp_storel_v2r8(ptrA+2,fAz);
+     _fjsp_storel_v2r8(ptrB,fBxy);
+     _fjsp_storeh_v2r8(ptrB+1,fBxy);
+     _fjsp_storel_v2r8(ptrB+2,fBz);
+ }
+ /* Decrement one rvec at ptrA[0..2] and one at ptrB[0..2] by fscal*(dx1,dy1,dz1),
+  * using the fused negative multiply-subtract intrinsic.
+  * NOTE(review): assumes SSE2-like unpack semantics (lower elements serve ptrA,
+  * upper elements serve ptrB) and that _fjsp_nmsub_v2r8(a,b,c) evaluates
+  * c - a*b - confirm against the Fujitsu HPC-ACE intrinsics reference.
+  */
+ static void
+ gmx_fjsp_decrement_fma_1rvec_2ptr_swizzle_v2r8(double * gmx_restrict ptrA, double * gmx_restrict ptrB,
+                                              _fjsp_v2r8 fscal, _fjsp_v2r8 dx1, _fjsp_v2r8 dy1, _fjsp_v2r8 dz1)
+ {
+     _fjsp_v2r8 fAxy,fAz,fBxy,fBz;
+     _fjsp_v2r8 dAxy,dBxy,dBz;
+     _fjsp_v2r8 scaleA,scaleB;
+ 
+     /* Current contents: an (x,y) pair plus a lone z for each pointer */
+     fAxy        = _fjsp_load_v2r8(ptrA);
+     fAz         = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),ptrA+2);
+     fBxy        = _fjsp_load_v2r8(ptrB);
+     fBz         = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),ptrB+2);
+ 
+     /* Replicate each element of the scalar across a full register */
+     scaleA      = _fjsp_unpacklo_v2r8(fscal,fscal);
+     scaleB      = _fjsp_unpackhi_v2r8(fscal,fscal);
+ 
+     /* Split the displacements per pointer; dz1 itself serves ptrA directly */
+     dAxy        = _fjsp_unpacklo_v2r8(dx1,dy1);
+     dBxy        = _fjsp_unpackhi_v2r8(dx1,dy1);
+     dBz         = _fjsp_unpackhi_v2r8(dz1,dz1);
+ 
+     fAxy        = _fjsp_nmsub_v2r8(scaleA,dAxy,fAxy);
+     fAz         = _fjsp_nmsub_v2r8(scaleA,dz1,fAz);
+     fBxy        = _fjsp_nmsub_v2r8(scaleB,dBxy,fBxy);
+     fBz         = _fjsp_nmsub_v2r8(scaleB,dBz,fBz);
+ 
+     _fjsp_storel_v2r8(ptrA,fAxy);
+     _fjsp_storeh_v2r8(ptrA+1,fAxy);
+     _fjsp_storel_v2r8(ptrA+2,fAz);
+     _fjsp_storel_v2r8(ptrB,fBxy);
+     _fjsp_storeh_v2r8(ptrB+1,fBxy);
+     _fjsp_storel_v2r8(ptrB+2,fBz);
+ }
+ /* Decrement three rvecs at ptrA[0..8] and three at ptrB[0..8] from a single
+  * set of SIMD registers x1..z3.
+  * NOTE(review): assumes SSE2-like unpack semantics, i.e. the lower elements of
+  * the inputs are applied to ptrA and the upper elements to ptrB - confirm
+  * against the Fujitsu HPC-ACE intrinsics reference.
+  */
+ static void
+ gmx_fjsp_decrement_3rvec_2ptr_swizzle_v2r8(double * gmx_restrict ptrA, double * gmx_restrict ptrB,
+                                        _fjsp_v2r8 x1, _fjsp_v2r8 y1, _fjsp_v2r8 z1,
+                                        _fjsp_v2r8 x2, _fjsp_v2r8 y2, _fjsp_v2r8 z2,
+                                        _fjsp_v2r8 x3, _fjsp_v2r8 y3, _fjsp_v2r8 z3)
+ {
+     _fjsp_v2r8 fA0,fA1,fA2,fA3,fA4;
+     _fjsp_v2r8 fB0,fB1,fB2,fB3,fB4;
+     _fjsp_v2r8 dA0,dA1,dA2,dA3;
+     _fjsp_v2r8 dB0,dB1,dB2,dB3,dB4;
+ 
+     /* Current contents: four pairs plus one trailing double per pointer */
+     fA0         = _fjsp_load_v2r8(ptrA);
+     fA1         = _fjsp_load_v2r8(ptrA+2);
+     fA2         = _fjsp_load_v2r8(ptrA+4);
+     fA3         = _fjsp_load_v2r8(ptrA+6);
+     fA4         = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),ptrA+8);
+     fB0         = _fjsp_load_v2r8(ptrB);
+     fB1         = _fjsp_load_v2r8(ptrB+2);
+     fB2         = _fjsp_load_v2r8(ptrB+4);
+     fB3         = _fjsp_load_v2r8(ptrB+6);
+     fB4         = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),ptrB+8);
+ 
+     /* Pair up the decrements per pointer, following the memory layout.
+      * z3 itself serves the trailing double of ptrA.
+      */
+     dA0         = _fjsp_unpacklo_v2r8(x1,y1);
+     dB0         = _fjsp_unpackhi_v2r8(x1,y1);
+     dA1         = _fjsp_unpacklo_v2r8(z1,x2);
+     dB1         = _fjsp_unpackhi_v2r8(z1,x2);
+     dA2         = _fjsp_unpacklo_v2r8(y2,z2);
+     dB2         = _fjsp_unpackhi_v2r8(y2,z2);
+     dA3         = _fjsp_unpacklo_v2r8(x3,y3);
+     dB3         = _fjsp_unpackhi_v2r8(x3,y3);
+     dB4         = _fjsp_unpackhi_v2r8(z3,z3);
+ 
+     fA0         = _fjsp_sub_v2r8(fA0,dA0);
+     fA1         = _fjsp_sub_v2r8(fA1,dA1);
+     fA2         = _fjsp_sub_v2r8(fA2,dA2);
+     fA3         = _fjsp_sub_v2r8(fA3,dA3);
+     fA4         = _fjsp_sub_v2r8(fA4,z3);
+ 
+     fB0         = _fjsp_sub_v2r8(fB0,dB0);
+     fB1         = _fjsp_sub_v2r8(fB1,dB1);
+     fB2         = _fjsp_sub_v2r8(fB2,dB2);
+     fB3         = _fjsp_sub_v2r8(fB3,dB3);
+     fB4         = _fjsp_sub_v2r8(fB4,dB4);
+ 
+     /* Write back ptrA first, then ptrB, element by element */
+     _fjsp_storel_v2r8(ptrA,fA0);
+     _fjsp_storeh_v2r8(ptrA+1,fA0);
+     _fjsp_storel_v2r8(ptrA+2,fA1);
+     _fjsp_storeh_v2r8(ptrA+3,fA1);
+     _fjsp_storel_v2r8(ptrA+4,fA2);
+     _fjsp_storeh_v2r8(ptrA+5,fA2);
+     _fjsp_storel_v2r8(ptrA+6,fA3);
+     _fjsp_storeh_v2r8(ptrA+7,fA3);
+     _fjsp_storel_v2r8(ptrA+8,fA4);
+     _fjsp_storel_v2r8(ptrB,fB0);
+     _fjsp_storeh_v2r8(ptrB+1,fB0);
+     _fjsp_storel_v2r8(ptrB+2,fB1);
+     _fjsp_storeh_v2r8(ptrB+3,fB1);
+     _fjsp_storel_v2r8(ptrB+4,fB2);
+     _fjsp_storeh_v2r8(ptrB+5,fB2);
+     _fjsp_storel_v2r8(ptrB+6,fB3);
+     _fjsp_storeh_v2r8(ptrB+7,fB3);
+     _fjsp_storel_v2r8(ptrB+8,fB4);
+ }
+ /* Decrement four rvecs at ptrA[0..11] and four at ptrB[0..11] from a single
+  * set of SIMD registers x1..z4.
+  * NOTE(review): assumes SSE2-like unpack semantics, i.e. the lower elements of
+  * the inputs are applied to ptrA and the upper elements to ptrB - confirm
+  * against the Fujitsu HPC-ACE intrinsics reference.
+  */
+ static void
+ gmx_fjsp_decrement_4rvec_2ptr_swizzle_v2r8(double * gmx_restrict ptrA, double * gmx_restrict ptrB,
+                                        _fjsp_v2r8 x1, _fjsp_v2r8 y1, _fjsp_v2r8 z1,
+                                        _fjsp_v2r8 x2, _fjsp_v2r8 y2, _fjsp_v2r8 z2,
+                                        _fjsp_v2r8 x3, _fjsp_v2r8 y3, _fjsp_v2r8 z3,
+                                        _fjsp_v2r8 x4, _fjsp_v2r8 y4, _fjsp_v2r8 z4)
+ {
+     _fjsp_v2r8 fA0,fA1,fA2,fA3,fA4,fA5;
+     _fjsp_v2r8 fB0,fB1,fB2,fB3,fB4,fB5;
+     _fjsp_v2r8 dA0,dA1,dA2,dA3,dA4,dA5;
+     _fjsp_v2r8 dB0,dB1,dB2,dB3,dB4,dB5;
+ 
+     /* Current contents: six pairs per pointer */
+     fA0         = _fjsp_load_v2r8(ptrA);
+     fA1         = _fjsp_load_v2r8(ptrA+2);
+     fA2         = _fjsp_load_v2r8(ptrA+4);
+     fA3         = _fjsp_load_v2r8(ptrA+6);
+     fA4         = _fjsp_load_v2r8(ptrA+8);
+     fA5         = _fjsp_load_v2r8(ptrA+10);
+     fB0         = _fjsp_load_v2r8(ptrB);
+     fB1         = _fjsp_load_v2r8(ptrB+2);
+     fB2         = _fjsp_load_v2r8(ptrB+4);
+     fB3         = _fjsp_load_v2r8(ptrB+6);
+     fB4         = _fjsp_load_v2r8(ptrB+8);
+     fB5         = _fjsp_load_v2r8(ptrB+10);
+ 
+     /* Pair up the decrements per pointer, following the memory layout */
+     dA0         = _fjsp_unpacklo_v2r8(x1,y1);
+     dB0         = _fjsp_unpackhi_v2r8(x1,y1);
+     dA1         = _fjsp_unpacklo_v2r8(z1,x2);
+     dB1         = _fjsp_unpackhi_v2r8(z1,x2);
+     dA2         = _fjsp_unpacklo_v2r8(y2,z2);
+     dB2         = _fjsp_unpackhi_v2r8(y2,z2);
+     dA3         = _fjsp_unpacklo_v2r8(x3,y3);
+     dB3         = _fjsp_unpackhi_v2r8(x3,y3);
+     dA4         = _fjsp_unpacklo_v2r8(z3,x4);
+     dB4         = _fjsp_unpackhi_v2r8(z3,x4);
+     dA5         = _fjsp_unpacklo_v2r8(y4,z4);
+     dB5         = _fjsp_unpackhi_v2r8(y4,z4);
+ 
+     fA0         = _fjsp_sub_v2r8(fA0,dA0);
+     fA1         = _fjsp_sub_v2r8(fA1,dA1);
+     fA2         = _fjsp_sub_v2r8(fA2,dA2);
+     fA3         = _fjsp_sub_v2r8(fA3,dA3);
+     fA4         = _fjsp_sub_v2r8(fA4,dA4);
+     fA5         = _fjsp_sub_v2r8(fA5,dA5);
+ 
+     fB0         = _fjsp_sub_v2r8(fB0,dB0);
+     fB1         = _fjsp_sub_v2r8(fB1,dB1);
+     fB2         = _fjsp_sub_v2r8(fB2,dB2);
+     fB3         = _fjsp_sub_v2r8(fB3,dB3);
+     fB4         = _fjsp_sub_v2r8(fB4,dB4);
+     fB5         = _fjsp_sub_v2r8(fB5,dB5);
+ 
+     /* Write back ptrA first, then ptrB, element by element */
+     _fjsp_storel_v2r8(ptrA,  fA0);
+     _fjsp_storeh_v2r8(ptrA+1,fA0);
+     _fjsp_storel_v2r8(ptrA+2,fA1);
+     _fjsp_storeh_v2r8(ptrA+3,fA1);
+     _fjsp_storel_v2r8(ptrA+4,fA2);
+     _fjsp_storeh_v2r8(ptrA+5,fA2);
+     _fjsp_storel_v2r8(ptrA+6,fA3);
+     _fjsp_storeh_v2r8(ptrA+7,fA3);
+     _fjsp_storel_v2r8(ptrA+8,fA4);
+     _fjsp_storeh_v2r8(ptrA+9,fA4);
+     _fjsp_storel_v2r8(ptrA+10,fA5);
+     _fjsp_storeh_v2r8(ptrA+11,fA5);
+     _fjsp_storel_v2r8(ptrB,  fB0);
+     _fjsp_storeh_v2r8(ptrB+1,fB0);
+     _fjsp_storel_v2r8(ptrB+2,fB1);
+     _fjsp_storeh_v2r8(ptrB+3,fB1);
+     _fjsp_storel_v2r8(ptrB+4,fB2);
+     _fjsp_storeh_v2r8(ptrB+5,fB2);
+     _fjsp_storel_v2r8(ptrB+6,fB3);
+     _fjsp_storeh_v2r8(ptrB+7,fB3);
+     _fjsp_storel_v2r8(ptrB+8,fB4);
+     _fjsp_storeh_v2r8(ptrB+9,fB4);
+     _fjsp_storel_v2r8(ptrB+10,fB5);
+     _fjsp_storeh_v2r8(ptrB+11,fB5);
+ }
+ /* Reduce the i-atom force accumulators of one atom and add the result to both
+  * the force array (fptr[0..2]) and the shift-force array (fshiftptr[0..2]).
+  * Temporaries use the native _fjsp_v2r8 type, like every other routine in
+  * this file, rather than the SSE2 alias __m128d.
+  * NOTE(review): assumes SSE2-like unpack semantics, so that the unpacklo/
+  * unpackhi + add sequence sums the two SIMD elements of each component -
+  * confirm against the Fujitsu HPC-ACE intrinsics reference.
+  */
+ static gmx_inline void
+ gmx_fjsp_update_iforce_1atom_swizzle_v2r8(_fjsp_v2r8 fix1, _fjsp_v2r8 fiy1, _fjsp_v2r8 fiz1,
+                                       double * gmx_restrict fptr,
+                                       double * gmx_restrict fshiftptr)
+ {
+     _fjsp_v2r8 t1,t4;
+ 
+     /* transpose data */
+     t1 = fix1;
+     fix1 = _fjsp_unpacklo_v2r8(fix1,fiy1); /* y0 x0 */
+     fiy1 = _fjsp_unpackhi_v2r8(t1,fiy1);   /* y1 x1 */
+ 
+     /* After the adds, fix1 holds the x and y sums and the lower element of
+      * fiz1 holds the z sum.
+      */
+     fix1 = _fjsp_add_v2r8(fix1,fiy1);
+     fiz1 = _fjsp_add_v2r8( fiz1, _fjsp_unpackhi_v2r8(fiz1,fiz1 ));
+ 
+     /* Accumulate into the force array */
+     t4 = _fjsp_add_v2r8( _fjsp_load_v2r8(fptr), fix1 );
+     _fjsp_storel_v2r8( fptr, t4 );
+     _fjsp_storeh_v2r8( fptr+1, t4 );
+     _fjsp_storel_v2r8( fptr+2, _fjsp_add_v2r8( _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),fptr+2), fiz1 ));
+ 
+     /* Accumulate the same sums into the shift-force array */
+     t4 = _fjsp_add_v2r8( _fjsp_load_v2r8(fshiftptr), fix1 );
+     _fjsp_storel_v2r8( fshiftptr, t4 );
+     _fjsp_storeh_v2r8( fshiftptr+1, t4 );
+     _fjsp_storel_v2r8( fshiftptr+2, _fjsp_add_v2r8( _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),fshiftptr+2), fiz1 ));
+ }
+ /* Reduce the i-atom force accumulators of three atoms, add the per-atom sums
+  * to fptr[0..8], and add the total over all three atoms to fshiftptr[0..2].
+  * Temporaries use the native _fjsp_v2r8 type, like every other routine in
+  * this file, rather than the SSE2 alias __m128d.
+  * NOTE(review): GMX_FJSP_TRANSPOSE2_V2R8 and GMX_FJSP_SHUFFLE2 are defined
+  * outside this routine; the comments below assume they behave like their
+  * SSE2 counterparts (in-place 2x2 transpose, shuffle selector) - confirm.
+  */
+ static gmx_inline void
+ gmx_fjsp_update_iforce_3atom_swizzle_v2r8(_fjsp_v2r8 fix1, _fjsp_v2r8 fiy1, _fjsp_v2r8 fiz1,
+                                       _fjsp_v2r8 fix2, _fjsp_v2r8 fiy2, _fjsp_v2r8 fiz2,
+                                       _fjsp_v2r8 fix3, _fjsp_v2r8 fiy3, _fjsp_v2r8 fiz3,
+                                       double * gmx_restrict fptr,
+                                       double * gmx_restrict fshiftptr)
+ {
+     _fjsp_v2r8 t1,t2,t3,t4,t5,t6;
+ 
+     /* transpose data */
+     GMX_FJSP_TRANSPOSE2_V2R8(fix1,fiy1);
+     GMX_FJSP_TRANSPOSE2_V2R8(fiz1,fix2);
+     GMX_FJSP_TRANSPOSE2_V2R8(fiy2,fiz2);
+     t1 = fix3;
+     fix3 = _fjsp_unpacklo_v2r8(fix3,fiy3); /* y0 x0 */
+     fiy3 = _fjsp_unpackhi_v2r8(t1,fiy3);   /* y1 x1 */
+ 
+     /* Pairwise sums, laid out in memory order: (x1,y1) (z1,x2) (y2,z2) (x3,y3) z3 */
+     fix1 = _fjsp_add_v2r8(fix1,fiy1);
+     fiz1 = _fjsp_add_v2r8(fiz1,fix2);
+     fiy2 = _fjsp_add_v2r8(fiy2,fiz2);
+ 
+     fix3 = _fjsp_add_v2r8(fix3,fiy3);
+     fiz3 = _fjsp_add_v2r8( fiz3, _fjsp_unpackhi_v2r8(fiz3,fiz3));
+ 
+     /* Accumulate into the force array */
+     t3 = _fjsp_add_v2r8( _fjsp_load_v2r8(fptr), fix1 );
+     t4 = _fjsp_add_v2r8( _fjsp_load_v2r8(fptr+2), fiz1 );
+     t5 = _fjsp_add_v2r8( _fjsp_load_v2r8(fptr+4), fiy2 );
+     t6 = _fjsp_add_v2r8( _fjsp_load_v2r8(fptr+6), fix3 );
+     _fjsp_storel_v2r8( fptr,   t3 );
+     _fjsp_storeh_v2r8( fptr+1, t3 );
+     _fjsp_storel_v2r8( fptr+2, t4 );
+     _fjsp_storeh_v2r8( fptr+3, t4 );
+     _fjsp_storel_v2r8( fptr+4, t5 );
+     _fjsp_storeh_v2r8( fptr+5, t5 );
+     _fjsp_storel_v2r8( fptr+6, t6 );
+     _fjsp_storeh_v2r8( fptr+7, t6 );
+     _fjsp_storel_v2r8( fptr+8, _fjsp_add_v2r8( _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),fptr+8), fiz3 ));
+ 
+     /* Sum over the three atoms for the shift force */
+     fix1 = _fjsp_add_v2r8(fix1,fix3);
+     t1   = _fjsp_shuffle_v2r8(fiz1,fiy2,GMX_FJSP_SHUFFLE2(0,1));
+     fix1 = _fjsp_add_v2r8(fix1,t1); /* x and y sums */
+ 
+     t2   = _fjsp_shuffle_v2r8(fiy2,fiy2,GMX_FJSP_SHUFFLE2(1,1));
+     fiz1 = _fjsp_add_v2r8(fiz1,fiz3);
+     fiz1 = _fjsp_add_v2r8(fiz1,t2); /* z sum */
+ 
+     t3 = _fjsp_add_v2r8( _fjsp_load_v2r8(fshiftptr), fix1 );
+     _fjsp_storel_v2r8( fshiftptr, t3 );
+     _fjsp_storeh_v2r8( fshiftptr+1, t3 );
+     _fjsp_storel_v2r8( fshiftptr+2, _fjsp_add_v2r8( _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),fshiftptr+2), fiz1 ));
+ }
+ /* Reduce the i-atom force accumulators of four atoms, add the per-atom sums
+  * to fptr[0..11], and add the total over all four atoms to fshiftptr[0..2].
+  * Temporaries use the native _fjsp_v2r8 type, like every other routine in
+  * this file, rather than the SSE2 alias __m128d.
+  * NOTE(review): GMX_FJSP_TRANSPOSE2_V2R8 and GMX_FJSP_SHUFFLE2 are defined
+  * outside this routine; the comments below assume they behave like their
+  * SSE2 counterparts (in-place 2x2 transpose, shuffle selector) - confirm.
+  */
+ static gmx_inline void
+ gmx_fjsp_update_iforce_4atom_swizzle_v2r8(_fjsp_v2r8 fix1, _fjsp_v2r8 fiy1, _fjsp_v2r8 fiz1,
+                                       _fjsp_v2r8 fix2, _fjsp_v2r8 fiy2, _fjsp_v2r8 fiz2,
+                                       _fjsp_v2r8 fix3, _fjsp_v2r8 fiy3, _fjsp_v2r8 fiz3,
+                                       _fjsp_v2r8 fix4, _fjsp_v2r8 fiy4, _fjsp_v2r8 fiz4,
+                                       double * gmx_restrict fptr,
+                                       double * gmx_restrict fshiftptr)
+ {
+     _fjsp_v2r8 t1,t2,t3,t4,t5,t6,t7,t8;
+ 
+     /* transpose data */
+     GMX_FJSP_TRANSPOSE2_V2R8(fix1,fiy1);
+     GMX_FJSP_TRANSPOSE2_V2R8(fiz1,fix2);
+     GMX_FJSP_TRANSPOSE2_V2R8(fiy2,fiz2);
+     GMX_FJSP_TRANSPOSE2_V2R8(fix3,fiy3);
+     GMX_FJSP_TRANSPOSE2_V2R8(fiz3,fix4);
+     GMX_FJSP_TRANSPOSE2_V2R8(fiy4,fiz4);
+ 
+     /* Pairwise sums, laid out in memory order:
+      * (x1,y1) (z1,x2) (y2,z2) (x3,y3) (z3,x4) (y4,z4)
+      */
+     fix1 = _fjsp_add_v2r8(fix1,fiy1);
+     fiz1 = _fjsp_add_v2r8(fiz1,fix2);
+     fiy2 = _fjsp_add_v2r8(fiy2,fiz2);
+     fix3 = _fjsp_add_v2r8(fix3,fiy3);
+     fiz3 = _fjsp_add_v2r8(fiz3,fix4);
+     fiy4 = _fjsp_add_v2r8(fiy4,fiz4);
+ 
+     /* Accumulate into the force array */
+     t3 = _fjsp_add_v2r8( _fjsp_load_v2r8(fptr),    fix1 );
+     t4 = _fjsp_add_v2r8( _fjsp_load_v2r8(fptr+2),  fiz1 );
+     t5 = _fjsp_add_v2r8( _fjsp_load_v2r8(fptr+4),  fiy2 );
+     t6 = _fjsp_add_v2r8( _fjsp_load_v2r8(fptr+6),  fix3 );
+     t7 = _fjsp_add_v2r8( _fjsp_load_v2r8(fptr+8),  fiz3 );
+     t8 = _fjsp_add_v2r8( _fjsp_load_v2r8(fptr+10), fiy4 );
+     _fjsp_storel_v2r8( fptr,    t3 );
+     _fjsp_storeh_v2r8( fptr+1,  t3 );
+     _fjsp_storel_v2r8( fptr+2,  t4 );
+     _fjsp_storeh_v2r8( fptr+3,  t4 );
+     _fjsp_storel_v2r8( fptr+4,  t5 );
+     _fjsp_storeh_v2r8( fptr+5,  t5 );
+     _fjsp_storel_v2r8( fptr+6,  t6 );
+     _fjsp_storeh_v2r8( fptr+7,  t6 );
+     _fjsp_storel_v2r8( fptr+8,  t7 );
+     _fjsp_storeh_v2r8( fptr+9,  t7 );
+     _fjsp_storel_v2r8( fptr+10, t8 );
+     _fjsp_storeh_v2r8( fptr+11, t8 );
+ 
+     /* Sum over the four atoms for the shift force */
+     t1 = _fjsp_shuffle_v2r8(fiz1,fiy2,GMX_FJSP_SHUFFLE2(0,1));
+     fix1 = _fjsp_add_v2r8(fix1,t1);
+     t2 = _fjsp_shuffle_v2r8(fiz3,fiy4,GMX_FJSP_SHUFFLE2(0,1));
+     fix3 = _fjsp_add_v2r8(fix3,t2);
+     fix1 = _fjsp_add_v2r8(fix1,fix3); /* x and y sums */
+ 
+     fiz1 = _fjsp_add_v2r8(fiz1, _fjsp_unpackhi_v2r8(fiy2,fiy2));
+     fiz3 = _fjsp_add_v2r8(fiz3, _fjsp_unpackhi_v2r8(fiy4,fiy4));
+     fiz1 = _fjsp_add_v2r8(fiz1,fiz3); /* z sum */
+ 
+     t3 = _fjsp_add_v2r8( _fjsp_load_v2r8(fshiftptr), fix1 );
+     _fjsp_storel_v2r8( fshiftptr, t3 );
+     _fjsp_storeh_v2r8( fshiftptr+1, t3 );
+     _fjsp_storel_v2r8( fshiftptr+2, _fjsp_add_v2r8( _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),fshiftptr+2), fiz1 ));
+ }
+ /* Add the horizontal sum of pot1 to the double stored at ptrA.
+  * NOTE(review): assumes SSE2-like semantics, i.e. unpackhi(pot1,pot1)
+  * replicates the upper element so that the lower element of the sum holds
+  * lower+upper - confirm against the Fujitsu HPC-ACE intrinsics reference.
+  */
+ static gmx_inline void
+ gmx_fjsp_update_1pot_v2r8(_fjsp_v2r8 pot1, double * gmx_restrict ptrA)
+ {
+     _fjsp_v2r8 upper,total;
+ 
+     upper = _fjsp_unpackhi_v2r8(pot1,pot1);
+     total = _fjsp_add_v2r8(pot1,upper);
+     total = _fjsp_add_v2r8(total,_fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),ptrA));
+     _fjsp_storel_v2r8(ptrA,total);
+ }
+ /* Add the horizontal sum of pot1 to *ptrA and the horizontal sum of pot2 to
+  * *ptrB.
+  * NOTE(review): GMX_FJSP_TRANSPOSE2_V2R8 is defined outside this routine and
+  * is assumed to transpose its two arguments in place, so that the add that
+  * follows yields (sum(pot1), sum(pot2)) - confirm.
+  */
+ static gmx_inline void
+ gmx_fjsp_update_2pot_v2r8(_fjsp_v2r8 pot1, double * gmx_restrict ptrA,
+                       _fjsp_v2r8 pot2, double * gmx_restrict ptrB)
+ {
+     _fjsp_v2r8 sumA,sumB;
+ 
+     GMX_FJSP_TRANSPOSE2_V2R8(pot1,pot2);
+     sumA = _fjsp_add_v2r8(pot1,pot2);
+     /* Bring the second sum down to the lower element for the storel below */
+     sumB = _fjsp_unpackhi_v2r8(sumA,sumA);
+ 
+     sumA = _fjsp_add_v2r8(sumA,_fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),ptrA));
+     _fjsp_storel_v2r8(ptrA,sumA);
+     sumB = _fjsp_add_v2r8(sumB,_fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),ptrB));
+     _fjsp_storel_v2r8(ptrB,sumB);
+ }
+ #endif /* _kernelutil_sparc64_hpc_ace_double_h_ */
index 0000000000000000000000000000000000000000,9b723bda3ac6375f661a51935a44f04331c36dff..9b723bda3ac6375f661a51935a44f04331c36dff
mode 000000,100755..100755
--- /dev/null
@@@ -1,0 -1,538 +1,538 @@@
+ #!/usr/bin/python
+ #
+ # This file is part of the GROMACS molecular simulation package.
+ #
+ # Copyright (c) 2012, by the GROMACS development team, led by
+ # David van der Spoel, Berk Hess, Erik Lindahl, and including many
+ # others, as listed in the AUTHORS file in the top-level source
+ # directory and at http://www.gromacs.org.
+ #
+ # GROMACS is free software; you can redistribute it and/or
+ # modify it under the terms of the GNU Lesser General Public License
+ # as published by the Free Software Foundation; either version 2.1
+ # of the License, or (at your option) any later version.
+ #
+ # GROMACS is distributed in the hope that it will be useful,
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ # Lesser General Public License for more details.
+ #
+ # You should have received a copy of the GNU Lesser General Public
+ # License along with GROMACS; if not, see
+ # http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ # Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ #
+ # If you want to redistribute modifications to GROMACS, please
+ # consider that scientific software is very special. Version
+ # control is crucial - bugs must be traceable. We will be happy to
+ # consider code for inclusion in the official distribution, but
+ # derived work must not be called official GROMACS. Details are found
+ # in the README & COPYING files - if they are missing, get the
+ # official version at http://www.gromacs.org.
+ #
+ # To help us fund GROMACS development, we humbly ask that you cite
+ # the research papers on the package. Check out http://www.gromacs.org
+ import sys
+ import os
+ sys.path.append ( "../preprocessor" )
+ from gmxpreprocess import gmxpreprocess
+ # "The happiest programs are programs that write other programs."
+ #
+ #
+ # This script controls the generation of Gromacs nonbonded kernels.
+ #
+ # We no longer generate kernels on-the-fly, so this file is not run
+ # during a Gromacs compile - only when we need to update the kernels (=rarely).
+ #
+ # To maximize performance, each combination of interactions in Gromacs
+ # has a separate nonbonded kernel without conditionals in the code.
+ # To avoid writing hundreds of different routines for each architecture,
+ # we instead use a custom preprocessor so we can encode the conditionals
+ # and expand for-loops (e.g., for water-water interactions)
+ # from a general kernel template. While that file will contain quite a
+ # few preprocessor directives, it is still an order of magnitude easier
+ # to maintain than ~200 different kernels (not to mention it avoids bugs).
+ #
+ # To actually generate the kernels, this program iteratively calls the
+ # preprocessor with different define settings corresponding to all
+ # combinations of coulomb/van-der-Waals/geometry options.
+ #
+ # A main goal in the design was to make this new generator _general_. For
+ # this reason we have used a lot of different fields to identify a particular
+ # kernel and interaction. Basically, each kernel will have a name like
+ #
+ # nbkernel_ElecXX_VdwYY_GeomZZ_VF_QQ()
+ #
+ # Where XX/YY/ZZ/VF are strings to identify what the kernel computes.
+ #
+ # Elec/Vdw describe the type of interaction for electrostatics and van der Waals.
+ # The geometry settings correspond e.g. to water-water or water-particle kernels,
+ # and finally the VF setting is V,F,or VF depending on whether we calculate
+ # only the potential, only the force, or both of them. The final string (QQ)
+ # is the architecture/language/optimization of the kernel.
+ #
+ # Architecture tag: appended to every kernel name and kernel file name built
+ # by the functions below, and embedded in the generated-file header text.
+ Arch       = 'sparc64_hpc_ace_double'
+ # Explanation of the 'properties':
+ #
+ # It is cheap to compute r^2, and the kernels require various other functions of r for
+ # different kinds of interaction. Depending on the needs of the kernel and the available
+ # processor instructions, this will be done in different ways.
+ #
+ # 'rinv' means we need 1/r, which is calculated as 1/sqrt(r^2).
+ # 'rinvsq' means we need 1/(r*r). This is calculated as rinv*rinv if we already did rinv, otherwise 1/r^2.
+ # 'r' is similarly calculated as r^2*rinv when needed
+ # 'table' means the interaction is tabulated, in which case we will calculate a table index before the interaction
+ # 'shift' means the interaction will be modified by a constant to make it zero at the cutoff.
+ # 'cutoff' means the interaction is set to 0.0 outside the cutoff
+ #
+ # C comment block (license text plus a "generated by the <Arch> kernel generator"
+ # note) kept as a single string; presumably written at the top of every
+ # generated kernel file - the code that emits it is outside this excerpt.
+ FileHeader = \
+ '/*\n' \
+ ' * This file is part of the GROMACS molecular simulation package.\n' \
+ ' *\n' \
+ ' * Copyright (c) 2012, by the GROMACS development team, led by\n' \
+ ' * David van der Spoel, Berk Hess, Erik Lindahl, and including many\n' \
+ ' * others, as listed in the AUTHORS file in the top-level source\n' \
+ ' * directory and at http://www.gromacs.org.\n' \
+ ' *\n' \
+ ' * GROMACS is free software; you can redistribute it and/or\n' \
+ ' * modify it under the terms of the GNU Lesser General Public License\n' \
+ ' * as published by the Free Software Foundation; either version 2.1\n' \
+ ' * of the License, or (at your option) any later version.\n' \
+ ' *\n' \
+ ' * GROMACS is distributed in the hope that it will be useful,\n' \
+ ' * but WITHOUT ANY WARRANTY; without even the implied warranty of\n' \
+ ' * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU\n' \
+ ' * Lesser General Public License for more details.\n' \
+ ' *\n' \
+ ' * You should have received a copy of the GNU Lesser General Public\n' \
+ ' * License along with GROMACS; if not, see\n' \
+ ' * http://www.gnu.org/licenses, or write to the Free Software Foundation,\n' \
+ ' * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.\n' \
+ ' *\n' \
+ ' * If you want to redistribute modifications to GROMACS, please\n' \
+ ' * consider that scientific software is very special. Version\n' \
+ ' * control is crucial - bugs must be traceable. We will be happy to\n' \
+ ' * consider code for inclusion in the official distribution, but\n' \
+ ' * derived work must not be called official GROMACS. Details are found\n' \
+ ' * in the README & COPYING files - if they are missing, get the\n' \
+ ' * official version at http://www.gromacs.org.\n' \
+ ' *\n' \
+ ' * To help us fund GROMACS development, we humbly ask that you cite\n' \
+ ' * the research papers on the package. Check out http://www.gromacs.org.\n' \
+ ' */\n' \
+ '/*\n' \
+ ' * Note: this file was generated by the GROMACS '+Arch+' kernel generator.\n' \
+ ' */\n'
+ ###############################################
+ # ELECTROSTATICS
+ # Interactions and flags for them
+ ###############################################
+ # Maps each electrostatics interaction name to the list of 'properties'
+ # (functions of r, table lookup, ...) that the interaction needs.
+ ElectrostaticsList = {
+     'None'                    : [],
+     'Coulomb'                 : ['rinv','rinvsq'],
+     'ReactionField'           : ['rinv','rinvsq'],
+     'GeneralizedBorn'         : ['rinv','r'],
+     'CubicSplineTable'        : ['rinv','r','table'],
+     'Ewald'                   : ['rinv','rinvsq','r'],
+ }
+ ###############################################
+ # VAN DER WAALS
+ # Interactions and flags for them
+ ###############################################
+ # Maps each van der Waals interaction name to the list of 'properties'
+ # that the interaction needs.
+ # NOTE(review): the 'sse4.1' wording on the disabled Buckingham entry looks
+ # inherited from another architecture's generator (Arch here is
+ # 'sparc64_hpc_ace_double') - confirm and reword if so.
+ VdwList = {
+     'None'                    : [],
+     'LennardJones'            : ['rinvsq'],
+ #    'Buckingham'              : ['rinv','rinvsq','r'], # Disabled for sse4.1 to reduce number of kernels and simplify the template
+     'CubicSplineTable'        : ['rinv','r','table'],
+ }
+ ###############################################
+ # MODIFIERS
+ # Different ways to adjust/modify interactions to conserve energy
+ ###############################################
+ # Maps each interaction-modifier name to the list of 'properties' it adds.
+ ModifierList = {
+     'None'                    : [],
+     'ExactCutoff'             : ['exactcutoff'],        # Zero the interaction outside the cutoff, used for reaction-field-zero
+     'PotentialShift'          : ['shift','exactcutoff'],
+     'PotentialSwitch'         : ['rinv','r','switch','exactcutoff']
+ }
+ ###############################################
+ # GEOMETRY COMBINATIONS
+ ###############################################
+ # Each entry is a two-element list of geometry names; presumably
+ # [ i-group geometry , j-group geometry ], matching how KernelGeom[0] and
+ # KernelGeom[1] are read elsewhere in this script - confirm at the call site.
+ GeometryNameList = [
+     [ 'Particle' , 'Particle' ],
+     [ 'Water3'   , 'Particle' ],
+     [ 'Water3'   , 'Water3'   ],
+     [ 'Water4'   , 'Particle' ],
+     [ 'Water4'   , 'Water4'   ]
+ ]
+ ###############################################
+ # POTENTIAL / FORCE
+ ###############################################
+ # Kernel output flavours to generate; the potential-only flavour is
+ # commented out below.
+ VFList = [
+     'PotentialAndForce',
+ # 'Potential',   # Not used yet
+     'Force'
+ ]
+ ###############################################
+ # GEOMETRY PROPERTIES
+ ###############################################
+ # Dictionaries with lists telling which interactions are present
+ # 1,2,3 means particles 1,2,3 (but not 0) have electrostatics!
+ # Geometry name -> indices of the sites that carry electrostatics.
+ # 'Water4' omits site 0, i.e. its first site has no electrostatics.
+ GeometryElectrostatics = {
+     'Particle'  : [ 0 ],
+     'Particle2' : [ 0 , 1 ],
+     'Particle3' : [ 0 , 1 , 2 ],
+     'Particle4' : [ 0 , 1 , 2 , 3 ],
+     'Water3'    : [ 0 , 1 , 2 ],
+     'Water4'    : [ 1 , 2 , 3 ]
+ }
+ # Geometry name -> indices of the sites that carry van der Waals interactions.
+ # Both water geometries list only site 0.
+ GeometryVdw = {
+     'Particle'  : [ 0 ],
+     'Particle2' : [ 0 , 1 ],
+     'Particle3' : [ 0 , 1 , 2 ],
+     'Particle4' : [ 0 , 1 , 2 , 3 ],
+     'Water3'    : [ 0 ],
+     'Water4'    : [ 0 ]
+ }
+ # Dictionary to abbreviate all strings (mixed from all the lists)
+ # Long option name -> short tag used when composing kernel names and
+ # kernel file names (see MakeKernelFileName/MakeKernelName).
+ Abbreviation = {
+     'None'                    : 'None',
+     'Coulomb'                 : 'Coul',
+     'Ewald'                   : 'Ew',
+     'ReactionField'           : 'RF',
+     'GeneralizedBorn'         : 'GB',
+     'CubicSplineTable'        : 'CSTab',
+     'LennardJones'            : 'LJ',
+     'Buckingham'              : 'Bham',
+     'PotentialShift'          : 'Sh',
+     'PotentialSwitch'         : 'Sw',
+     'ExactCutoff'             : 'Cut',
+     'PotentialAndForce'       : 'VF',
+     'Potential'               : 'V',
+     'Force'                   : 'F',
+     'Water3'                  : 'W3',
+     'Water4'                  : 'W4',
+     'Particle'                : 'P1',
+     'Particle2'               : 'P2',
+     'Particle3'               : 'P3',
+     'Particle4'               : 'P4'
+ }
+ ###############################################
+ # Functions
+ ###############################################
+ # Return a string with the kernel name from current settings
+ def MakeKernelFileName(KernelElec,KernelElecMod,KernelVdw,KernelVdwMod,KernelGeom):
+     ElecStr = 'Elec' + Abbreviation[KernelElec]
+     if(KernelElecMod!='None'):
+         ElecStr = ElecStr + Abbreviation[KernelElecMod]
+     VdwStr  = 'Vdw'  + Abbreviation[KernelVdw]
+     if(KernelVdwMod!='None'):
+         VdwStr = VdwStr + Abbreviation[KernelVdwMod]
+     GeomStr = 'Geom' + Abbreviation[KernelGeom[0]] + Abbreviation[KernelGeom[1]]
+     return 'nb_kernel_' + ElecStr + '_' + VdwStr + '_' + GeomStr + '_' + Arch
+ def MakeKernelName(KernelElec,KernelElecMod,KernelVdw,KernelVdwMod,KernelGeom,KernelVF):
+     ElecStr = 'Elec' + Abbreviation[KernelElec]
+     if(KernelElecMod!='None'):
+         ElecStr = ElecStr + Abbreviation[KernelElecMod]
+     VdwStr  = 'Vdw'  + Abbreviation[KernelVdw]
+     if(KernelVdwMod!='None'):
+         VdwStr = VdwStr + Abbreviation[KernelVdwMod]
+     GeomStr = 'Geom' + Abbreviation[KernelGeom[0]] + Abbreviation[KernelGeom[1]]
+     VFStr   = Abbreviation[KernelVF]
+     return 'nb_kernel_' + ElecStr + '_' + VdwStr + '_' + GeomStr + '_' + VFStr + '_' + Arch
+ # Return a string with a declaration to use for the kernel;
+ # this will be a sequence of string combinations as well as the actual function name
+ # Don't worry about field widths - that is just pretty-printing for the header!
+ def MakeKernelDecl(KernelName,KernelElec,KernelElecMod,KernelVdw,KernelVdwMod,KernelGeom,KernelOther,KernelVF):
+     KernelStr   = '\"'+KernelName+'\"'
+     ArchStr     = '\"'+Arch+'\"'
+     ElecStr     = '\"'+KernelElec+'\"'
+     ElecModStr  = '\"'+KernelElecMod+'\"'
+     VdwStr      = '\"'+KernelVdw+'\"'
+     VdwModStr   = '\"'+KernelVdwMod+'\"'
+     GeomStr     = '\"'+KernelGeom[0]+KernelGeom[1]+'\"'
+     OtherStr    = '\"'+KernelOther+'\"'
+     VFStr       = '\"'+KernelVF+'\"'
+     ThisSpec = ArchStr+', '+ElecStr+', '+ElecModStr+', '+VdwStr+', '+VdwModStr+', '+GeomStr+', '+OtherStr+', '+VFStr
+     ThisDecl = '    { '+KernelName+', '+KernelStr+', '+ThisSpec+' }'
+     return ThisDecl
+ # Returns 1 if this kernel should be created, 0 if we should skip it
+ # This routine is not critical - it is not the end of the world if we create more kernels,
+ # but since the number is pretty large we save both space and compile-time by reducing it a bit.
+ def KeepKernel(KernelElec,KernelElecMod,KernelVdw,KernelVdwMod,KernelGeom,KernelVF):
+     # No need for kernels without interactions
+     if(KernelElec=='None' and KernelVdw=='None'):
+         return 0
+     # No need for modifiers without interactions
+     if((KernelElec=='None' and KernelElecMod!='None') or (KernelVdw=='None' and KernelVdwMod!='None')):
+         return 0
+     # No need for LJ-only water optimization, or water optimization with implicit solvent.
+     if('Water' in KernelGeom[0] and (KernelElec=='None' or 'GeneralizedBorn' in KernelElec)):
+         return 0
+     # Non-matching table settings are pointless
+     if( ('Table' in KernelElec) and ('Table' in KernelVdw) and KernelElec!=KernelVdw ):
+         return 0
+     # Try to reduce the number of different switch/shift options to get a reasonable number of kernels
+     # For electrostatics, reaction-field can use 'exactcutoff', and ewald can use switch or shift.
+     if(KernelElecMod=='ExactCutoff' and KernelElec!='ReactionField'):
+         return 0
+     if(KernelElecMod in ['PotentialShift','PotentialSwitch'] and KernelElec!='Ewald'):
+         return 0
+     # For Vdw, we support switch and shift for Lennard-Jones/Buckingham
+     if((KernelVdwMod=='ExactCutoff') or
+        (KernelVdwMod in ['PotentialShift','PotentialSwitch'] and KernelVdw not in ['LennardJones','Buckingham'])):
+         return 0
+     # Choose either switch or shift and don't mix them...
+     if((KernelElecMod=='PotentialShift' and KernelVdwMod=='PotentialSwitch') or
+        (KernelElecMod=='PotentialSwitch' and KernelVdwMod=='PotentialShift')):
+         return 0
+     # Don't use a Vdw kernel with a modifier if the electrostatics one does not have one
+     if(KernelElec!='None' and KernelElecMod=='None' and KernelVdwMod!='None'):
+         return 0
+     # Don't use an electrostatics kernel with a modifier if the vdw one does not have one,
+     # unless the electrostatics one is reaction-field with exact cutoff.
+     if(KernelVdw!='None' and KernelVdwMod=='None' and KernelElecMod!='None'):
+         if(KernelElec=='ReactionField' and KernelVdw!='CubicSplineTable'):
+             return 0
+         elif(KernelElec!='ReactionField'):
+             return 0
+     return 1
+ #
+ # The preprocessor will automatically expand the interactions for water and other
+ # geometries inside the kernel, but to get this right we need to set up a couple
+ # of defines - we do them in a separate routine to keep the main loop clean.
+ #
+ # While this routine might look a bit complex it is actually quite straightforward,
+ # and the best news is that you won't have to modify _anything_ for a new geometry
+ # as long as you correctly define its Electrostatics/Vdw geometry in the lists above!
+ #
+ def SetDefines(KernelElec,KernelElecMod,KernelVdw,KernelVdwMod,KernelGeom,KernelVF,defines):
+     # What is the _name_ for the i/j group geometry?
+     igeometry            = KernelGeom[0]
+     jgeometry            = KernelGeom[1]
+     # define so we can access it in the source when the preprocessor runs
+     defines['GEOMETRY_I'] = igeometry
+     defines['GEOMETRY_J'] = jgeometry
+     # For the i/j groups, extract a python list of which sites have electrostatics
+     # For SPC/TIP3p this will be [1,1,1], while TIP4p (no elec on first site) will be [0,1,1,1]
+     ielec                = GeometryElectrostatics[igeometry]
+     jelec                = GeometryElectrostatics[jgeometry]
+     # Zero out the corresponding lists in case we dont do Elec
+     if(KernelElec=='None'):
+         ielec = []
+         jelec = []
+     # Extract similar interaction lists for Vdw interactions (example for SPC: [1,0,0])
+     iVdw                 = GeometryVdw[igeometry]
+     jVdw                 = GeometryVdw[jgeometry]
+     # Zero out the corresponding lists in case we dont do Vdw
+     if(KernelVdw=='None'):
+         iVdw = []
+         jVdw = []
+     # iany[] and jany[] contains lists of the particles actually used (for interactions) in this kernel
+     iany = list(set(ielec+iVdw))  # convert to+from set to make elements unique
+     jany = list(set(jelec+jVdw))
+     defines['PARTICLES_ELEC_I'] = ielec
+     defines['PARTICLES_ELEC_J'] = jelec
+     defines['PARTICLES_VDW_I']  = iVdw
+     defines['PARTICLES_VDW_J']  = jVdw
+     defines['PARTICLES_I']      = iany
+     defines['PARTICLES_J']      = jany
+     # elecij,Vdwij are sets with pairs of particles for which the corresponding interaction is done
+     # (and anyij again corresponds to either electrostatics or Vdw)
+     elecij = []
+     Vdwij  = []
+     anyij  = []
+     for i in ielec:
+         for j in jelec:
+             elecij.append([i,j])
+     for i in iVdw:
+         for j in jVdw:
+             Vdwij.append([i,j])
+     for i in iany:
+         for j in jany:
+             if [i,j] in elecij or [i,j] in Vdwij:
+                 anyij.append([i,j])
+     defines['PAIRS_IJ']     = anyij
+     # Make an 2d list-of-distance-properties-to-calculate for i,j
+     ni = max(iany)+1
+     nj = max(jany)+1
+     # Each element properties[i][j] is an empty list
+     properties = [ [ [] for j in range(0,nj) ] for i in range (0,ni) ]
+     # Add properties to each set
+     for i in range(0,ni):
+         for j in range(0,nj):
+             if [i,j] in elecij:
+                 properties[i][j] = properties[i][j] + ['electrostatics'] + ElectrostaticsList[KernelElec] + ModifierList[KernelElecMod]
+             if [i,j] in Vdwij:
+                 properties[i][j] = properties[i][j] + ['vdw'] + VdwList[KernelVdw] + ModifierList[KernelVdwMod]
+             # Add rinv if we need r
+             if 'r' in properties[i][j]:
+                 properties[i][j] = properties[i][j] + ['rinv']
+             # Add rsq if we need rinv or rinsq
+             if 'rinv' in properties[i][j] or 'rinvsq' in properties[i][j]:
+                 properties[i][j] = properties[i][j] + ['rsq']
+     defines['INTERACTION_FLAGS']    = properties
+ def PrintStatistics(ratio):
+     ratio = 100.0*ratio
+     print '\rGenerating %s nonbonded kernels... %5.1f%%' % (Arch,ratio),
+     sys.stdout.flush()
+ defines = {}  # settings handed to gmxpreprocess; entries are overwritten as the loops below advance
+ kerneldecl = []  # one MakeKernelDecl() entry per generated kernel, written out as a table at the end
+ cnt     = 0.0  # combinations visited so far; a float, so cnt/ntot below is a fractional ratio
+ nelec   = len(ElectrostaticsList)
+ nVdw    = len(VdwList)
+ nmod    = len(ModifierList)
+ ngeom   = len(GeometryNameList)
+ ntot    = nelec*nmod*nVdw*nmod*ngeom  # iterations of the five nested loops below (nmod appears twice: elec and Vdw modifiers)
+ numKernels = 0  # incremented once per kernel actually generated
+ fpdecl = open('nb_kernel_' + Arch + '.c','w')  # NOTE(review): opened with a '.c' suffix although the contents get an include guard named ..._h - confirm intended
+ fpdecl.write( FileHeader )
+ fpdecl.write( '#ifndef nb_kernel_' + Arch + '_h\n' )
+ fpdecl.write( '#define nb_kernel_' + Arch + '_h\n\n' )
+ fpdecl.write( '#include "../nb_kernel.h"\n\n' )
+ for KernelElec in ElectrostaticsList:
+     defines['KERNEL_ELEC'] = KernelElec
+     for KernelElecMod in ModifierList:
+         defines['KERNEL_MOD_ELEC'] = KernelElecMod
+         for KernelVdw in VdwList:
+             defines['KERNEL_VDW'] = KernelVdw
+             for KernelVdwMod in ModifierList:
+                 defines['KERNEL_MOD_VDW'] = KernelVdwMod
+                 for KernelGeom in GeometryNameList:
+                     cnt += 1
+                     KernelFilename = MakeKernelFileName(KernelElec,KernelElecMod,KernelVdw,KernelVdwMod,KernelGeom) + '.c'  # one file per combination; all its VF variants are appended to it
+                     fpkernel = open(KernelFilename,'w')
+                     defines['INCLUDE_HEADER'] = 1  # Include header first time in new file
+                     DoHeader = 1  # FileHeader is written only before the first kernel kept for this file
+                     for KernelVF in VFList:
+                         KernelName = MakeKernelName(KernelElec,KernelElecMod,KernelVdw,KernelVdwMod,KernelGeom,KernelVF)
+                         defines['KERNEL_NAME'] = KernelName
+                         defines['KERNEL_VF']   = KernelVF
+                         # Check if this is a valid/sane/usable combination; rejected ones write nothing to the kernel file
+                         if not KeepKernel(KernelElec,KernelElecMod,KernelVdw,KernelVdwMod,KernelGeom,KernelVF):
+                             continue;
+                         # The overall kernel settings determine what the _kernel_ calculates, but for the water
+                         # kernels this does not mean that every pairwise interaction has e.g. Vdw interactions.
+                         # This routine sets defines of what to calculate for each pair of particles in those cases.
+                         SetDefines(KernelElec,KernelElecMod,KernelVdw,KernelVdwMod,KernelGeom,KernelVF,defines)
+                         if(DoHeader==1):
+                             fpkernel.write( FileHeader )
+                         gmxpreprocess('nb_kernel_template_' + Arch + '.pre', KernelName+'.tmp' , defines, force=1,contentType='C')  # expands the template into a temporary file named after the kernel
+                         numKernels = numKernels + 1
+                         defines['INCLUDE_HEADER'] = 0   # Header has been included once now
+                         DoHeader=0
+                         # Append temp file contents to the common kernelfile, then delete the temp file
+                         fptmp = open(KernelName+'.tmp','r')
+                         fpkernel.writelines(fptmp.readlines())
+                         fptmp.close()
+                         os.remove(KernelName+'.tmp')
+                         # Add a declaration for this kernel to the declaration file
+                         fpdecl.write('nb_kernel_t ' + KernelName + ';\n');
+                         # Add declaration to the buffer (KernelOther is always passed as an empty string here)
+                         KernelOther=''
+                         kerneldecl.append(MakeKernelDecl(KernelName,KernelElec,KernelElecMod,KernelVdw,KernelVdwMod,KernelGeom,KernelOther,KernelVF))
+                     filesize = fpkernel.tell()  # stays 0 when every VF variant was rejected by KeepKernel
+                     fpkernel.close()
+                     if(filesize==0):
+                         os.remove(KernelFilename)  # do not leave empty kernel files behind
+                     PrintStatistics(cnt/ntot)
+                 pass  # no-op; this and the 'pass' lines below only mark where each loop level ends
+             pass
+         pass
+     pass
+ pass
+ # Write out the list of settings and corresponding kernels to the declaration file.
+ # Every entry but the last is followed by a comma; kerneldecl[-1] assumes at least one kernel was generated.
+ fpdecl.write( '\n\n' )
+ fpdecl.write( 'nb_kernel_info_t\n' )
+ fpdecl.write( 'kernellist_'+Arch+'[] =\n' )
+ fpdecl.write( '{\n' )
+ for decl in kerneldecl[0:-1]:
+     fpdecl.write( decl + ',\n' )
+ fpdecl.write( kerneldecl[-1] + '\n' )
+ fpdecl.write( '};\n\n' )
+ fpdecl.write( 'int\n' )
+ fpdecl.write( 'kernellist_'+Arch+'_size = sizeof(kernellist_'+Arch+')/sizeof(kernellist_'+Arch+'[0]);\n\n')  # emits a C expression for the element count of the table
+ fpdecl.write( '#endif\n')
+ fpdecl.close()
index 0000000000000000000000000000000000000000,4b3773de5368ef5848f9b447cdc753de90547acf..4b3773de5368ef5848f9b447cdc753de90547acf
mode 000000,100644..100644
--- /dev/null
@@@ -1,0 -1,711 +1,711 @@@
+ /*
+  * This file is part of the GROMACS molecular simulation package.
+  *
+  * Copyright (c) 2012, by the GROMACS development team, led by
+  * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+  * others, as listed in the AUTHORS file in the top-level source
+  * directory and at http://www.gromacs.org.
+  *
+  * GROMACS is free software; you can redistribute it and/or
+  * modify it under the terms of the GNU Lesser General Public License
+  * as published by the Free Software Foundation; either version 2.1
+  * of the License, or (at your option) any later version.
+  *
+  * GROMACS is distributed in the hope that it will be useful,
+  * but WITHOUT ANY WARRANTY; without even the implied warranty of
+  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  * Lesser General Public License for more details.
+  *
+  * You should have received a copy of the GNU Lesser General Public
+  * License along with GROMACS; if not, see
+  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+  *
+  * If you want to redistribute modifications to GROMACS, please
+  * consider that scientific software is very special. Version
+  * control is crucial - bugs must be traceable. We will be happy to
+  * consider code for inclusion in the official distribution, but
+  * derived work must not be called official GROMACS. Details are found
+  * in the README & COPYING files - if they are missing, get the
+  * official version at http://www.gromacs.org.
+  *
+  * To help us fund GROMACS development, we humbly ask that you cite
+  * the research papers on the package. Check out http://www.gromacs.org.
+  */
+ /*
+  * Note: this file was generated by the GROMACS sparc64_hpc_ace_double kernel generator.
+  */
+ #ifdef HAVE_CONFIG_H
+ #include <config.h>
+ #endif
+ #include <math.h>
+ #include "../nb_kernel.h"
+ #include "types/simple.h"
+ #include "vec.h"
+ #include "nrnb.h"
+ #include "kernelutil_sparc64_hpc_ace_double.h"
+ /*
+  * Gromacs nonbonded kernel:   nb_kernel_ElecCSTab_VdwCSTab_GeomP1P1_VF_sparc64_hpc_ace_double
+  * Electrostatics interaction: CubicSplineTable
+  * VdW interaction:            CubicSplineTable
+  * Geometry:                   Particle-Particle
+  * Calculate force/pot:        PotentialAndForce
+  */
+ void
+ nb_kernel_ElecCSTab_VdwCSTab_GeomP1P1_VF_sparc64_hpc_ace_double
+                     (t_nblist * gmx_restrict                nlist,       /* read: nri, iinr, jindex, jjnr, shift, gid */
+                      rvec * gmx_restrict                    xx,          /* coordinates; accessed through the real pointer x = xx[0] */
+                      rvec * gmx_restrict                    ff,          /* forces; accessed through the real pointer f = ff[0] */
+                      t_forcerec * gmx_restrict              fr,          /* read: shift_vec, fshift, epsfac, ntype, nbfp */
+                      t_mdatoms * gmx_restrict               mdatoms,     /* read: chargeA, typeA */
+                      nb_kernel_data_t * gmx_restrict        kernel_data, /* read: table_elec_vdw; energygrp_elec/energygrp_vdw go to the energy update */
+                      t_nrnb * gmx_restrict                  nrnb)        /* flop counter handed to inc_nrnb() */
+ {
+     /* Tabulated (cubic spline) electrostatics + Vdw kernel for single-particle i and j groups,
+      * evaluating both potential and force. For every i entry of the neighbor list it loops over
+      * the j neighbors two at a time, with a separate branch for an odd leftover neighbor.
+      *
+      * Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+      * just 0 for non-waters.
+      * Suffixes A,B refer to j loop unrolling done with double precision SIMD, i.e. the two different
+      * jnr indices corresponding to data put in the two positions in the SIMD register.
+      *
+      * Several of the locals declared below (e.g. tx, rcutoff, gbconv) are never referenced
+      * in this kernel.
+      */
+     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+     int              jnrA,jnrB;
+     int              j_coord_offsetA,j_coord_offsetB;
+     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+     real             rcutoff_scalar;
+     real             *shiftvec,*fshift,*x,*f;
+     _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+     int              vdwioffset0;
+     _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+     int              vdwjidx0A,vdwjidx0B;
+     _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+     _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+     _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+     real             *charge;
+     int              nvdwtype;
+     _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+     int              *vdwtype;
+     real             *vdwparam;
+     _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+     _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+     _fjsp_v2r8       rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF,twovfeps;
+     real             *vftab;
+     _fjsp_v2r8       itab_tmp;
+     _fjsp_v2r8       dummy_mask,cutoff_mask;
+     _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+     _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+     union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv; /* lets the two SIMD table indices be read back as integers */
+     x                = xx[0];
+     f                = ff[0];
+     nri              = nlist->nri;
+     iinr             = nlist->iinr;
+     jindex           = nlist->jindex;
+     jjnr             = nlist->jjnr;
+     shiftidx         = nlist->shift;
+     gid              = nlist->gid;
+     shiftvec         = fr->shift_vec[0];
+     fshift           = fr->fshift[0];
+     facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+     charge           = mdatoms->chargeA;
+     nvdwtype         = fr->ntype;
+     vdwparam         = fr->nbfp;
+     vdwtype          = mdatoms->typeA;
+     vftab            = kernel_data->table_elec_vdw->data; /* single table used for both electrostatics and Vdw */
+     vftabscale       = gmx_fjsp_set1_v2r8(kernel_data->table_elec_vdw->scale);
+     /* Avoid stupid compiler warnings */
+     jnrA = jnrB = 0;
+     j_coord_offsetA = 0;
+     j_coord_offsetB = 0;
+     outeriter        = 0;
+     inneriter        = 0;
+     /* Start outer loop over neighborlists */
+     for(iidx=0; iidx<nri; iidx++)
+     {
+         /* Load shift vector for this list */
+         i_shift_offset   = DIM*shiftidx[iidx];
+         /* Load limits for loop over neighbors */
+         j_index_start    = jindex[iidx];
+         j_index_end      = jindex[iidx+1];
+         /* Get outer coordinate index */
+         inr              = iinr[iidx];
+         i_coord_offset   = DIM*inr;
+         /* Load i particle coords and add shift vector */
+         gmx_fjsp_load_shift_and_1rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
+         fix0             = _fjsp_setzero_v2r8();
+         fiy0             = _fjsp_setzero_v2r8();
+         fiz0             = _fjsp_setzero_v2r8();
+         /* Load parameters for i particles; the i charge is pre-multiplied by facel (fr->epsfac) */
+         iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_load1_v2r8(charge+inr+0));
+         vdwioffset0      = 2*nvdwtype*vdwtype[inr+0]; /* start of this i type's row in vdwparam: two reals (c6,c12) per j type */
+         /* Reset potential sums */
+         velecsum         = _fjsp_setzero_v2r8();
+         vvdwsum          = _fjsp_setzero_v2r8();
+         /* Start inner kernel loop: handles j neighbors in pairs (A,B); an odd leftover
+          * neighbor is processed by the separate branch after this loop.
+          */
+         for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+         {
+             /* Get j neighbor index, and coordinate index */
+             jnrA             = jjnr[jidx];
+             jnrB             = jjnr[jidx+1];
+             j_coord_offsetA  = DIM*jnrA;
+             j_coord_offsetB  = DIM*jnrB;
+             /* load j atom coordinates */
+             gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                               &jx0,&jy0,&jz0);
+             /* Calculate displacement vector */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             /* Calculate squared distance and things based on it */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+             /* Load parameters for j particles */
+             jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+             vdwjidx0A        = 2*vdwtype[jnrA+0];
+             vdwjidx0B        = 2*vdwtype[jnrB+0];
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r00              = _fjsp_mul_v2r8(rsq00,rinv00); /* r = rsq * (1/r) */
+             /* Compute parameters for interactions between i and j atoms */
+             qq00             = _fjsp_mul_v2r8(iq0,jq0);
+             gmx_fjsp_load_2pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,
+                                          vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
+             /* Calculate table index by multiplying r with table scale and truncate to integer.
+              * vfeps is the fractional remainder. The integer index is scaled by 12 because, judging
+              * from the load offsets below, each table point holds 12 reals: Y,F,G,H for
+              * electrostatics (+0), dispersion (+4) and repulsion (+8).
+              */
+             rt               = _fjsp_mul_v2r8(r00,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 12;
+             vfconv.i[1]     *= 12;
+             /* CUBIC SPLINE TABLE ELECTROSTATICS
+              * Each load fetches table data for one j (index i[0] for A, i[1] for B); the transpose
+              * presumably regroups them so that Y,F (and G,H) each hold one coefficient for both j's.
+              * Assuming _fjsp_madd_v2r8(a,b,c) computes a*b+c:
+              *   Fp = F + eps*(G + eps*H),  VV = Y + eps*Fp,  FF = Fp + eps*(G + 2*eps*H)
+              */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+             H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+             VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+             velec            = _fjsp_mul_v2r8(qq00,VV);
+             FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+             felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,FF),_fjsp_mul_v2r8(vftabscale,rinv00)));
+             /* CUBIC SPLINE TABLE DISPERSION: advance both indices by 4 reals past the electrostatics entries */
+             vfconv.i[0]       += 4;
+             vfconv.i[1]       += 4;
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 2 );
+             H                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 2 );
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+             VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+             vvdw6            = _fjsp_mul_v2r8(c6_00,VV);
+             FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+             fvdw6            = _fjsp_mul_v2r8(c6_00,FF);
+             /* CUBIC SPLINE TABLE REPULSION: read at +4/+6 from the dispersion entries, indices are not advanced again */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 4 );
+             F                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 4 );
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 6 );
+             H                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 6 );
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+             VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+             vvdw12           = _fjsp_mul_v2r8(c12_00,VV);
+             FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+             fvdw12           = _fjsp_mul_v2r8(c12_00,FF);
+             vvdw             = _fjsp_add_v2r8(vvdw12,vvdw6);
+             fvdw             = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_add_v2r8(fvdw6,fvdw12),_fjsp_mul_v2r8(vftabscale,rinv00)));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+             fscal            = _fjsp_add_v2r8(felec,fvdw);
+             /* Update vectorial force: accumulate fscal*d on the i atom, and pass the same quantities to the j-atom helper */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             
+             gmx_fjsp_decrement_fma_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fscal,dx00,dy00,dz00);
+             /* Inner loop uses 76 flops */
+         }
+         if(jidx<j_index_end) /* one j neighbor left over (odd neighbor count): only the A slot carries data */
+         {
+             jnrA             = jjnr[jidx];
+             j_coord_offsetA  = DIM*jnrA;
+             /* load j atom coordinates */
+             gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                               &jx0,&jy0,&jz0);
+             /* Calculate displacement vector */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             /* Calculate squared distance and things based on it */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+             /* Load parameters for j particles */
+             jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
+             vdwjidx0A        = 2*vdwtype[jnrA+0];
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+             /* Compute parameters for interactions between i and j atoms */
+             qq00             = _fjsp_mul_v2r8(iq0,jq0);
+             gmx_fjsp_load_1pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
+             /* Calculate table index by multiplying r with table scale and truncate to integer.
+              * Both indices are scaled as in the paired loop, but only vfconv.i[0] is used for loads here.
+              */
+             rt               = _fjsp_mul_v2r8(r00,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 12;
+             vfconv.i[1]     *= 12;
+             /* CUBIC SPLINE TABLE ELECTROSTATICS
+              * Same evaluation as in the paired loop, with zeros standing in for the missing second j.
+              */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+             H                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+             VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+             velec            = _fjsp_mul_v2r8(qq00,VV);
+             FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+             felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,FF),_fjsp_mul_v2r8(vftabscale,rinv00)));
+             /* CUBIC SPLINE TABLE DISPERSION */
+             vfconv.i[0]       += 4;
+             vfconv.i[1]       += 4;
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 2 );
+             H                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+             VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+             vvdw6            = _fjsp_mul_v2r8(c6_00,VV);
+             FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+             fvdw6            = _fjsp_mul_v2r8(c6_00,FF);
+             /* CUBIC SPLINE TABLE REPULSION */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 4 );
+             F                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 6 );
+             H                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+             VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+             vvdw12           = _fjsp_mul_v2r8(c12_00,VV);
+             FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+             fvdw12           = _fjsp_mul_v2r8(c12_00,FF);
+             vvdw             = _fjsp_add_v2r8(vvdw12,vvdw6);
+             fvdw             = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_add_v2r8(fvdw6,fvdw12),_fjsp_mul_v2r8(vftabscale,rinv00)));
+             /* Update potential sum for this i atom from the interaction with this j atom.
+              * The _fjsp_unpacklo_v2r8(...,zero) calls presumably clear the unused second lane so that
+              * it contributes neither energy nor force - confirm against the intrinsic's definition.
+              */
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             vvdw             = _fjsp_unpacklo_v2r8(vvdw,_fjsp_setzero_v2r8());
+             vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+             fscal            = _fjsp_add_v2r8(felec,fvdw);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             
+             gmx_fjsp_decrement_fma_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fscal,dx00,dy00,dz00);
+             /* Inner loop uses 76 flops */
+         }
+         /* End of innermost loop: hand the accumulated i force to the helper together with the
+          * i atom's force and shift-force locations.
+          */
+         gmx_fjsp_update_iforce_1atom_swizzle_v2r8(fix0,fiy0,fiz0,
+                                               f+i_coord_offset,fshift+i_shift_offset);
+         ggid                        = gid[iidx];
+         /* Update potential energies for the energy group index of this list entry */
+         gmx_fjsp_update_1pot_v2r8(velecsum,kernel_data->energygrp_elec+ggid);
+         gmx_fjsp_update_1pot_v2r8(vvdwsum,kernel_data->energygrp_vdw+ggid);
+         /* Increment number of inner iterations (counts every j entry, including an odd leftover) */
+         inneriter                  += j_index_end - j_index_start;
+         /* Outer loop uses 9 flops */
+     }
+     /* Increment number of outer iterations */
+     outeriter        += nri;
+     /* Update outer/inner flops */
+     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_VF,outeriter*9 + inneriter*76);
+ }
+ /*
+  * Gromacs nonbonded kernel:   nb_kernel_ElecCSTab_VdwCSTab_GeomP1P1_F_sparc64_hpc_ace_double
+  * Electrostatics interaction: CubicSplineTable
+  * VdW interaction:            CubicSplineTable
+  * Geometry:                   Particle-Particle
+  * Calculate force/pot:        Force
+  */
+ void
+ nb_kernel_ElecCSTab_VdwCSTab_GeomP1P1_F_sparc64_hpc_ace_double
+                     (t_nblist * gmx_restrict                nlist,
+                      rvec * gmx_restrict                    xx,
+                      rvec * gmx_restrict                    ff,
+                      t_forcerec * gmx_restrict              fr,
+                      t_mdatoms * gmx_restrict               mdatoms,
+                      nb_kernel_data_t * gmx_restrict        kernel_data,
+                      t_nrnb * gmx_restrict                  nrnb)
+ {
+     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+      * just 0 for non-waters.
+      * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+      * jnr indices corresponding to data put in the four positions in the SIMD register.
+      */
+     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+     int              jnrA,jnrB;
+     int              j_coord_offsetA,j_coord_offsetB;
+     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+     real             rcutoff_scalar;
+     real             *shiftvec,*fshift,*x,*f;
+     _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+     int              vdwioffset0;
+     _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+     int              vdwjidx0A,vdwjidx0B;
+     _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+     _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+     _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+     real             *charge;
+     int              nvdwtype;
+     _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+     int              *vdwtype;
+     real             *vdwparam;
+     _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+     _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+     _fjsp_v2r8       rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF,twovfeps;
+     real             *vftab;
+     _fjsp_v2r8       itab_tmp;
+     _fjsp_v2r8       dummy_mask,cutoff_mask;
+     _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+     _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+     union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+     x                = xx[0];
+     f                = ff[0];
+     nri              = nlist->nri;
+     iinr             = nlist->iinr;
+     jindex           = nlist->jindex;
+     jjnr             = nlist->jjnr;
+     shiftidx         = nlist->shift;
+     gid              = nlist->gid;
+     shiftvec         = fr->shift_vec[0];
+     fshift           = fr->fshift[0];
+     facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+     charge           = mdatoms->chargeA;
+     nvdwtype         = fr->ntype;
+     vdwparam         = fr->nbfp;
+     vdwtype          = mdatoms->typeA;
+     vftab            = kernel_data->table_elec_vdw->data;
+     vftabscale       = gmx_fjsp_set1_v2r8(kernel_data->table_elec_vdw->scale);
+     /* Avoid stupid compiler warnings */
+     jnrA = jnrB = 0;
+     j_coord_offsetA = 0;
+     j_coord_offsetB = 0;
+     outeriter        = 0;
+     inneriter        = 0;
+     /* Start outer loop over neighborlists */
+     for(iidx=0; iidx<nri; iidx++)
+     {
+         /* Load shift vector for this list */
+         i_shift_offset   = DIM*shiftidx[iidx];
+         /* Load limits for loop over neighbors */
+         j_index_start    = jindex[iidx];
+         j_index_end      = jindex[iidx+1];
+         /* Get outer coordinate index */
+         inr              = iinr[iidx];
+         i_coord_offset   = DIM*inr;
+         /* Load i particle coords and add shift vector */
+         gmx_fjsp_load_shift_and_1rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
+         fix0             = _fjsp_setzero_v2r8();
+         fiy0             = _fjsp_setzero_v2r8();
+         fiz0             = _fjsp_setzero_v2r8();
+         /* Load parameters for i particles */
+         iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_load1_v2r8(charge+inr+0));
+         vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+         /* Start inner kernel loop */
+         for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+         {
+             /* Get j neighbor index, and coordinate index */
+             jnrA             = jjnr[jidx];
+             jnrB             = jjnr[jidx+1];
+             j_coord_offsetA  = DIM*jnrA;
+             j_coord_offsetB  = DIM*jnrB;
+             /* load j atom coordinates */
+             gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                               &jx0,&jy0,&jz0);
+             /* Calculate displacement vector */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             /* Calculate squared distance and things based on it */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+             /* Load parameters for j particles */
+             jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+             vdwjidx0A        = 2*vdwtype[jnrA+0];
+             vdwjidx0B        = 2*vdwtype[jnrB+0];
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+             /* Compute parameters for interactions between i and j atoms */
+             qq00             = _fjsp_mul_v2r8(iq0,jq0);
+             gmx_fjsp_load_2pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,
+                                          vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
+             /* Calculate table index by multiplying r with table scale and truncate to integer */
+             rt               = _fjsp_mul_v2r8(r00,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 12;
+             vfconv.i[1]     *= 12;
+             /* CUBIC SPLINE TABLE ELECTROSTATICS */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+             H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+             FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+             felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,FF),_fjsp_mul_v2r8(vftabscale,rinv00)));
+             /* CUBIC SPLINE TABLE DISPERSION */
+             vfconv.i[0]       += 4;
+             vfconv.i[1]       += 4;
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 2 );
+             H                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 2 );
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+             FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+             fvdw6            = _fjsp_mul_v2r8(c6_00,FF);
+             /* CUBIC SPLINE TABLE REPULSION */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 4 );
+             F                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 4 );
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 6 );
+             H                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 6 );
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+             FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+             fvdw12           = _fjsp_mul_v2r8(c12_00,FF);
+             fvdw             = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_add_v2r8(fvdw6,fvdw12),_fjsp_mul_v2r8(vftabscale,rinv00)));
+             fscal            = _fjsp_add_v2r8(felec,fvdw);
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             
+             gmx_fjsp_decrement_fma_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fscal,dx00,dy00,dz00);
+             /* Inner loop uses 64 flops */
+         }
+         if(jidx<j_index_end)
+         {
+             jnrA             = jjnr[jidx];
+             j_coord_offsetA  = DIM*jnrA;
+             /* load j atom coordinates */
+             gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                               &jx0,&jy0,&jz0);
+             /* Calculate displacement vector */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             /* Calculate squared distance and things based on it */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+             /* Load parameters for j particles */
+             jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
+             vdwjidx0A        = 2*vdwtype[jnrA+0];
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+             /* Compute parameters for interactions between i and j atoms */
+             qq00             = _fjsp_mul_v2r8(iq0,jq0);
+             gmx_fjsp_load_1pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
+             /* Calculate table index by multiplying r with table scale and truncate to integer */
+             rt               = _fjsp_mul_v2r8(r00,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 12;
+             vfconv.i[1]     *= 12;
+             /* CUBIC SPLINE TABLE ELECTROSTATICS */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+             H                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+             FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+             felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,FF),_fjsp_mul_v2r8(vftabscale,rinv00)));
+             /* CUBIC SPLINE TABLE DISPERSION */
+             vfconv.i[0]       += 4;
+             vfconv.i[1]       += 4;
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 2 );
+             H                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+             FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+             fvdw6            = _fjsp_mul_v2r8(c6_00,FF);
+             /* CUBIC SPLINE TABLE REPULSION */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 4 );
+             F                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 6 );
+             H                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+             FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+             fvdw12           = _fjsp_mul_v2r8(c12_00,FF);
+             fvdw             = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_add_v2r8(fvdw6,fvdw12),_fjsp_mul_v2r8(vftabscale,rinv00)));
+             fscal            = _fjsp_add_v2r8(felec,fvdw);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             
+             gmx_fjsp_decrement_fma_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fscal,dx00,dy00,dz00);
+             /* Inner loop uses 64 flops */
+         }
+         /* End of innermost loop */
+         gmx_fjsp_update_iforce_1atom_swizzle_v2r8(fix0,fiy0,fiz0,
+                                               f+i_coord_offset,fshift+i_shift_offset);
+         /* Increment number of inner iterations */
+         inneriter                  += j_index_end - j_index_start;
+         /* Outer loop uses 7 flops */
+     }
+     /* Increment number of outer iterations */
+     outeriter        += nri;
+     /* Update outer/inner flops */
+     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_F,outeriter*7 + inneriter*64);
+ }
index 0000000000000000000000000000000000000000,66d4fc922213439e2d02aeca5df85a0ffe9770a2..66d4fc922213439e2d02aeca5df85a0ffe9770a2
mode 000000,100644..100644
--- /dev/null
@@@ -1,0 -1,1173 +1,1173 @@@
+ /*
+  * This file is part of the GROMACS molecular simulation package.
+  *
+  * Copyright (c) 2012, by the GROMACS development team, led by
+  * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+  * others, as listed in the AUTHORS file in the top-level source
+  * directory and at http://www.gromacs.org.
+  *
+  * GROMACS is free software; you can redistribute it and/or
+  * modify it under the terms of the GNU Lesser General Public License
+  * as published by the Free Software Foundation; either version 2.1
+  * of the License, or (at your option) any later version.
+  *
+  * GROMACS is distributed in the hope that it will be useful,
+  * but WITHOUT ANY WARRANTY; without even the implied warranty of
+  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  * Lesser General Public License for more details.
+  *
+  * You should have received a copy of the GNU Lesser General Public
+  * License along with GROMACS; if not, see
+  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+  *
+  * If you want to redistribute modifications to GROMACS, please
+  * consider that scientific software is very special. Version
+  * control is crucial - bugs must be traceable. We will be happy to
+  * consider code for inclusion in the official distribution, but
+  * derived work must not be called official GROMACS. Details are found
+  * in the README & COPYING files - if they are missing, get the
+  * official version at http://www.gromacs.org.
+  *
+  * To help us fund GROMACS development, we humbly ask that you cite
+  * the research papers on the package. Check out http://www.gromacs.org.
+  */
+ /*
+  * Note: this file was generated by the GROMACS sparc64_hpc_ace_double kernel generator.
+  */
+ #ifdef HAVE_CONFIG_H
+ #include <config.h>
+ #endif
+ #include <math.h>
+ #include "../nb_kernel.h"
+ #include "types/simple.h"
+ #include "vec.h"
+ #include "nrnb.h"
+ #include "kernelutil_sparc64_hpc_ace_double.h"
+ /*
+  * Gromacs nonbonded kernel:   nb_kernel_ElecCSTab_VdwCSTab_GeomW3P1_VF_sparc64_hpc_ace_double
+  * Electrostatics interaction: CubicSplineTable
+  * VdW interaction:            CubicSplineTable
+  * Geometry:                   Water3-Particle
+  * Calculate force/pot:        PotentialAndForce
+  */
+ void
+ nb_kernel_ElecCSTab_VdwCSTab_GeomW3P1_VF_sparc64_hpc_ace_double
+                     (t_nblist * gmx_restrict                nlist,
+                      rvec * gmx_restrict                    xx,
+                      rvec * gmx_restrict                    ff,
+                      t_forcerec * gmx_restrict              fr,
+                      t_mdatoms * gmx_restrict               mdatoms,
+                      nb_kernel_data_t * gmx_restrict        kernel_data,
+                      t_nrnb * gmx_restrict                  nrnb)
+ {
+     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+      * just 0 for non-waters.
+      * Suffixes A,B refer to j loop unrolling done with double precision SIMD, i.e. the two different
+      * jnr indices corresponding to the data put in the two positions of the SIMD register.
+      */
+     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+     int              jnrA,jnrB;
+     int              j_coord_offsetA,j_coord_offsetB;
+     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+     real             rcutoff_scalar;
+     real             *shiftvec,*fshift,*x,*f;
+     _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+     int              vdwioffset0;
+     _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+     int              vdwioffset1;
+     _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+     int              vdwioffset2;
+     _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+     int              vdwjidx0A,vdwjidx0B;
+     _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+     _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+     _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+     _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+     _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+     real             *charge;
+     int              nvdwtype;
+     _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+     int              *vdwtype;
+     real             *vdwparam;
+     _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+     _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+     _fjsp_v2r8       rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF,twovfeps;
+     real             *vftab;
+     _fjsp_v2r8       itab_tmp;
+     _fjsp_v2r8       dummy_mask,cutoff_mask;
+     _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+     _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+     union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+     x                = xx[0];
+     f                = ff[0];
+     nri              = nlist->nri;
+     iinr             = nlist->iinr;
+     jindex           = nlist->jindex;
+     jjnr             = nlist->jjnr;
+     shiftidx         = nlist->shift;
+     gid              = nlist->gid;
+     shiftvec         = fr->shift_vec[0];
+     fshift           = fr->fshift[0];
+     facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+     charge           = mdatoms->chargeA;
+     nvdwtype         = fr->ntype;
+     vdwparam         = fr->nbfp;
+     vdwtype          = mdatoms->typeA;
+     vftab            = kernel_data->table_elec_vdw->data;
+     vftabscale       = gmx_fjsp_set1_v2r8(kernel_data->table_elec_vdw->scale);
+     /* Setup water-specific parameters */
+     inr              = nlist->iinr[0];
+     iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+0]));
+     iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+     iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+     vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+     /* Avoid stupid compiler warnings */
+     jnrA = jnrB = 0;
+     j_coord_offsetA = 0;
+     j_coord_offsetB = 0;
+     outeriter        = 0;
+     inneriter        = 0;
+     /* Start outer loop over neighborlists */
+     for(iidx=0; iidx<nri; iidx++)
+     {
+         /* Load shift vector for this list */
+         i_shift_offset   = DIM*shiftidx[iidx];
+         /* Load limits for loop over neighbors */
+         j_index_start    = jindex[iidx];
+         j_index_end      = jindex[iidx+1];
+         /* Get outer coordinate index */
+         inr              = iinr[iidx];
+         i_coord_offset   = DIM*inr;
+         /* Load i particle coords and add shift vector */
+         gmx_fjsp_load_shift_and_3rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                  &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+         fix0             = _fjsp_setzero_v2r8();
+         fiy0             = _fjsp_setzero_v2r8();
+         fiz0             = _fjsp_setzero_v2r8();
+         fix1             = _fjsp_setzero_v2r8();
+         fiy1             = _fjsp_setzero_v2r8();
+         fiz1             = _fjsp_setzero_v2r8();
+         fix2             = _fjsp_setzero_v2r8();
+         fiy2             = _fjsp_setzero_v2r8();
+         fiz2             = _fjsp_setzero_v2r8();
+         /* Reset potential sums */
+         velecsum         = _fjsp_setzero_v2r8();
+         vvdwsum          = _fjsp_setzero_v2r8();
+         /* Start inner kernel loop */
+         for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+         {
+             /* Get j neighbor index, and coordinate index */
+             jnrA             = jjnr[jidx];
+             jnrB             = jjnr[jidx+1];
+             j_coord_offsetA  = DIM*jnrA;
+             j_coord_offsetB  = DIM*jnrB;
+             /* load j atom coordinates */
+             gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                               &jx0,&jy0,&jz0);
+             /* Calculate displacement vector */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             dx10             = _fjsp_sub_v2r8(ix1,jx0);
+             dy10             = _fjsp_sub_v2r8(iy1,jy0);
+             dz10             = _fjsp_sub_v2r8(iz1,jz0);
+             dx20             = _fjsp_sub_v2r8(ix2,jx0);
+             dy20             = _fjsp_sub_v2r8(iy2,jy0);
+             dz20             = _fjsp_sub_v2r8(iz2,jz0);
+             /* Calculate squared distance and things based on it */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+             rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+             rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+             rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+             rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+             /* Load parameters for j particles */
+             jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+             vdwjidx0A        = 2*vdwtype[jnrA+0];
+             vdwjidx0B        = 2*vdwtype[jnrB+0];
+             fjx0             = _fjsp_setzero_v2r8();
+             fjy0             = _fjsp_setzero_v2r8();
+             fjz0             = _fjsp_setzero_v2r8();
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+             /* Compute parameters for interactions between i and j atoms */
+             qq00             = _fjsp_mul_v2r8(iq0,jq0);
+             gmx_fjsp_load_2pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,
+                                          vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
+             /* Calculate table index by multiplying r with table scale and truncate to integer */
+             rt               = _fjsp_mul_v2r8(r00,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 12;
+             vfconv.i[1]     *= 12;
+             /* CUBIC SPLINE TABLE ELECTROSTATICS */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+             H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+             VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+             velec            = _fjsp_mul_v2r8(qq00,VV);
+             FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+             felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,FF),_fjsp_mul_v2r8(vftabscale,rinv00)));
+             /* CUBIC SPLINE TABLE DISPERSION */
+             vfconv.i[0]       += 4;
+             vfconv.i[1]       += 4;
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 2 );
+             H                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 2 );
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+             VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+             vvdw6            = _fjsp_mul_v2r8(c6_00,VV);
+             FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+             fvdw6            = _fjsp_mul_v2r8(c6_00,FF);
+             /* CUBIC SPLINE TABLE REPULSION */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 4 );
+             F                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 4 );
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 6 );
+             H                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 6 );
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+             VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+             vvdw12           = _fjsp_mul_v2r8(c12_00,VV);
+             FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+             fvdw12           = _fjsp_mul_v2r8(c12_00,FF);
+             vvdw             = _fjsp_add_v2r8(vvdw12,vvdw6);
+             fvdw             = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_add_v2r8(fvdw6,fvdw12),_fjsp_mul_v2r8(vftabscale,rinv00)));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+             fscal            = _fjsp_add_v2r8(felec,fvdw);
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             
+             fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+             /* Compute parameters for interactions between i and j atoms */
+             qq10             = _fjsp_mul_v2r8(iq1,jq0);
+             /* Calculate table index by multiplying r with table scale and truncate to integer */
+             rt               = _fjsp_mul_v2r8(r10,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 12;
+             vfconv.i[1]     *= 12;
+             /* CUBIC SPLINE TABLE ELECTROSTATICS */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+             H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+             VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+             velec            = _fjsp_mul_v2r8(qq10,VV);
+             FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+             felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,FF),_fjsp_mul_v2r8(vftabscale,rinv10)));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+             
+             fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+             /* Compute parameters for interactions between i and j atoms */
+             qq20             = _fjsp_mul_v2r8(iq2,jq0);
+             /* Calculate table index by multiplying r with table scale and truncate to integer */
+             rt               = _fjsp_mul_v2r8(r20,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 12;
+             vfconv.i[1]     *= 12;
+             /* CUBIC SPLINE TABLE ELECTROSTATICS */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+             H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+             VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+             velec            = _fjsp_mul_v2r8(qq20,VV);
+             FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+             felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,FF),_fjsp_mul_v2r8(vftabscale,rinv20)));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+             
+             fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+             gmx_fjsp_decrement_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0);
+             /* Inner loop uses 171 flops */
+         }
+         if(jidx<j_index_end)
+         {
+             jnrA             = jjnr[jidx];
+             j_coord_offsetA  = DIM*jnrA;
+             /* load j atom coordinates */
+             gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                               &jx0,&jy0,&jz0);
+             /* Calculate displacement vector */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             dx10             = _fjsp_sub_v2r8(ix1,jx0);
+             dy10             = _fjsp_sub_v2r8(iy1,jy0);
+             dz10             = _fjsp_sub_v2r8(iz1,jz0);
+             dx20             = _fjsp_sub_v2r8(ix2,jx0);
+             dy20             = _fjsp_sub_v2r8(iy2,jy0);
+             dz20             = _fjsp_sub_v2r8(iz2,jz0);
+             /* Calculate squared distance and things based on it */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+             rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+             rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+             rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+             rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+             /* Load parameters for j particles */
+             jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
+             vdwjidx0A        = 2*vdwtype[jnrA+0];
+             fjx0             = _fjsp_setzero_v2r8();
+             fjy0             = _fjsp_setzero_v2r8();
+             fjz0             = _fjsp_setzero_v2r8();
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+             /* Compute parameters for interactions between i and j atoms */
+             qq00             = _fjsp_mul_v2r8(iq0,jq0);
+             gmx_fjsp_load_1pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
+             /* Calculate table index by multiplying r with table scale and truncate to integer */
+             rt               = _fjsp_mul_v2r8(r00,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 12;
+             vfconv.i[1]     *= 12;
+             /* CUBIC SPLINE TABLE ELECTROSTATICS */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+             H                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+             VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+             velec            = _fjsp_mul_v2r8(qq00,VV);
+             FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+             felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,FF),_fjsp_mul_v2r8(vftabscale,rinv00)));
+             /* CUBIC SPLINE TABLE DISPERSION */
+             vfconv.i[0]       += 4;
+             vfconv.i[1]       += 4;
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 2 );
+             H                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+             VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+             vvdw6            = _fjsp_mul_v2r8(c6_00,VV);
+             FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+             fvdw6            = _fjsp_mul_v2r8(c6_00,FF);
+             /* CUBIC SPLINE TABLE REPULSION */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 4 );
+             F                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 6 );
+             H                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+             VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+             vvdw12           = _fjsp_mul_v2r8(c12_00,VV);
+             FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+             fvdw12           = _fjsp_mul_v2r8(c12_00,FF);
+             vvdw             = _fjsp_add_v2r8(vvdw12,vvdw6);
+             fvdw             = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_add_v2r8(fvdw6,fvdw12),_fjsp_mul_v2r8(vftabscale,rinv00)));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             vvdw             = _fjsp_unpacklo_v2r8(vvdw,_fjsp_setzero_v2r8());
+             vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+             fscal            = _fjsp_add_v2r8(felec,fvdw);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             
+             fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+             /* Compute parameters for interactions between i and j atoms */
+             qq10             = _fjsp_mul_v2r8(iq1,jq0);
+             /* Calculate table index by multiplying r with table scale and truncate to integer */
+             rt               = _fjsp_mul_v2r8(r10,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 12;
+             vfconv.i[1]     *= 12;
+             /* CUBIC SPLINE TABLE ELECTROSTATICS */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+             H                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+             VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+             velec            = _fjsp_mul_v2r8(qq10,VV);
+             FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+             felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,FF),_fjsp_mul_v2r8(vftabscale,rinv10)));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+             
+             fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+             /* Compute parameters for interactions between i and j atoms */
+             qq20             = _fjsp_mul_v2r8(iq2,jq0);
+             /* Calculate table index by multiplying r with table scale and truncate to integer */
+             rt               = _fjsp_mul_v2r8(r20,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 12;
+             vfconv.i[1]     *= 12;
+             /* CUBIC SPLINE TABLE ELECTROSTATICS */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+             H                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+             VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+             velec            = _fjsp_mul_v2r8(qq20,VV);
+             FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+             felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,FF),_fjsp_mul_v2r8(vftabscale,rinv20)));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+             
+             fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+             gmx_fjsp_decrement_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0);
+             /* Inner loop uses 171 flops */
+         }
+         /* End of innermost loop */
+         gmx_fjsp_update_iforce_3atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                               f+i_coord_offset,fshift+i_shift_offset);
+         ggid                        = gid[iidx];
+         /* Update potential energies */
+         gmx_fjsp_update_1pot_v2r8(velecsum,kernel_data->energygrp_elec+ggid);
+         gmx_fjsp_update_1pot_v2r8(vvdwsum,kernel_data->energygrp_vdw+ggid);
+         /* Increment number of inner iterations */
+         inneriter                  += j_index_end - j_index_start;
+         /* Outer loop uses 20 flops */
+     }
+     /* Increment number of outer iterations */
+     outeriter        += nri;
+     /* Update outer/inner flops */
+     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3_VF,outeriter*20 + inneriter*171);
+ }
+ /*
+  * Gromacs nonbonded kernel:   nb_kernel_ElecCSTab_VdwCSTab_GeomW3P1_F_sparc64_hpc_ace_double
+  * Electrostatics interaction: CubicSplineTable
+  * VdW interaction:            CubicSplineTable
+  * Geometry:                   Water3-Particle
+  * Calculate force/pot:        Force
+  *
+  * Force-only kernel. For every entry of the neighbor list, the three i atoms
+  * (suffixes 0,1,2) interact with each listed j particle. All three pairs use the
+  * tabulated electrostatics; only the pair with i atom 0 additionally uses the
+  * tabulated dispersion and repulsion terms. No potential energies are accumulated.
+  *
+  * Inputs read by this function:
+  *   nlist       - nri, iinr, jindex, jjnr, shift (gid is fetched but not used)
+  *   xx          - coordinates, accessed as a flat array with DIM reals per atom
+  *   ff          - forces, accessed the same way and passed to the force-update helpers
+  *   fr          - shift_vec, fshift, epsfac, ntype, nbfp
+  *   mdatoms     - chargeA, typeA
+  *   kernel_data - table_elec_vdw (data and scale)
+  *   nrnb        - passed to inc_nrnb() with the flop estimate at the end
+  */
+ void
+ nb_kernel_ElecCSTab_VdwCSTab_GeomW3P1_F_sparc64_hpc_ace_double
+                     (t_nblist * gmx_restrict                nlist,
+                      rvec * gmx_restrict                    xx,
+                      rvec * gmx_restrict                    ff,
+                      t_forcerec * gmx_restrict              fr,
+                      t_mdatoms * gmx_restrict               mdatoms,
+                      nb_kernel_data_t * gmx_restrict        kernel_data,
+                      t_nrnb * gmx_restrict                  nrnb)
+ {
+     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+      * just 0 for non-waters.
+      * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+      * jnr indices corresponding to data put in the two positions in the SIMD register.
+      *
+      * The declarations below are a common set; several of them are never referenced in
+      * this particular kernel (for example rcutoff_scalar, ggid, vdwioffset1, vdwioffset2,
+      * one_sixth, one_twelfth, one, two, gbconv and ewconv).
+      */
+     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+     int              jnrA,jnrB;
+     int              j_coord_offsetA,j_coord_offsetB;
+     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+     real             rcutoff_scalar;
+     real             *shiftvec,*fshift,*x,*f;
+     _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+     int              vdwioffset0;
+     _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+     int              vdwioffset1;
+     _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+     int              vdwioffset2;
+     _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+     int              vdwjidx0A,vdwjidx0B;
+     _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+     _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+     _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+     _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+     _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+     real             *charge;
+     int              nvdwtype;
+     _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+     int              *vdwtype;
+     real             *vdwparam;
+     _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+     _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+     _fjsp_v2r8       rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF,twovfeps;
+     real             *vftab;
+     _fjsp_v2r8       itab_tmp;
+     _fjsp_v2r8       dummy_mask,cutoff_mask;
+     _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+     _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+     union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;  /* only vfconv is used: reads the two table indices back as integers */
+     x                = xx[0];  /* flat view of the coordinates, indexed with DIM*atom below */
+     f                = ff[0];  /* flat view of the forces, indexed with DIM*atom below */
+     nri              = nlist->nri;
+     iinr             = nlist->iinr;
+     jindex           = nlist->jindex;
+     jjnr             = nlist->jjnr;
+     shiftidx         = nlist->shift;
+     gid              = nlist->gid;  /* fetched but never read again in this force-only kernel */
+     shiftvec         = fr->shift_vec[0];
+     fshift           = fr->fshift[0];
+     facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+     charge           = mdatoms->chargeA;
+     nvdwtype         = fr->ntype;
+     vdwparam         = fr->nbfp;
+     vdwtype          = mdatoms->typeA;
+     vftab            = kernel_data->table_elec_vdw->data;
+     vftabscale       = gmx_fjsp_set1_v2r8(kernel_data->table_elec_vdw->scale);
+     /* Setup water-specific parameters.
+      * The three i charges (pre-multiplied by facel) and the VdW offset of i atom 0 are
+      * taken from the first i entry only and reused for every outer iteration, so all i
+      * entries are presumably the same water model - confirm against the list setup.
+      * NOTE(review): iinr[0] is read unconditionally, i.e. also when nri == 0.
+      */
+     inr              = nlist->iinr[0];
+     iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+0]));
+     iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+     iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+     vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];  /* start of the i-type row in vdwparam: two reals (c6,c12) per type pair */
+     /* Avoid stupid compiler warnings */
+     jnrA = jnrB = 0;
+     j_coord_offsetA = 0;
+     j_coord_offsetB = 0;
+     outeriter        = 0;
+     inneriter        = 0;
+     /* Start outer loop over neighborlists */
+     for(iidx=0; iidx<nri; iidx++)
+     {
+         /* Load shift vector for this list */
+         i_shift_offset   = DIM*shiftidx[iidx];
+         /* Load limits for loop over neighbors */
+         j_index_start    = jindex[iidx];
+         j_index_end      = jindex[iidx+1];
+         /* Get outer coordinate index */
+         inr              = iinr[iidx];
+         i_coord_offset   = DIM*inr;
+         /* Load i particle coords and add shift vector */
+         gmx_fjsp_load_shift_and_3rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                  &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+         fix0             = _fjsp_setzero_v2r8();  /* the nine i force accumulators start at zero for every list entry */
+         fiy0             = _fjsp_setzero_v2r8();
+         fiz0             = _fjsp_setzero_v2r8();
+         fix1             = _fjsp_setzero_v2r8();
+         fiy1             = _fjsp_setzero_v2r8();
+         fiz1             = _fjsp_setzero_v2r8();
+         fix2             = _fjsp_setzero_v2r8();
+         fiy2             = _fjsp_setzero_v2r8();
+         fiz2             = _fjsp_setzero_v2r8();
+         /* Start inner kernel loop.
+          * Two j entries (A and B) are handled per iteration; the loop stops while at
+          * least two entries remain, and a trailing single entry is handled by the
+          * if-branch that follows the loop.
+          */
+         for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+         {
+             /* Get j neighbor index, and coordinate index */
+             jnrA             = jjnr[jidx];
+             jnrB             = jjnr[jidx+1];
+             j_coord_offsetA  = DIM*jnrA;
+             j_coord_offsetB  = DIM*jnrB;
+             /* load j atom coordinates */
+             gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                               &jx0,&jy0,&jz0);
+             /* Calculate displacement vector */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             dx10             = _fjsp_sub_v2r8(ix1,jx0);
+             dy10             = _fjsp_sub_v2r8(iy1,jy0);
+             dz10             = _fjsp_sub_v2r8(iz1,jz0);
+             dx20             = _fjsp_sub_v2r8(ix2,jx0);
+             dy20             = _fjsp_sub_v2r8(iy2,jy0);
+             dz20             = _fjsp_sub_v2r8(iz2,jz0);
+             /* Calculate squared distance and things based on it */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+             rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+             rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+             rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+             rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+             /* Load parameters for j particles */
+             jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+             vdwjidx0A        = 2*vdwtype[jnrA+0];  /* column offset of the j type within the i-type row */
+             vdwjidx0B        = 2*vdwtype[jnrB+0];
+             fjx0             = _fjsp_setzero_v2r8();  /* j force accumulators, summed over the three i atoms below */
+             fjy0             = _fjsp_setzero_v2r8();
+             fjz0             = _fjsp_setzero_v2r8();
+             /**************************
+              * CALCULATE INTERACTIONS *
+              * i atom 0 with the j particle: tabulated electrostatics, dispersion, repulsion
+              **************************/
+             r00              = _fjsp_mul_v2r8(rsq00,rinv00);  /* distance as rsq times the invsqrt result */
+             /* Compute parameters for interactions between i and j atoms */
+             qq00             = _fjsp_mul_v2r8(iq0,jq0);
+             gmx_fjsp_load_2pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,
+                                          vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
+             /* Calculate table index by multiplying r with table scale and truncate to integer.
+              * vfeps is rt minus the integer part converted back to floating point. The integer
+              * register is stored through the vfconv union so both indices can be read as
+              * scalars, and each is multiplied by 12, the number of table reals per table
+              * point as used here (offsets 0-3 electrostatics, 4-7 dispersion, 8-11 repulsion).
+              */
+             rt               = _fjsp_mul_v2r8(r00,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 12;
+             vfconv.i[1]     *= 12;
+             /* CUBIC SPLINE TABLE ELECTROSTATICS
+              * Table data is fetched separately for entry A (index i[0]) and entry B (index i[1])
+              * and rearranged by the transpose macro - presumably so that Y, F, G and H each
+              * hold one coefficient for both entries; verify against the macro definition.
+              * Assuming _fjsp_madd_v2r8(a,b,c) computes a*b+c:
+              *   Fp    = F + eps*(G + eps*H)
+              *   FF    = Fp + eps*(G + 2*eps*H)
+              *   felec = -(qq*FF)*(tabscale*rinv)
+              * Y is not referenced again after the transpose in this force-only kernel.
+              */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+             H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+             FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+             felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,FF),_fjsp_mul_v2r8(vftabscale,rinv00)));
+             /* CUBIC SPLINE TABLE DISPERSION
+              * Both indices are advanced by 4 to the dispersion coefficients of the same table
+              * point; the spline evaluation is the same as above and FF is scaled by c6.
+              */
+             vfconv.i[0]       += 4;
+             vfconv.i[1]       += 4;
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 2 );
+             H                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 2 );
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+             FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+             fvdw6            = _fjsp_mul_v2r8(c6_00,FF);
+             /* CUBIC SPLINE TABLE REPULSION
+              * Read at +4 and +6 relative to the dispersion index, i.e. the last four reals
+              * of the table point; FF is scaled by c12.
+              */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 4 );
+             F                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 4 );
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 6 );
+             H                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 6 );
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+             FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+             fvdw12           = _fjsp_mul_v2r8(c12_00,FF);
+             fvdw             = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_add_v2r8(fvdw6,fvdw12),_fjsp_mul_v2r8(vftabscale,rinv00)));  /* -(fvdw6+fvdw12)*(tabscale*rinv) */
+             fscal            = _fjsp_add_v2r8(felec,fvdw);
+             /* Update vectorial force.
+              * The same product of displacement and fscal is accumulated for the i atom and
+              * for the j particle; the j sum is handed to the decrement helper after all
+              * three pairs have been processed.
+              */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             
+             fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              * i atom 1 with the j particle: tabulated electrostatics only
+              **************************/
+             r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+             /* Compute parameters for interactions between i and j atoms */
+             qq10             = _fjsp_mul_v2r8(iq1,jq0);
+             /* Calculate table index by multiplying r with table scale and truncate to integer */
+             rt               = _fjsp_mul_v2r8(r10,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 12;
+             vfconv.i[1]     *= 12;
+             /* CUBIC SPLINE TABLE ELECTROSTATICS */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+             H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+             FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+             felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,FF),_fjsp_mul_v2r8(vftabscale,rinv10)));
+             fscal            = felec;
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+             
+             fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              * i atom 2 with the j particle: tabulated electrostatics only
+              **************************/
+             r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+             /* Compute parameters for interactions between i and j atoms */
+             qq20             = _fjsp_mul_v2r8(iq2,jq0);
+             /* Calculate table index by multiplying r with table scale and truncate to integer */
+             rt               = _fjsp_mul_v2r8(r20,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 12;
+             vfconv.i[1]     *= 12;
+             /* CUBIC SPLINE TABLE ELECTROSTATICS */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+             H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+             FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+             felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,FF),_fjsp_mul_v2r8(vftabscale,rinv20)));
+             fscal            = felec;
+             /* Update vectorial force; afterwards the summed j forces go to the helper together
+              * with the force locations of j entries A and B (presumably subtracted there, as
+              * the helper name suggests - confirm against its definition).
+              */
+             fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+             
+             fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+             gmx_fjsp_decrement_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0);
+             /* Inner loop uses 151 flops */
+         }
+         if(jidx<j_index_end)  /* one j entry is left over: process it alone as entry A */
+         {
+             jnrA             = jjnr[jidx];
+             j_coord_offsetA  = DIM*jnrA;
+             /* load j atom coordinates */
+             gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                               &jx0,&jy0,&jz0);
+             /* Calculate displacement vector */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             dx10             = _fjsp_sub_v2r8(ix1,jx0);
+             dy10             = _fjsp_sub_v2r8(iy1,jy0);
+             dz10             = _fjsp_sub_v2r8(iz1,jz0);
+             dx20             = _fjsp_sub_v2r8(ix2,jx0);
+             dy20             = _fjsp_sub_v2r8(iy2,jy0);
+             dz20             = _fjsp_sub_v2r8(iz2,jz0);
+             /* Calculate squared distance and things based on it */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+             rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+             rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+             rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+             rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+             /* Load parameters for j particles.
+              * Only entry A exists here, so a single charge is loaded on top of a zeroed
+              * register and only vdwjidx0A is computed.
+              */
+             jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
+             vdwjidx0A        = 2*vdwtype[jnrA+0];
+             fjx0             = _fjsp_setzero_v2r8();
+             fjy0             = _fjsp_setzero_v2r8();
+             fjz0             = _fjsp_setzero_v2r8();
+             /**************************
+              * CALCULATE INTERACTIONS *
+              * i atom 0 with the j particle: tabulated electrostatics, dispersion, repulsion
+              **************************/
+             r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+             /* Compute parameters for interactions between i and j atoms */
+             qq00             = _fjsp_mul_v2r8(iq0,jq0);
+             gmx_fjsp_load_1pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
+             /* Calculate table index by multiplying r with table scale and truncate to integer.
+              * Both union elements are still scaled, but only i[0] is used for table loads in
+              * this branch.
+              */
+             rt               = _fjsp_mul_v2r8(r00,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 12;
+             vfconv.i[1]     *= 12;
+             /* CUBIC SPLINE TABLE ELECTROSTATICS
+              * Same evaluation as in the paired loop, except that no table data is loaded
+              * for a second entry: F and H are set to zero before each transpose instead.
+              */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+             H                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+             FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+             felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,FF),_fjsp_mul_v2r8(vftabscale,rinv00)));
+             /* CUBIC SPLINE TABLE DISPERSION */
+             vfconv.i[0]       += 4;
+             vfconv.i[1]       += 4;
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 2 );
+             H                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+             FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+             fvdw6            = _fjsp_mul_v2r8(c6_00,FF);
+             /* CUBIC SPLINE TABLE REPULSION */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 4 );
+             F                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 6 );
+             H                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+             FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+             fvdw12           = _fjsp_mul_v2r8(c12_00,FF);
+             fvdw             = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_add_v2r8(fvdw6,fvdw12),_fjsp_mul_v2r8(vftabscale,rinv00)));
+             fscal            = _fjsp_add_v2r8(felec,fvdw);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());  /* presumably keeps entry A and zeroes the unused second position - TODO confirm intrinsic semantics */
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             
+             fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              * i atom 1 with the j particle: tabulated electrostatics only
+              **************************/
+             r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+             /* Compute parameters for interactions between i and j atoms */
+             qq10             = _fjsp_mul_v2r8(iq1,jq0);
+             /* Calculate table index by multiplying r with table scale and truncate to integer */
+             rt               = _fjsp_mul_v2r8(r10,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 12;
+             vfconv.i[1]     *= 12;
+             /* CUBIC SPLINE TABLE ELECTROSTATICS */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+             H                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+             FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+             felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,FF),_fjsp_mul_v2r8(vftabscale,rinv10)));
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+             
+             fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              * i atom 2 with the j particle: tabulated electrostatics only
+              **************************/
+             r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+             /* Compute parameters for interactions between i and j atoms */
+             qq20             = _fjsp_mul_v2r8(iq2,jq0);
+             /* Calculate table index by multiplying r with table scale and truncate to integer */
+             rt               = _fjsp_mul_v2r8(r20,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 12;
+             vfconv.i[1]     *= 12;
+             /* CUBIC SPLINE TABLE ELECTROSTATICS */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+             H                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+             FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+             felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,FF),_fjsp_mul_v2r8(vftabscale,rinv20)));
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force; afterwards the summed j forces go to the single-pointer
+              * variant of the decrement helper for entry A.
+              */
+             fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+             
+             fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+             gmx_fjsp_decrement_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0);
+             /* Inner loop uses 151 flops */
+         }
+         /* End of innermost loop.
+          * The nine i force accumulators are passed to the update helper together with the
+          * force location of the i atoms and the shift-force location for this list entry.
+          */
+         gmx_fjsp_update_iforce_3atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                               f+i_coord_offset,fshift+i_shift_offset);
+         /* Increment number of inner iterations */
+         inneriter                  += j_index_end - j_index_start;  /* counts j entries, not unrolled loop passes */
+         /* Outer loop uses 18 flops */
+     }
+     /* Increment number of outer iterations */
+     outeriter        += nri;
+     /* Update outer/inner flops */
+     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3_F,outeriter*18 + inneriter*151);
+ }
index 0000000000000000000000000000000000000000,db19a1750e29e44a6c27fc386e37a267d0bcc5c5..db19a1750e29e44a6c27fc386e37a267d0bcc5c5
mode 000000,100644..100644
--- /dev/null
@@@ -1,0 -1,2311 +1,2311 @@@
+ /*
+  * This file is part of the GROMACS molecular simulation package.
+  *
+  * Copyright (c) 2012, by the GROMACS development team, led by
+  * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+  * others, as listed in the AUTHORS file in the top-level source
+  * directory and at http://www.gromacs.org.
+  *
+  * GROMACS is free software; you can redistribute it and/or
+  * modify it under the terms of the GNU Lesser General Public License
+  * as published by the Free Software Foundation; either version 2.1
+  * of the License, or (at your option) any later version.
+  *
+  * GROMACS is distributed in the hope that it will be useful,
+  * but WITHOUT ANY WARRANTY; without even the implied warranty of
+  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  * Lesser General Public License for more details.
+  *
+  * You should have received a copy of the GNU Lesser General Public
+  * License along with GROMACS; if not, see
+  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+  *
+  * If you want to redistribute modifications to GROMACS, please
+  * consider that scientific software is very special. Version
+  * control is crucial - bugs must be traceable. We will be happy to
+  * consider code for inclusion in the official distribution, but
+  * derived work must not be called official GROMACS. Details are found
+  * in the README & COPYING files - if they are missing, get the
+  * official version at http://www.gromacs.org.
+  *
+  * To help us fund GROMACS development, we humbly ask that you cite
+  * the research papers on the package. Check out http://www.gromacs.org.
+  */
+ /*
+  * Note: this file was generated by the GROMACS sparc64_hpc_ace_double kernel generator.
+  */
+ #ifdef HAVE_CONFIG_H
+ #include <config.h>
+ #endif
+ #include <math.h>
+ #include "../nb_kernel.h"
+ #include "types/simple.h"
+ #include "vec.h"
+ #include "nrnb.h"
+ #include "kernelutil_sparc64_hpc_ace_double.h"
+ /*
+  * Gromacs nonbonded kernel:   nb_kernel_ElecCSTab_VdwCSTab_GeomW3W3_VF_sparc64_hpc_ace_double
+  * Electrostatics interaction: CubicSplineTable
+  * VdW interaction:            CubicSplineTable
+  * Geometry:                   Water3-Water3
+  * Calculate force/pot:        PotentialAndForce
+  */
+ void
+ nb_kernel_ElecCSTab_VdwCSTab_GeomW3W3_VF_sparc64_hpc_ace_double
+                     (t_nblist * gmx_restrict                nlist,
+                      rvec * gmx_restrict                    xx,
+                      rvec * gmx_restrict                    ff,
+                      t_forcerec * gmx_restrict              fr,
+                      t_mdatoms * gmx_restrict               mdatoms,
+                      nb_kernel_data_t * gmx_restrict        kernel_data,
+                      t_nrnb * gmx_restrict                  nrnb)
+ {
+     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+      * just 0 for non-waters.
+      * Suffixes A,B refer to j loop unrolling done with double precision SIMD, i.e. the two different
+      * jnr indices corresponding to data put in the two positions in the SIMD register.
+      */
+     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+     int              jnrA,jnrB;
+     int              j_coord_offsetA,j_coord_offsetB;
+     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+     real             rcutoff_scalar;
+     real             *shiftvec,*fshift,*x,*f;
+     _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+     int              vdwioffset0;
+     _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+     int              vdwioffset1;
+     _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+     int              vdwioffset2;
+     _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+     int              vdwjidx0A,vdwjidx0B;
+     _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+     int              vdwjidx1A,vdwjidx1B;
+     _fjsp_v2r8       jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+     int              vdwjidx2A,vdwjidx2B;
+     _fjsp_v2r8       jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+     _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+     _fjsp_v2r8       dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
+     _fjsp_v2r8       dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
+     _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+     _fjsp_v2r8       dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+     _fjsp_v2r8       dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+     _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+     _fjsp_v2r8       dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+     _fjsp_v2r8       dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+     _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+     real             *charge;
+     int              nvdwtype;
+     _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+     int              *vdwtype;
+     real             *vdwparam;
+     _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+     _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+     _fjsp_v2r8       rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF,twovfeps;
+     real             *vftab;
+     _fjsp_v2r8       itab_tmp;
+     _fjsp_v2r8       dummy_mask,cutoff_mask;
+     _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+     _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+     union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+     x                = xx[0];
+     f                = ff[0];
+     nri              = nlist->nri;
+     iinr             = nlist->iinr;
+     jindex           = nlist->jindex;
+     jjnr             = nlist->jjnr;
+     shiftidx         = nlist->shift;
+     gid              = nlist->gid;
+     shiftvec         = fr->shift_vec[0];
+     fshift           = fr->fshift[0];
+     facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+     charge           = mdatoms->chargeA;
+     nvdwtype         = fr->ntype;
+     vdwparam         = fr->nbfp;
+     vdwtype          = mdatoms->typeA;
+     vftab            = kernel_data->table_elec_vdw->data;
+     vftabscale       = gmx_fjsp_set1_v2r8(kernel_data->table_elec_vdw->scale);
+     /* Setup water-specific parameters */
+     inr              = nlist->iinr[0];
+     iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+0]));
+     iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+     iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+     vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+     jq0              = gmx_fjsp_set1_v2r8(charge[inr+0]);
+     jq1              = gmx_fjsp_set1_v2r8(charge[inr+1]);
+     jq2              = gmx_fjsp_set1_v2r8(charge[inr+2]);
+     vdwjidx0A        = 2*vdwtype[inr+0];
+     qq00             = _fjsp_mul_v2r8(iq0,jq0);
+     c6_00            = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset0+vdwjidx0A]);
+     c12_00           = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset0+vdwjidx0A+1]);
+     qq01             = _fjsp_mul_v2r8(iq0,jq1);
+     qq02             = _fjsp_mul_v2r8(iq0,jq2);
+     qq10             = _fjsp_mul_v2r8(iq1,jq0);
+     qq11             = _fjsp_mul_v2r8(iq1,jq1);
+     qq12             = _fjsp_mul_v2r8(iq1,jq2);
+     qq20             = _fjsp_mul_v2r8(iq2,jq0);
+     qq21             = _fjsp_mul_v2r8(iq2,jq1);
+     qq22             = _fjsp_mul_v2r8(iq2,jq2);
+     /* Avoid stupid compiler warnings */
+     jnrA = jnrB = 0;
+     j_coord_offsetA = 0;
+     j_coord_offsetB = 0;
+     outeriter        = 0;
+     inneriter        = 0;
+     /* Start outer loop over neighborlists */
+     for(iidx=0; iidx<nri; iidx++)
+     {
+         /* Load shift vector for this list */
+         i_shift_offset   = DIM*shiftidx[iidx];
+         /* Load limits for loop over neighbors */
+         j_index_start    = jindex[iidx];
+         j_index_end      = jindex[iidx+1];
+         /* Get outer coordinate index */
+         inr              = iinr[iidx];
+         i_coord_offset   = DIM*inr;
+         /* Load i particle coords and add shift vector */
+         gmx_fjsp_load_shift_and_3rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                  &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+         fix0             = _fjsp_setzero_v2r8();
+         fiy0             = _fjsp_setzero_v2r8();
+         fiz0             = _fjsp_setzero_v2r8();
+         fix1             = _fjsp_setzero_v2r8();
+         fiy1             = _fjsp_setzero_v2r8();
+         fiz1             = _fjsp_setzero_v2r8();
+         fix2             = _fjsp_setzero_v2r8();
+         fiy2             = _fjsp_setzero_v2r8();
+         fiz2             = _fjsp_setzero_v2r8();
+         /* Reset potential sums */
+         velecsum         = _fjsp_setzero_v2r8();
+         vvdwsum          = _fjsp_setzero_v2r8();
+         /* Start inner kernel loop */
+         for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+         {
+             /* Get j neighbor index, and coordinate index */
+             jnrA             = jjnr[jidx];
+             jnrB             = jjnr[jidx+1];
+             j_coord_offsetA  = DIM*jnrA;
+             j_coord_offsetB  = DIM*jnrB;
+             /* load j atom coordinates */
+             gmx_fjsp_load_3rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+             /* Calculate displacement vector */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             dx01             = _fjsp_sub_v2r8(ix0,jx1);
+             dy01             = _fjsp_sub_v2r8(iy0,jy1);
+             dz01             = _fjsp_sub_v2r8(iz0,jz1);
+             dx02             = _fjsp_sub_v2r8(ix0,jx2);
+             dy02             = _fjsp_sub_v2r8(iy0,jy2);
+             dz02             = _fjsp_sub_v2r8(iz0,jz2);
+             dx10             = _fjsp_sub_v2r8(ix1,jx0);
+             dy10             = _fjsp_sub_v2r8(iy1,jy0);
+             dz10             = _fjsp_sub_v2r8(iz1,jz0);
+             dx11             = _fjsp_sub_v2r8(ix1,jx1);
+             dy11             = _fjsp_sub_v2r8(iy1,jy1);
+             dz11             = _fjsp_sub_v2r8(iz1,jz1);
+             dx12             = _fjsp_sub_v2r8(ix1,jx2);
+             dy12             = _fjsp_sub_v2r8(iy1,jy2);
+             dz12             = _fjsp_sub_v2r8(iz1,jz2);
+             dx20             = _fjsp_sub_v2r8(ix2,jx0);
+             dy20             = _fjsp_sub_v2r8(iy2,jy0);
+             dz20             = _fjsp_sub_v2r8(iz2,jz0);
+             dx21             = _fjsp_sub_v2r8(ix2,jx1);
+             dy21             = _fjsp_sub_v2r8(iy2,jy1);
+             dz21             = _fjsp_sub_v2r8(iz2,jz1);
+             dx22             = _fjsp_sub_v2r8(ix2,jx2);
+             dy22             = _fjsp_sub_v2r8(iy2,jy2);
+             dz22             = _fjsp_sub_v2r8(iz2,jz2);
+             /* Calculate squared distance and things based on it */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rsq01            = gmx_fjsp_calc_rsq_v2r8(dx01,dy01,dz01);
+             rsq02            = gmx_fjsp_calc_rsq_v2r8(dx02,dy02,dz02);
+             rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+             rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+             rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+             rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+             rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+             rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+             rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+             rinv01           = gmx_fjsp_invsqrt_v2r8(rsq01);
+             rinv02           = gmx_fjsp_invsqrt_v2r8(rsq02);
+             rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+             rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+             rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+             rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+             rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+             rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+             fjx0             = _fjsp_setzero_v2r8();
+             fjy0             = _fjsp_setzero_v2r8();
+             fjz0             = _fjsp_setzero_v2r8();
+             fjx1             = _fjsp_setzero_v2r8();
+             fjy1             = _fjsp_setzero_v2r8();
+             fjz1             = _fjsp_setzero_v2r8();
+             fjx2             = _fjsp_setzero_v2r8();
+             fjy2             = _fjsp_setzero_v2r8();
+             fjz2             = _fjsp_setzero_v2r8();
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+             /* Calculate table index by multiplying r with table scale and truncate to integer */
+             rt               = _fjsp_mul_v2r8(r00,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 12;
+             vfconv.i[1]     *= 12;
+             /* CUBIC SPLINE TABLE ELECTROSTATICS */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+             H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+             VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+             velec            = _fjsp_mul_v2r8(qq00,VV);
+             FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+             felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,FF),_fjsp_mul_v2r8(vftabscale,rinv00)));
+             /* CUBIC SPLINE TABLE DISPERSION */
+             vfconv.i[0]       += 4;
+             vfconv.i[1]       += 4;
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 2 );
+             H                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 2 );
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+             VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+             vvdw6            = _fjsp_mul_v2r8(c6_00,VV);
+             FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+             fvdw6            = _fjsp_mul_v2r8(c6_00,FF);
+             /* CUBIC SPLINE TABLE REPULSION */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 4 );
+             F                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 4 );
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 6 );
+             H                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 6 );
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+             VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+             vvdw12           = _fjsp_mul_v2r8(c12_00,VV);
+             FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+             fvdw12           = _fjsp_mul_v2r8(c12_00,FF);
+             vvdw             = _fjsp_add_v2r8(vvdw12,vvdw6);
+             fvdw             = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_add_v2r8(fvdw6,fvdw12),_fjsp_mul_v2r8(vftabscale,rinv00)));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+             fscal            = _fjsp_add_v2r8(felec,fvdw);
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             
+             fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r01              = _fjsp_mul_v2r8(rsq01,rinv01);
+             /* Calculate table index by multiplying r with table scale and truncate to integer */
+             rt               = _fjsp_mul_v2r8(r01,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 12;
+             vfconv.i[1]     *= 12;
+             /* CUBIC SPLINE TABLE ELECTROSTATICS */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+             H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+             VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+             velec            = _fjsp_mul_v2r8(qq01,VV);
+             FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+             felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq01,FF),_fjsp_mul_v2r8(vftabscale,rinv01)));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx01,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy01,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz01,fscal,fiz0);
+             
+             fjx1             = _fjsp_madd_v2r8(dx01,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy01,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz01,fscal,fjz1);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r02              = _fjsp_mul_v2r8(rsq02,rinv02);
+             /* Calculate table index by multiplying r with table scale and truncate to integer */
+             rt               = _fjsp_mul_v2r8(r02,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 12;
+             vfconv.i[1]     *= 12;
+             /* CUBIC SPLINE TABLE ELECTROSTATICS */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+             H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+             VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+             velec            = _fjsp_mul_v2r8(qq02,VV);
+             FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+             felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq02,FF),_fjsp_mul_v2r8(vftabscale,rinv02)));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx02,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy02,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz02,fscal,fiz0);
+             
+             fjx2             = _fjsp_madd_v2r8(dx02,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy02,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz02,fscal,fjz2);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+             /* Calculate table index by multiplying r with table scale and truncate to integer */
+             rt               = _fjsp_mul_v2r8(r10,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 12;
+             vfconv.i[1]     *= 12;
+             /* CUBIC SPLINE TABLE ELECTROSTATICS */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+             H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+             VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+             velec            = _fjsp_mul_v2r8(qq10,VV);
+             FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+             felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,FF),_fjsp_mul_v2r8(vftabscale,rinv10)));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+             
+             fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r11              = _fjsp_mul_v2r8(rsq11,rinv11);
+             /* Calculate table index by multiplying r with table scale and truncate to integer */
+             rt               = _fjsp_mul_v2r8(r11,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 12;
+             vfconv.i[1]     *= 12;
+             /* CUBIC SPLINE TABLE ELECTROSTATICS */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+             H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+             VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+             velec            = _fjsp_mul_v2r8(qq11,VV);
+             FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+             felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq11,FF),_fjsp_mul_v2r8(vftabscale,rinv11)));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+             
+             fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r12              = _fjsp_mul_v2r8(rsq12,rinv12);
+             /* Calculate table index by multiplying r with table scale and truncate to integer */
+             rt               = _fjsp_mul_v2r8(r12,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 12;
+             vfconv.i[1]     *= 12;
+             /* CUBIC SPLINE TABLE ELECTROSTATICS */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+             H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+             VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+             velec            = _fjsp_mul_v2r8(qq12,VV);
+             FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+             felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq12,FF),_fjsp_mul_v2r8(vftabscale,rinv12)));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+             
+             fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+             /* Calculate table index by multiplying r with table scale and truncate to integer */
+             rt               = _fjsp_mul_v2r8(r20,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 12;
+             vfconv.i[1]     *= 12;
+             /* CUBIC SPLINE TABLE ELECTROSTATICS */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+             H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+             VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+             velec            = _fjsp_mul_v2r8(qq20,VV);
+             FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+             felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,FF),_fjsp_mul_v2r8(vftabscale,rinv20)));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+             
+             fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r21              = _fjsp_mul_v2r8(rsq21,rinv21);
+             /* Calculate table index by multiplying r with table scale and truncate to integer */
+             rt               = _fjsp_mul_v2r8(r21,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 12;
+             vfconv.i[1]     *= 12;
+             /* CUBIC SPLINE TABLE ELECTROSTATICS */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+             H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+             VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+             velec            = _fjsp_mul_v2r8(qq21,VV);
+             FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+             felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq21,FF),_fjsp_mul_v2r8(vftabscale,rinv21)));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+             
+             fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r22              = _fjsp_mul_v2r8(rsq22,rinv22);
+             /* Calculate table index by multiplying r with table scale and truncate to integer */
+             rt               = _fjsp_mul_v2r8(r22,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 12;
+             vfconv.i[1]     *= 12;
+             /* CUBIC SPLINE TABLE ELECTROSTATICS */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+             H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+             VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+             velec            = _fjsp_mul_v2r8(qq22,VV);
+             FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+             felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq22,FF),_fjsp_mul_v2r8(vftabscale,rinv22)));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+             
+             fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+             gmx_fjsp_decrement_3rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+             /* Inner loop uses 444 flops */
+         }
+         if(jidx<j_index_end)
+         {
+             jnrA             = jjnr[jidx];
+             j_coord_offsetA  = DIM*jnrA;
+             /* load j atom coordinates */
+             gmx_fjsp_load_3rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+             /* Calculate displacement vector */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             dx01             = _fjsp_sub_v2r8(ix0,jx1);
+             dy01             = _fjsp_sub_v2r8(iy0,jy1);
+             dz01             = _fjsp_sub_v2r8(iz0,jz1);
+             dx02             = _fjsp_sub_v2r8(ix0,jx2);
+             dy02             = _fjsp_sub_v2r8(iy0,jy2);
+             dz02             = _fjsp_sub_v2r8(iz0,jz2);
+             dx10             = _fjsp_sub_v2r8(ix1,jx0);
+             dy10             = _fjsp_sub_v2r8(iy1,jy0);
+             dz10             = _fjsp_sub_v2r8(iz1,jz0);
+             dx11             = _fjsp_sub_v2r8(ix1,jx1);
+             dy11             = _fjsp_sub_v2r8(iy1,jy1);
+             dz11             = _fjsp_sub_v2r8(iz1,jz1);
+             dx12             = _fjsp_sub_v2r8(ix1,jx2);
+             dy12             = _fjsp_sub_v2r8(iy1,jy2);
+             dz12             = _fjsp_sub_v2r8(iz1,jz2);
+             dx20             = _fjsp_sub_v2r8(ix2,jx0);
+             dy20             = _fjsp_sub_v2r8(iy2,jy0);
+             dz20             = _fjsp_sub_v2r8(iz2,jz0);
+             dx21             = _fjsp_sub_v2r8(ix2,jx1);
+             dy21             = _fjsp_sub_v2r8(iy2,jy1);
+             dz21             = _fjsp_sub_v2r8(iz2,jz1);
+             dx22             = _fjsp_sub_v2r8(ix2,jx2);
+             dy22             = _fjsp_sub_v2r8(iy2,jy2);
+             dz22             = _fjsp_sub_v2r8(iz2,jz2);
+             /* Calculate squared distance and things based on it */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rsq01            = gmx_fjsp_calc_rsq_v2r8(dx01,dy01,dz01);
+             rsq02            = gmx_fjsp_calc_rsq_v2r8(dx02,dy02,dz02);
+             rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+             rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+             rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+             rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+             rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+             rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+             rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+             rinv01           = gmx_fjsp_invsqrt_v2r8(rsq01);
+             rinv02           = gmx_fjsp_invsqrt_v2r8(rsq02);
+             rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+             rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+             rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+             rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+             rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+             rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+             fjx0             = _fjsp_setzero_v2r8();
+             fjy0             = _fjsp_setzero_v2r8();
+             fjz0             = _fjsp_setzero_v2r8();
+             fjx1             = _fjsp_setzero_v2r8();
+             fjy1             = _fjsp_setzero_v2r8();
+             fjz1             = _fjsp_setzero_v2r8();
+             fjx2             = _fjsp_setzero_v2r8();
+             fjy2             = _fjsp_setzero_v2r8();
+             fjz2             = _fjsp_setzero_v2r8();
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+             /* Calculate table index by multiplying r with table scale and truncate to integer */
+             rt               = _fjsp_mul_v2r8(r00,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 12;
+             vfconv.i[1]     *= 12;
+             /* CUBIC SPLINE TABLE ELECTROSTATICS */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+             H                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+             VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+             velec            = _fjsp_mul_v2r8(qq00,VV);
+             FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+             felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,FF),_fjsp_mul_v2r8(vftabscale,rinv00)));
+             /* CUBIC SPLINE TABLE DISPERSION */
+             vfconv.i[0]       += 4;
+             vfconv.i[1]       += 4;
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 2 );
+             H                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+             VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+             vvdw6            = _fjsp_mul_v2r8(c6_00,VV);
+             FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+             fvdw6            = _fjsp_mul_v2r8(c6_00,FF);
+             /* CUBIC SPLINE TABLE REPULSION */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 4 );
+             F                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 6 );
+             H                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+             VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+             vvdw12           = _fjsp_mul_v2r8(c12_00,VV);
+             FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+             fvdw12           = _fjsp_mul_v2r8(c12_00,FF);
+             vvdw             = _fjsp_add_v2r8(vvdw12,vvdw6);
+             fvdw             = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_add_v2r8(fvdw6,fvdw12),_fjsp_mul_v2r8(vftabscale,rinv00)));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             vvdw             = _fjsp_unpacklo_v2r8(vvdw,_fjsp_setzero_v2r8());
+             vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+             fscal            = _fjsp_add_v2r8(felec,fvdw);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             
+             fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r01              = _fjsp_mul_v2r8(rsq01,rinv01);
+             /* Calculate table index by multiplying r with table scale and truncate to integer */
+             rt               = _fjsp_mul_v2r8(r01,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 12;
+             vfconv.i[1]     *= 12;
+             /* CUBIC SPLINE TABLE ELECTROSTATICS */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+             H                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+             VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+             velec            = _fjsp_mul_v2r8(qq01,VV);
+             FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+             felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq01,FF),_fjsp_mul_v2r8(vftabscale,rinv01)));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx01,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy01,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz01,fscal,fiz0);
+             
+             fjx1             = _fjsp_madd_v2r8(dx01,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy01,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz01,fscal,fjz1);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r02              = _fjsp_mul_v2r8(rsq02,rinv02);
+             /* Calculate table index by multiplying r with table scale and truncate to integer */
+             rt               = _fjsp_mul_v2r8(r02,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 12;
+             vfconv.i[1]     *= 12;
+             /* CUBIC SPLINE TABLE ELECTROSTATICS */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+             H                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+             VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+             velec            = _fjsp_mul_v2r8(qq02,VV);
+             FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+             felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq02,FF),_fjsp_mul_v2r8(vftabscale,rinv02)));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx02,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy02,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz02,fscal,fiz0);
+             
+             fjx2             = _fjsp_madd_v2r8(dx02,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy02,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz02,fscal,fjz2);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+             /* Calculate table index by multiplying r with table scale and truncate to integer */
+             rt               = _fjsp_mul_v2r8(r10,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 12;
+             vfconv.i[1]     *= 12;
+             /* CUBIC SPLINE TABLE ELECTROSTATICS */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+             H                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+             VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+             velec            = _fjsp_mul_v2r8(qq10,VV);
+             FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+             felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,FF),_fjsp_mul_v2r8(vftabscale,rinv10)));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+             
+             fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r11              = _fjsp_mul_v2r8(rsq11,rinv11);
+             /* Calculate table index by multiplying r with table scale and truncate to integer */
+             rt               = _fjsp_mul_v2r8(r11,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 12;
+             vfconv.i[1]     *= 12;
+             /* CUBIC SPLINE TABLE ELECTROSTATICS */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+             H                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+             VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+             velec            = _fjsp_mul_v2r8(qq11,VV);
+             FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+             felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq11,FF),_fjsp_mul_v2r8(vftabscale,rinv11)));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+             
+             fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r12              = _fjsp_mul_v2r8(rsq12,rinv12);
+             /* Calculate table index by multiplying r with table scale and truncate to integer */
+             rt               = _fjsp_mul_v2r8(r12,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 12;
+             vfconv.i[1]     *= 12;
+             /* CUBIC SPLINE TABLE ELECTROSTATICS */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+             H                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+             VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+             velec            = _fjsp_mul_v2r8(qq12,VV);
+             FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+             felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq12,FF),_fjsp_mul_v2r8(vftabscale,rinv12)));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+             
+             fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+             /* Calculate table index by multiplying r with table scale and truncate to integer */
+             rt               = _fjsp_mul_v2r8(r20,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 12;
+             vfconv.i[1]     *= 12;
+             /* CUBIC SPLINE TABLE ELECTROSTATICS */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+             H                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+             VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+             velec            = _fjsp_mul_v2r8(qq20,VV);
+             FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+             felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,FF),_fjsp_mul_v2r8(vftabscale,rinv20)));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+             
+             fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r21              = _fjsp_mul_v2r8(rsq21,rinv21);
+             /* Calculate table index by multiplying r with table scale and truncate to integer */
+             rt               = _fjsp_mul_v2r8(r21,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 12;
+             vfconv.i[1]     *= 12;
+             /* CUBIC SPLINE TABLE ELECTROSTATICS */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+             H                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+             VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+             velec            = _fjsp_mul_v2r8(qq21,VV);
+             FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+             felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq21,FF),_fjsp_mul_v2r8(vftabscale,rinv21)));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+             
+             fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r22              = _fjsp_mul_v2r8(rsq22,rinv22);
+             /* Calculate table index by multiplying r with table scale and truncate to integer */
+             rt               = _fjsp_mul_v2r8(r22,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 12;
+             vfconv.i[1]     *= 12;
+             /* CUBIC SPLINE TABLE ELECTROSTATICS */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+             H                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+             VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+             velec            = _fjsp_mul_v2r8(qq22,VV);
+             FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+             felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq22,FF),_fjsp_mul_v2r8(vftabscale,rinv22)));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+             
+             fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+             gmx_fjsp_decrement_3rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+             /* Inner loop uses 444 flops */
+         }
+         /* End of innermost loop */
+         gmx_fjsp_update_iforce_3atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                               f+i_coord_offset,fshift+i_shift_offset);
+         ggid                        = gid[iidx];
+         /* Update potential energies */
+         gmx_fjsp_update_1pot_v2r8(velecsum,kernel_data->energygrp_elec+ggid);
+         gmx_fjsp_update_1pot_v2r8(vvdwsum,kernel_data->energygrp_vdw+ggid);
+         /* Increment number of inner iterations */
+         inneriter                  += j_index_end - j_index_start;
+         /* Outer loop uses 20 flops */
+     }
+     /* Increment number of outer iterations */
+     outeriter        += nri;
+     /* Update outer/inner flops */
+     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_VF,outeriter*20 + inneriter*444);
+ }
+ /*
+  * Gromacs nonbonded kernel:   nb_kernel_ElecCSTab_VdwCSTab_GeomW3W3_F_sparc64_hpc_ace_double
+  * Electrostatics interaction: CubicSplineTable
+  * VdW interaction:            CubicSplineTable
+  * Geometry:                   Water3-Water3
+  * Calculate force/pot:        Force
+  */
+ void
+ nb_kernel_ElecCSTab_VdwCSTab_GeomW3W3_F_sparc64_hpc_ace_double
+                     (t_nblist * gmx_restrict                nlist,
+                      rvec * gmx_restrict                    xx,
+                      rvec * gmx_restrict                    ff,
+                      t_forcerec * gmx_restrict              fr,
+                      t_mdatoms * gmx_restrict               mdatoms,
+                      nb_kernel_data_t * gmx_restrict        kernel_data,
+                      t_nrnb * gmx_restrict                  nrnb)
+ {
+     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+      * just 0 for non-waters.
+      * Suffixes A,B refer to j loop unrolling done with double precision SIMD, i.e. the two different
+      * jnr indices whose data are placed in the two positions of the SIMD register.
+      */
+     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+     int              jnrA,jnrB;
+     int              j_coord_offsetA,j_coord_offsetB;
+     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+     real             rcutoff_scalar;
+     real             *shiftvec,*fshift,*x,*f;
+     _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+     int              vdwioffset0;
+     _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+     int              vdwioffset1;
+     _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+     int              vdwioffset2;
+     _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+     int              vdwjidx0A,vdwjidx0B;
+     _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+     int              vdwjidx1A,vdwjidx1B;
+     _fjsp_v2r8       jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+     int              vdwjidx2A,vdwjidx2B;
+     _fjsp_v2r8       jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+     _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+     _fjsp_v2r8       dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
+     _fjsp_v2r8       dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
+     _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+     _fjsp_v2r8       dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+     _fjsp_v2r8       dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+     _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+     _fjsp_v2r8       dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+     _fjsp_v2r8       dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+     _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+     real             *charge;
+     int              nvdwtype;
+     _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+     int              *vdwtype;
+     real             *vdwparam;
+     _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+     _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+     _fjsp_v2r8       rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF,twovfeps;
+     real             *vftab;
+     _fjsp_v2r8       itab_tmp;
+     _fjsp_v2r8       dummy_mask,cutoff_mask;
+     _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+     _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+     union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+     x                = xx[0];
+     f                = ff[0];
+     nri              = nlist->nri;
+     iinr             = nlist->iinr;
+     jindex           = nlist->jindex;
+     jjnr             = nlist->jjnr;
+     shiftidx         = nlist->shift;
+     gid              = nlist->gid;
+     shiftvec         = fr->shift_vec[0];
+     fshift           = fr->fshift[0];
+     facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+     charge           = mdatoms->chargeA;
+     nvdwtype         = fr->ntype;
+     vdwparam         = fr->nbfp;
+     vdwtype          = mdatoms->typeA;
+     vftab            = kernel_data->table_elec_vdw->data;
+     vftabscale       = gmx_fjsp_set1_v2r8(kernel_data->table_elec_vdw->scale);
+     /* Setup water-specific parameters */
+     inr              = nlist->iinr[0];
+     iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+0]));
+     iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+     iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+     vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+     jq0              = gmx_fjsp_set1_v2r8(charge[inr+0]);
+     jq1              = gmx_fjsp_set1_v2r8(charge[inr+1]);
+     jq2              = gmx_fjsp_set1_v2r8(charge[inr+2]);
+     vdwjidx0A        = 2*vdwtype[inr+0];
+     qq00             = _fjsp_mul_v2r8(iq0,jq0);
+     c6_00            = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset0+vdwjidx0A]);
+     c12_00           = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset0+vdwjidx0A+1]);
+     qq01             = _fjsp_mul_v2r8(iq0,jq1);
+     qq02             = _fjsp_mul_v2r8(iq0,jq2);
+     qq10             = _fjsp_mul_v2r8(iq1,jq0);
+     qq11             = _fjsp_mul_v2r8(iq1,jq1);
+     qq12             = _fjsp_mul_v2r8(iq1,jq2);
+     qq20             = _fjsp_mul_v2r8(iq2,jq0);
+     qq21             = _fjsp_mul_v2r8(iq2,jq1);
+     qq22             = _fjsp_mul_v2r8(iq2,jq2);
+     /* Avoid stupid compiler warnings */
+     jnrA = jnrB = 0;
+     j_coord_offsetA = 0;
+     j_coord_offsetB = 0;
+     outeriter        = 0;
+     inneriter        = 0;
+     /* Start outer loop over neighborlists */
+     for(iidx=0; iidx<nri; iidx++)
+     {
+         /* Load shift vector for this list */
+         i_shift_offset   = DIM*shiftidx[iidx];
+         /* Load limits for loop over neighbors */
+         j_index_start    = jindex[iidx];
+         j_index_end      = jindex[iidx+1];
+         /* Get outer coordinate index */
+         inr              = iinr[iidx];
+         i_coord_offset   = DIM*inr;
+         /* Load i particle coords and add shift vector */
+         gmx_fjsp_load_shift_and_3rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                  &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+         fix0             = _fjsp_setzero_v2r8();
+         fiy0             = _fjsp_setzero_v2r8();
+         fiz0             = _fjsp_setzero_v2r8();
+         fix1             = _fjsp_setzero_v2r8();
+         fiy1             = _fjsp_setzero_v2r8();
+         fiz1             = _fjsp_setzero_v2r8();
+         fix2             = _fjsp_setzero_v2r8();
+         fiy2             = _fjsp_setzero_v2r8();
+         fiz2             = _fjsp_setzero_v2r8();
+         /* Start inner kernel loop */
+         for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+         {
+             /* Get j neighbor index, and coordinate index */
+             jnrA             = jjnr[jidx];
+             jnrB             = jjnr[jidx+1];
+             j_coord_offsetA  = DIM*jnrA;
+             j_coord_offsetB  = DIM*jnrB;
+             /* load j atom coordinates */
+             gmx_fjsp_load_3rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+             /* Calculate displacement vector */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             dx01             = _fjsp_sub_v2r8(ix0,jx1);
+             dy01             = _fjsp_sub_v2r8(iy0,jy1);
+             dz01             = _fjsp_sub_v2r8(iz0,jz1);
+             dx02             = _fjsp_sub_v2r8(ix0,jx2);
+             dy02             = _fjsp_sub_v2r8(iy0,jy2);
+             dz02             = _fjsp_sub_v2r8(iz0,jz2);
+             dx10             = _fjsp_sub_v2r8(ix1,jx0);
+             dy10             = _fjsp_sub_v2r8(iy1,jy0);
+             dz10             = _fjsp_sub_v2r8(iz1,jz0);
+             dx11             = _fjsp_sub_v2r8(ix1,jx1);
+             dy11             = _fjsp_sub_v2r8(iy1,jy1);
+             dz11             = _fjsp_sub_v2r8(iz1,jz1);
+             dx12             = _fjsp_sub_v2r8(ix1,jx2);
+             dy12             = _fjsp_sub_v2r8(iy1,jy2);
+             dz12             = _fjsp_sub_v2r8(iz1,jz2);
+             dx20             = _fjsp_sub_v2r8(ix2,jx0);
+             dy20             = _fjsp_sub_v2r8(iy2,jy0);
+             dz20             = _fjsp_sub_v2r8(iz2,jz0);
+             dx21             = _fjsp_sub_v2r8(ix2,jx1);
+             dy21             = _fjsp_sub_v2r8(iy2,jy1);
+             dz21             = _fjsp_sub_v2r8(iz2,jz1);
+             dx22             = _fjsp_sub_v2r8(ix2,jx2);
+             dy22             = _fjsp_sub_v2r8(iy2,jy2);
+             dz22             = _fjsp_sub_v2r8(iz2,jz2);
+             /* Calculate squared distance and things based on it */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rsq01            = gmx_fjsp_calc_rsq_v2r8(dx01,dy01,dz01);
+             rsq02            = gmx_fjsp_calc_rsq_v2r8(dx02,dy02,dz02);
+             rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+             rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+             rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+             rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+             rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+             rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+             rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+             rinv01           = gmx_fjsp_invsqrt_v2r8(rsq01);
+             rinv02           = gmx_fjsp_invsqrt_v2r8(rsq02);
+             rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+             rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+             rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+             rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+             rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+             rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+             fjx0             = _fjsp_setzero_v2r8();
+             fjy0             = _fjsp_setzero_v2r8();
+             fjz0             = _fjsp_setzero_v2r8();
+             fjx1             = _fjsp_setzero_v2r8();
+             fjy1             = _fjsp_setzero_v2r8();
+             fjz1             = _fjsp_setzero_v2r8();
+             fjx2             = _fjsp_setzero_v2r8();
+             fjy2             = _fjsp_setzero_v2r8();
+             fjz2             = _fjsp_setzero_v2r8();
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+             /* Calculate table index by multiplying r with table scale and truncate to integer */
+             rt               = _fjsp_mul_v2r8(r00,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 12;
+             vfconv.i[1]     *= 12;
+             /* CUBIC SPLINE TABLE ELECTROSTATICS */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+             H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+             FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+             felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,FF),_fjsp_mul_v2r8(vftabscale,rinv00)));
+             /* CUBIC SPLINE TABLE DISPERSION */
+             vfconv.i[0]       += 4;
+             vfconv.i[1]       += 4;
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 2 );
+             H                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 2 );
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+             FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+             fvdw6            = _fjsp_mul_v2r8(c6_00,FF);
+             /* CUBIC SPLINE TABLE REPULSION */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 4 );
+             F                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 4 );
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 6 );
+             H                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 6 );
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+             FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+             fvdw12           = _fjsp_mul_v2r8(c12_00,FF);
+             fvdw             = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_add_v2r8(fvdw6,fvdw12),_fjsp_mul_v2r8(vftabscale,rinv00)));
+             fscal            = _fjsp_add_v2r8(felec,fvdw);
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             
+             fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r01              = _fjsp_mul_v2r8(rsq01,rinv01);
+             /* Calculate table index by multiplying r with table scale and truncate to integer */
+             rt               = _fjsp_mul_v2r8(r01,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 12;
+             vfconv.i[1]     *= 12;
+             /* CUBIC SPLINE TABLE ELECTROSTATICS */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+             H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+             FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+             felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq01,FF),_fjsp_mul_v2r8(vftabscale,rinv01)));
+             fscal            = felec;
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx01,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy01,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz01,fscal,fiz0);
+             
+             fjx1             = _fjsp_madd_v2r8(dx01,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy01,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz01,fscal,fjz1);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r02              = _fjsp_mul_v2r8(rsq02,rinv02);
+             /* Calculate table index by multiplying r with table scale and truncate to integer */
+             rt               = _fjsp_mul_v2r8(r02,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 12;
+             vfconv.i[1]     *= 12;
+             /* CUBIC SPLINE TABLE ELECTROSTATICS */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+             H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+             FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+             felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq02,FF),_fjsp_mul_v2r8(vftabscale,rinv02)));
+             fscal            = felec;
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx02,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy02,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz02,fscal,fiz0);
+             
+             fjx2             = _fjsp_madd_v2r8(dx02,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy02,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz02,fscal,fjz2);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+             /* Calculate table index by multiplying r with table scale and truncate to integer */
+             rt               = _fjsp_mul_v2r8(r10,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 12;
+             vfconv.i[1]     *= 12;
+             /* CUBIC SPLINE TABLE ELECTROSTATICS */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+             H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+             FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+             felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,FF),_fjsp_mul_v2r8(vftabscale,rinv10)));
+             fscal            = felec;
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+             
+             fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r11              = _fjsp_mul_v2r8(rsq11,rinv11);
+             /* Calculate table index by multiplying r with table scale and truncate to integer */
+             rt               = _fjsp_mul_v2r8(r11,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 12;
+             vfconv.i[1]     *= 12;
+             /* CUBIC SPLINE TABLE ELECTROSTATICS */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+             H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+             FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+             felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq11,FF),_fjsp_mul_v2r8(vftabscale,rinv11)));
+             fscal            = felec;
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+             
+             fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r12              = _fjsp_mul_v2r8(rsq12,rinv12);
+             /* Calculate table index by multiplying r with table scale and truncate to integer */
+             rt               = _fjsp_mul_v2r8(r12,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 12;
+             vfconv.i[1]     *= 12;
+             /* CUBIC SPLINE TABLE ELECTROSTATICS */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+             H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+             FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+             felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq12,FF),_fjsp_mul_v2r8(vftabscale,rinv12)));
+             fscal            = felec;
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+             
+             fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+             /* Calculate table index by multiplying r with table scale and truncate to integer */
+             rt               = _fjsp_mul_v2r8(r20,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 12;
+             vfconv.i[1]     *= 12;
+             /* CUBIC SPLINE TABLE ELECTROSTATICS */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+             H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+             FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+             felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,FF),_fjsp_mul_v2r8(vftabscale,rinv20)));
+             fscal            = felec;
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+             
+             fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r21              = _fjsp_mul_v2r8(rsq21,rinv21);
+             /* Calculate table index by multiplying r with table scale and truncate to integer */
+             rt               = _fjsp_mul_v2r8(r21,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 12;
+             vfconv.i[1]     *= 12;
+             /* CUBIC SPLINE TABLE ELECTROSTATICS */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+             H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+             FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+             felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq21,FF),_fjsp_mul_v2r8(vftabscale,rinv21)));
+             fscal            = felec;
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+             
+             fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r22              = _fjsp_mul_v2r8(rsq22,rinv22);
+             /* Calculate table index by multiplying r with table scale and truncate to integer */
+             rt               = _fjsp_mul_v2r8(r22,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 12;
+             vfconv.i[1]     *= 12;
+             /* CUBIC SPLINE TABLE ELECTROSTATICS */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+             H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+             FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+             felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq22,FF),_fjsp_mul_v2r8(vftabscale,rinv22)));
+             fscal            = felec;
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+             
+             fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+             gmx_fjsp_decrement_3rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+             /* Inner loop uses 400 flops */
+         }
+         if(jidx<j_index_end)
+         {
+             jnrA             = jjnr[jidx];
+             j_coord_offsetA  = DIM*jnrA;
+             /* load j atom coordinates */
+             gmx_fjsp_load_3rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+             /* Calculate displacement vector */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             dx01             = _fjsp_sub_v2r8(ix0,jx1);
+             dy01             = _fjsp_sub_v2r8(iy0,jy1);
+             dz01             = _fjsp_sub_v2r8(iz0,jz1);
+             dx02             = _fjsp_sub_v2r8(ix0,jx2);
+             dy02             = _fjsp_sub_v2r8(iy0,jy2);
+             dz02             = _fjsp_sub_v2r8(iz0,jz2);
+             dx10             = _fjsp_sub_v2r8(ix1,jx0);
+             dy10             = _fjsp_sub_v2r8(iy1,jy0);
+             dz10             = _fjsp_sub_v2r8(iz1,jz0);
+             dx11             = _fjsp_sub_v2r8(ix1,jx1);
+             dy11             = _fjsp_sub_v2r8(iy1,jy1);
+             dz11             = _fjsp_sub_v2r8(iz1,jz1);
+             dx12             = _fjsp_sub_v2r8(ix1,jx2);
+             dy12             = _fjsp_sub_v2r8(iy1,jy2);
+             dz12             = _fjsp_sub_v2r8(iz1,jz2);
+             dx20             = _fjsp_sub_v2r8(ix2,jx0);
+             dy20             = _fjsp_sub_v2r8(iy2,jy0);
+             dz20             = _fjsp_sub_v2r8(iz2,jz0);
+             dx21             = _fjsp_sub_v2r8(ix2,jx1);
+             dy21             = _fjsp_sub_v2r8(iy2,jy1);
+             dz21             = _fjsp_sub_v2r8(iz2,jz1);
+             dx22             = _fjsp_sub_v2r8(ix2,jx2);
+             dy22             = _fjsp_sub_v2r8(iy2,jy2);
+             dz22             = _fjsp_sub_v2r8(iz2,jz2);
+             /* Calculate squared distance and things based on it */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rsq01            = gmx_fjsp_calc_rsq_v2r8(dx01,dy01,dz01);
+             rsq02            = gmx_fjsp_calc_rsq_v2r8(dx02,dy02,dz02);
+             rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+             rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+             rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+             rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+             rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+             rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+             rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+             rinv01           = gmx_fjsp_invsqrt_v2r8(rsq01);
+             rinv02           = gmx_fjsp_invsqrt_v2r8(rsq02);
+             rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+             rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+             rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+             rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+             rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+             rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+             fjx0             = _fjsp_setzero_v2r8();
+             fjy0             = _fjsp_setzero_v2r8();
+             fjz0             = _fjsp_setzero_v2r8();
+             fjx1             = _fjsp_setzero_v2r8();
+             fjy1             = _fjsp_setzero_v2r8();
+             fjz1             = _fjsp_setzero_v2r8();
+             fjx2             = _fjsp_setzero_v2r8();
+             fjy2             = _fjsp_setzero_v2r8();
+             fjz2             = _fjsp_setzero_v2r8();
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+             /* Calculate table index by multiplying r with table scale and truncate to integer */
+             rt               = _fjsp_mul_v2r8(r00,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 12;
+             vfconv.i[1]     *= 12;
+             /* CUBIC SPLINE TABLE ELECTROSTATICS */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+             H                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+             FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+             felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,FF),_fjsp_mul_v2r8(vftabscale,rinv00)));
+             /* CUBIC SPLINE TABLE DISPERSION */
+             vfconv.i[0]       += 4;
+             vfconv.i[1]       += 4;
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 2 );
+             H                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+             FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+             fvdw6            = _fjsp_mul_v2r8(c6_00,FF);
+             /* CUBIC SPLINE TABLE REPULSION */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 4 );
+             F                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 6 );
+             H                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+             FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+             fvdw12           = _fjsp_mul_v2r8(c12_00,FF);
+             fvdw             = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_add_v2r8(fvdw6,fvdw12),_fjsp_mul_v2r8(vftabscale,rinv00)));
+             fscal            = _fjsp_add_v2r8(felec,fvdw);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             
+             fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r01              = _fjsp_mul_v2r8(rsq01,rinv01);
+             /* Calculate table index by multiplying r with table scale and truncate to integer */
+             rt               = _fjsp_mul_v2r8(r01,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 12;
+             vfconv.i[1]     *= 12;
+             /* CUBIC SPLINE TABLE ELECTROSTATICS */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+             H                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+             FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+             felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq01,FF),_fjsp_mul_v2r8(vftabscale,rinv01)));
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx01,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy01,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz01,fscal,fiz0);
+             
+             fjx1             = _fjsp_madd_v2r8(dx01,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy01,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz01,fscal,fjz1);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r02              = _fjsp_mul_v2r8(rsq02,rinv02);
+             /* Calculate table index by multiplying r with table scale and truncate to integer */
+             rt               = _fjsp_mul_v2r8(r02,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 12;
+             vfconv.i[1]     *= 12;
+             /* CUBIC SPLINE TABLE ELECTROSTATICS */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+             H                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+             FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+             felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq02,FF),_fjsp_mul_v2r8(vftabscale,rinv02)));
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx02,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy02,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz02,fscal,fiz0);
+             
+             fjx2             = _fjsp_madd_v2r8(dx02,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy02,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz02,fscal,fjz2);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+             /* Calculate table index by multiplying r with table scale and truncate to integer */
+             rt               = _fjsp_mul_v2r8(r10,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 12;
+             vfconv.i[1]     *= 12;
+             /* CUBIC SPLINE TABLE ELECTROSTATICS */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+             H                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+             FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+             felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,FF),_fjsp_mul_v2r8(vftabscale,rinv10)));
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+             
+             fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r11              = _fjsp_mul_v2r8(rsq11,rinv11);
+             /* Calculate table index by multiplying r with table scale and truncate to integer */
+             rt               = _fjsp_mul_v2r8(r11,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 12;
+             vfconv.i[1]     *= 12;
+             /* CUBIC SPLINE TABLE ELECTROSTATICS */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+             H                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+             FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+             felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq11,FF),_fjsp_mul_v2r8(vftabscale,rinv11)));
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+             
+             fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r12              = _fjsp_mul_v2r8(rsq12,rinv12);
+             /* Calculate table index by multiplying r with table scale and truncate to integer */
+             rt               = _fjsp_mul_v2r8(r12,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 12;
+             vfconv.i[1]     *= 12;
+             /* CUBIC SPLINE TABLE ELECTROSTATICS */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+             H                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+             FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+             felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq12,FF),_fjsp_mul_v2r8(vftabscale,rinv12)));
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+             
+             fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+             /* Calculate table index by multiplying r with table scale and truncate to integer */
+             rt               = _fjsp_mul_v2r8(r20,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 12;
+             vfconv.i[1]     *= 12;
+             /* CUBIC SPLINE TABLE ELECTROSTATICS */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+             H                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+             FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+             felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,FF),_fjsp_mul_v2r8(vftabscale,rinv20)));
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+             
+             fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r21              = _fjsp_mul_v2r8(rsq21,rinv21);
+             /* Calculate table index by multiplying r with table scale and truncate to integer */
+             rt               = _fjsp_mul_v2r8(r21,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 12;
+             vfconv.i[1]     *= 12;
+             /* CUBIC SPLINE TABLE ELECTROSTATICS */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+             H                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+             FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+             felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq21,FF),_fjsp_mul_v2r8(vftabscale,rinv21)));
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+             
+             fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r22              = _fjsp_mul_v2r8(rsq22,rinv22);
+             /* Calculate table index by multiplying r with table scale and truncate to integer */
+             rt               = _fjsp_mul_v2r8(r22,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 12;
+             vfconv.i[1]     *= 12;
+             /* CUBIC SPLINE TABLE ELECTROSTATICS */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+             H                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+             FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+             felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq22,FF),_fjsp_mul_v2r8(vftabscale,rinv22)));
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+             
+             fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+             gmx_fjsp_decrement_3rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+             /* Inner loop uses 400 flops */
+         }
+         /* End of innermost loop */
+         gmx_fjsp_update_iforce_3atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                               f+i_coord_offset,fshift+i_shift_offset);
+         /* Increment number of inner iterations */
+         inneriter                  += j_index_end - j_index_start;
+         /* Outer loop uses 18 flops */
+     }
+     /* Increment number of outer iterations */
+     outeriter        += nri;
+     /* Update outer/inner flops */
+     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_F,outeriter*18 + inneriter*400);
+ }
index 0000000000000000000000000000000000000000,70be5c835801a9523aaa9df8c98e9506437ebe97..70be5c835801a9523aaa9df8c98e9506437ebe97
mode 000000,100644..100644
--- /dev/null
@@@ -1,0 -1,1329 +1,1329 @@@
+ /*
+  * This file is part of the GROMACS molecular simulation package.
+  *
+  * Copyright (c) 2012, by the GROMACS development team, led by
+  * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+  * others, as listed in the AUTHORS file in the top-level source
+  * directory and at http://www.gromacs.org.
+  *
+  * GROMACS is free software; you can redistribute it and/or
+  * modify it under the terms of the GNU Lesser General Public License
+  * as published by the Free Software Foundation; either version 2.1
+  * of the License, or (at your option) any later version.
+  *
+  * GROMACS is distributed in the hope that it will be useful,
+  * but WITHOUT ANY WARRANTY; without even the implied warranty of
+  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  * Lesser General Public License for more details.
+  *
+  * You should have received a copy of the GNU Lesser General Public
+  * License along with GROMACS; if not, see
+  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+  *
+  * If you want to redistribute modifications to GROMACS, please
+  * consider that scientific software is very special. Version
+  * control is crucial - bugs must be traceable. We will be happy to
+  * consider code for inclusion in the official distribution, but
+  * derived work must not be called official GROMACS. Details are found
+  * in the README & COPYING files - if they are missing, get the
+  * official version at http://www.gromacs.org.
+  *
+  * To help us fund GROMACS development, we humbly ask that you cite
+  * the research papers on the package. Check out http://www.gromacs.org.
+  */
+ /*
+  * Note: this file was generated by the GROMACS sparc64_hpc_ace_double kernel generator.
+  */
+ #ifdef HAVE_CONFIG_H
+ #include <config.h>
+ #endif
+ #include <math.h>
+ #include "../nb_kernel.h"
+ #include "types/simple.h"
+ #include "vec.h"
+ #include "nrnb.h"
+ #include "kernelutil_sparc64_hpc_ace_double.h"
+ /*
+  * Gromacs nonbonded kernel:   nb_kernel_ElecCSTab_VdwCSTab_GeomW4P1_VF_sparc64_hpc_ace_double
+  * Electrostatics interaction: CubicSplineTable
+  * VdW interaction:            CubicSplineTable
+  * Geometry:                   Water4-Particle
+  * Calculate force/pot:        PotentialAndForce
+  */
+ void
+ nb_kernel_ElecCSTab_VdwCSTab_GeomW4P1_VF_sparc64_hpc_ace_double
+                     (t_nblist * gmx_restrict                nlist,
+                      rvec * gmx_restrict                    xx,
+                      rvec * gmx_restrict                    ff,
+                      t_forcerec * gmx_restrict              fr,
+                      t_mdatoms * gmx_restrict               mdatoms,
+                      nb_kernel_data_t * gmx_restrict        kernel_data,
+                      t_nrnb * gmx_restrict                  nrnb)
+ {
+     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+      * just 0 for non-waters.
+      * Suffixes A,B refer to j loop unrolling done with double precision SIMD, i.e. to the two different
+      * jnr indices corresponding to data put in the two positions in the SIMD register.
+      */
+     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+     int              jnrA,jnrB;
+     int              j_coord_offsetA,j_coord_offsetB;
+     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+     real             rcutoff_scalar;
+     real             *shiftvec,*fshift,*x,*f;
+     _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+     int              vdwioffset0;
+     _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+     int              vdwioffset1;
+     _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+     int              vdwioffset2;
+     _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+     int              vdwioffset3;
+     _fjsp_v2r8       ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+     int              vdwjidx0A,vdwjidx0B;
+     _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+     _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+     _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+     _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+     _fjsp_v2r8       dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30;
+     _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+     real             *charge;
+     int              nvdwtype;
+     _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+     int              *vdwtype;
+     real             *vdwparam;
+     _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+     _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+     _fjsp_v2r8       rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF,twovfeps;
+     real             *vftab;
+     _fjsp_v2r8       itab_tmp;
+     _fjsp_v2r8       dummy_mask,cutoff_mask;
+     _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+     _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+     union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+     x                = xx[0];
+     f                = ff[0];
+     nri              = nlist->nri;
+     iinr             = nlist->iinr;
+     jindex           = nlist->jindex;
+     jjnr             = nlist->jjnr;
+     shiftidx         = nlist->shift;
+     gid              = nlist->gid;
+     shiftvec         = fr->shift_vec[0];
+     fshift           = fr->fshift[0];
+     facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+     charge           = mdatoms->chargeA;
+     nvdwtype         = fr->ntype;
+     vdwparam         = fr->nbfp;
+     vdwtype          = mdatoms->typeA;
+     vftab            = kernel_data->table_elec_vdw->data;
+     vftabscale       = gmx_fjsp_set1_v2r8(kernel_data->table_elec_vdw->scale);
+     /* Setup water-specific parameters */
+     inr              = nlist->iinr[0];
+     iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+     iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+     iq3              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+3]));
+     vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+     /* Avoid stupid compiler warnings */
+     jnrA = jnrB = 0;
+     j_coord_offsetA = 0;
+     j_coord_offsetB = 0;
+     outeriter        = 0;
+     inneriter        = 0;
+     /* Start outer loop over neighborlists */
+     for(iidx=0; iidx<nri; iidx++)
+     {
+         /* Load shift vector for this list */
+         i_shift_offset   = DIM*shiftidx[iidx];
+         /* Load limits for loop over neighbors */
+         j_index_start    = jindex[iidx];
+         j_index_end      = jindex[iidx+1];
+         /* Get outer coordinate index */
+         inr              = iinr[iidx];
+         i_coord_offset   = DIM*inr;
+         /* Load i particle coords and add shift vector */
+         gmx_fjsp_load_shift_and_4rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                  &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+         fix0             = _fjsp_setzero_v2r8();
+         fiy0             = _fjsp_setzero_v2r8();
+         fiz0             = _fjsp_setzero_v2r8();
+         fix1             = _fjsp_setzero_v2r8();
+         fiy1             = _fjsp_setzero_v2r8();
+         fiz1             = _fjsp_setzero_v2r8();
+         fix2             = _fjsp_setzero_v2r8();
+         fiy2             = _fjsp_setzero_v2r8();
+         fiz2             = _fjsp_setzero_v2r8();
+         fix3             = _fjsp_setzero_v2r8();
+         fiy3             = _fjsp_setzero_v2r8();
+         fiz3             = _fjsp_setzero_v2r8();
+         /* Reset potential sums */
+         velecsum         = _fjsp_setzero_v2r8();
+         vvdwsum          = _fjsp_setzero_v2r8();
+         /* Start inner kernel loop */
+         for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+         {
+             /* Get j neighbor index, and coordinate index */
+             jnrA             = jjnr[jidx];
+             jnrB             = jjnr[jidx+1];
+             j_coord_offsetA  = DIM*jnrA;
+             j_coord_offsetB  = DIM*jnrB;
+             /* load j atom coordinates */
+             gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                               &jx0,&jy0,&jz0);
+             /* Calculate displacement vector */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             dx10             = _fjsp_sub_v2r8(ix1,jx0);
+             dy10             = _fjsp_sub_v2r8(iy1,jy0);
+             dz10             = _fjsp_sub_v2r8(iz1,jz0);
+             dx20             = _fjsp_sub_v2r8(ix2,jx0);
+             dy20             = _fjsp_sub_v2r8(iy2,jy0);
+             dz20             = _fjsp_sub_v2r8(iz2,jz0);
+             dx30             = _fjsp_sub_v2r8(ix3,jx0);
+             dy30             = _fjsp_sub_v2r8(iy3,jy0);
+             dz30             = _fjsp_sub_v2r8(iz3,jz0);
+             /* Calculate squared distance and things based on it */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+             rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+             rsq30            = gmx_fjsp_calc_rsq_v2r8(dx30,dy30,dz30);
+             rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+             rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+             rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+             rinv30           = gmx_fjsp_invsqrt_v2r8(rsq30);
+             /* Load parameters for j particles */
+             jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+             vdwjidx0A        = 2*vdwtype[jnrA+0];
+             vdwjidx0B        = 2*vdwtype[jnrB+0];
+             fjx0             = _fjsp_setzero_v2r8();
+             fjy0             = _fjsp_setzero_v2r8();
+             fjz0             = _fjsp_setzero_v2r8();
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+             /* Compute parameters for interactions between i and j atoms */
+             gmx_fjsp_load_2pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,
+                                          vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
+             /* Calculate table index by multiplying r with table scale and truncate to integer */
+             rt               = _fjsp_mul_v2r8(r00,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 12;
+             vfconv.i[1]     *= 12;
+             /* CUBIC SPLINE TABLE DISPERSION */
+             vfconv.i[0]       += 4;
+             vfconv.i[1]       += 4;
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 2 );
+             H                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 2 );
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+             VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+             vvdw6            = _fjsp_mul_v2r8(c6_00,VV);
+             FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+             fvdw6            = _fjsp_mul_v2r8(c6_00,FF);
+             /* CUBIC SPLINE TABLE REPULSION */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 4 );
+             F                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 4 );
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 6 );
+             H                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 6 );
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+             VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+             vvdw12           = _fjsp_mul_v2r8(c12_00,VV);
+             FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+             fvdw12           = _fjsp_mul_v2r8(c12_00,FF);
+             vvdw             = _fjsp_add_v2r8(vvdw12,vvdw6);
+             fvdw             = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_add_v2r8(fvdw6,fvdw12),_fjsp_mul_v2r8(vftabscale,rinv00)));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+             fscal            = fvdw;
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             
+             fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+             /* Compute parameters for interactions between i and j atoms */
+             qq10             = _fjsp_mul_v2r8(iq1,jq0);
+             /* Calculate table index by multiplying r with table scale and truncate to integer */
+             rt               = _fjsp_mul_v2r8(r10,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 12;
+             vfconv.i[1]     *= 12;
+             /* CUBIC SPLINE TABLE ELECTROSTATICS */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+             H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+             VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+             velec            = _fjsp_mul_v2r8(qq10,VV);
+             FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+             felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,FF),_fjsp_mul_v2r8(vftabscale,rinv10)));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+             
+             fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+             /* Compute parameters for interactions between i and j atoms */
+             qq20             = _fjsp_mul_v2r8(iq2,jq0);
+             /* Calculate table index by multiplying r with table scale and truncate to integer */
+             rt               = _fjsp_mul_v2r8(r20,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 12;
+             vfconv.i[1]     *= 12;
+             /* CUBIC SPLINE TABLE ELECTROSTATICS */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+             H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+             VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+             velec            = _fjsp_mul_v2r8(qq20,VV);
+             FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+             felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,FF),_fjsp_mul_v2r8(vftabscale,rinv20)));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+             
+             fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r30              = _fjsp_mul_v2r8(rsq30,rinv30);
+             /* Compute parameters for interactions between i and j atoms */
+             qq30             = _fjsp_mul_v2r8(iq3,jq0);
+             /* Calculate table index by multiplying r with table scale and truncate to integer */
+             rt               = _fjsp_mul_v2r8(r30,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 12;
+             vfconv.i[1]     *= 12;
+             /* CUBIC SPLINE TABLE ELECTROSTATICS */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+             H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+             VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+             velec            = _fjsp_mul_v2r8(qq30,VV);
+             FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+             felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq30,FF),_fjsp_mul_v2r8(vftabscale,rinv30)));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             /* Update vectorial force */
+             fix3             = _fjsp_madd_v2r8(dx30,fscal,fix3);
+             fiy3             = _fjsp_madd_v2r8(dy30,fscal,fiy3);
+             fiz3             = _fjsp_madd_v2r8(dz30,fscal,fiz3);
+             
+             fjx0             = _fjsp_madd_v2r8(dx30,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy30,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz30,fscal,fjz0);
+             gmx_fjsp_decrement_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0);
+             /* Inner loop uses 200 flops */
+         }
+         if(jidx<j_index_end)
+         {
+             jnrA             = jjnr[jidx];
+             j_coord_offsetA  = DIM*jnrA;
+             /* load j atom coordinates */
+             gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                               &jx0,&jy0,&jz0);
+             /* Calculate displacement vector */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             dx10             = _fjsp_sub_v2r8(ix1,jx0);
+             dy10             = _fjsp_sub_v2r8(iy1,jy0);
+             dz10             = _fjsp_sub_v2r8(iz1,jz0);
+             dx20             = _fjsp_sub_v2r8(ix2,jx0);
+             dy20             = _fjsp_sub_v2r8(iy2,jy0);
+             dz20             = _fjsp_sub_v2r8(iz2,jz0);
+             dx30             = _fjsp_sub_v2r8(ix3,jx0);
+             dy30             = _fjsp_sub_v2r8(iy3,jy0);
+             dz30             = _fjsp_sub_v2r8(iz3,jz0);
+             /* Calculate squared distance and things based on it */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+             rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+             rsq30            = gmx_fjsp_calc_rsq_v2r8(dx30,dy30,dz30);
+             rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+             rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+             rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+             rinv30           = gmx_fjsp_invsqrt_v2r8(rsq30);
+             /* Load parameters for j particles */
+             jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
+             vdwjidx0A        = 2*vdwtype[jnrA+0];
+             fjx0             = _fjsp_setzero_v2r8();
+             fjy0             = _fjsp_setzero_v2r8();
+             fjz0             = _fjsp_setzero_v2r8();
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+             /* Compute parameters for interactions between i and j atoms */
+             gmx_fjsp_load_1pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
+             /* Calculate table index by multiplying r with table scale and truncate to integer */
+             rt               = _fjsp_mul_v2r8(r00,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 12;
+             vfconv.i[1]     *= 12;
+             /* CUBIC SPLINE TABLE DISPERSION */
+             vfconv.i[0]       += 4;
+             vfconv.i[1]       += 4;
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 2 );
+             H                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+             VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+             vvdw6            = _fjsp_mul_v2r8(c6_00,VV);
+             FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+             fvdw6            = _fjsp_mul_v2r8(c6_00,FF);
+             /* CUBIC SPLINE TABLE REPULSION */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 4 );
+             F                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 6 );
+             H                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+             VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+             vvdw12           = _fjsp_mul_v2r8(c12_00,VV);
+             FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+             fvdw12           = _fjsp_mul_v2r8(c12_00,FF);
+             vvdw             = _fjsp_add_v2r8(vvdw12,vvdw6);
+             fvdw             = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_add_v2r8(fvdw6,fvdw12),_fjsp_mul_v2r8(vftabscale,rinv00)));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             vvdw             = _fjsp_unpacklo_v2r8(vvdw,_fjsp_setzero_v2r8());
+             vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+             fscal            = fvdw;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             
+             fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+             /* Compute parameters for interactions between i and j atoms */
+             qq10             = _fjsp_mul_v2r8(iq1,jq0);
+             /* Calculate table index by multiplying r with table scale and truncate to integer */
+             rt               = _fjsp_mul_v2r8(r10,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 12;
+             vfconv.i[1]     *= 12;
+             /* CUBIC SPLINE TABLE ELECTROSTATICS */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+             H                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+             VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+             velec            = _fjsp_mul_v2r8(qq10,VV);
+             FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+             felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,FF),_fjsp_mul_v2r8(vftabscale,rinv10)));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+             
+             fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+             /* Compute parameters for interactions between i and j atoms */
+             qq20             = _fjsp_mul_v2r8(iq2,jq0);
+             /* Calculate table index by multiplying r with table scale and truncate to integer */
+             rt               = _fjsp_mul_v2r8(r20,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 12;
+             vfconv.i[1]     *= 12;
+             /* CUBIC SPLINE TABLE ELECTROSTATICS */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+             H                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+             VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+             velec            = _fjsp_mul_v2r8(qq20,VV);
+             FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+             felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,FF),_fjsp_mul_v2r8(vftabscale,rinv20)));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+             
+             fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r30              = _fjsp_mul_v2r8(rsq30,rinv30);
+             /* Compute parameters for interactions between i and j atoms */
+             qq30             = _fjsp_mul_v2r8(iq3,jq0);
+             /* Calculate table index by multiplying r with table scale and truncate to integer */
+             rt               = _fjsp_mul_v2r8(r30,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 12;
+             vfconv.i[1]     *= 12;
+             /* CUBIC SPLINE TABLE ELECTROSTATICS */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+             H                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+             VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+             velec            = _fjsp_mul_v2r8(qq30,VV);
+             FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+             felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq30,FF),_fjsp_mul_v2r8(vftabscale,rinv30)));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix3             = _fjsp_madd_v2r8(dx30,fscal,fix3);
+             fiy3             = _fjsp_madd_v2r8(dy30,fscal,fiy3);
+             fiz3             = _fjsp_madd_v2r8(dz30,fscal,fiz3);
+             
+             fjx0             = _fjsp_madd_v2r8(dx30,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy30,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz30,fscal,fjz0);
+             gmx_fjsp_decrement_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0);
+             /* Inner loop uses 200 flops */
+         }
+         /* End of innermost loop */
+         gmx_fjsp_update_iforce_4atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                               f+i_coord_offset,fshift+i_shift_offset);
+         ggid                        = gid[iidx];
+         /* Update potential energies */
+         gmx_fjsp_update_1pot_v2r8(velecsum,kernel_data->energygrp_elec+ggid);
+         gmx_fjsp_update_1pot_v2r8(vvdwsum,kernel_data->energygrp_vdw+ggid);
+         /* Increment number of inner iterations */
+         inneriter                  += j_index_end - j_index_start;
+         /* Outer loop uses 26 flops */
+     }
+     /* Increment number of outer iterations */
+     outeriter        += nri;
+     /* Update outer/inner flops */
+     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_VF,outeriter*26 + inneriter*200);
+ }
+ /*
+  * Gromacs nonbonded kernel:   nb_kernel_ElecCSTab_VdwCSTab_GeomW4P1_F_sparc64_hpc_ace_double
+  * Electrostatics interaction: CubicSplineTable
+  * VdW interaction:            CubicSplineTable
+  * Geometry:                   Water4-Particle
+  * Calculate force/pot:        Force
+  */
+ void
+ nb_kernel_ElecCSTab_VdwCSTab_GeomW4P1_F_sparc64_hpc_ace_double
+                     (t_nblist * gmx_restrict                nlist,
+                      rvec * gmx_restrict                    xx,
+                      rvec * gmx_restrict                    ff,
+                      t_forcerec * gmx_restrict              fr,
+                      t_mdatoms * gmx_restrict               mdatoms,
+                      nb_kernel_data_t * gmx_restrict        kernel_data,
+                      t_nrnb * gmx_restrict                  nrnb)
+ {
+     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+      * just 0 for non-waters.
+      * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+      * jnr indices corresponding to data put in the two positions in the SIMD register.
+      */
+     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+     int              jnrA,jnrB;
+     int              j_coord_offsetA,j_coord_offsetB;
+     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+     real             rcutoff_scalar;
+     real             *shiftvec,*fshift,*x,*f;
+     _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+     int              vdwioffset0;
+     _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+     int              vdwioffset1;
+     _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+     int              vdwioffset2;
+     _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+     int              vdwioffset3;
+     _fjsp_v2r8       ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+     int              vdwjidx0A,vdwjidx0B;
+     _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+     _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+     _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+     _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+     _fjsp_v2r8       dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30;
+     _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+     real             *charge;
+     int              nvdwtype;
+     _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+     int              *vdwtype;
+     real             *vdwparam;
+     _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+     _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+     _fjsp_v2r8       rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF,twovfeps;
+     real             *vftab;
+     _fjsp_v2r8       itab_tmp;
+     _fjsp_v2r8       dummy_mask,cutoff_mask;
+     _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+     _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+     union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+     x                = xx[0];
+     f                = ff[0];
+     nri              = nlist->nri;
+     iinr             = nlist->iinr;
+     jindex           = nlist->jindex;
+     jjnr             = nlist->jjnr;
+     shiftidx         = nlist->shift;
+     gid              = nlist->gid;
+     shiftvec         = fr->shift_vec[0];
+     fshift           = fr->fshift[0];
+     facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+     charge           = mdatoms->chargeA;
+     nvdwtype         = fr->ntype;
+     vdwparam         = fr->nbfp;
+     vdwtype          = mdatoms->typeA;
+     vftab            = kernel_data->table_elec_vdw->data;
+     vftabscale       = gmx_fjsp_set1_v2r8(kernel_data->table_elec_vdw->scale);
+     /* Setup water-specific parameters */
+     inr              = nlist->iinr[0];
+     iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+     iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+     iq3              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+3]));
+     vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+     /* Avoid stupid compiler warnings */
+     jnrA = jnrB = 0;
+     j_coord_offsetA = 0;
+     j_coord_offsetB = 0;
+     outeriter        = 0;
+     inneriter        = 0;
+     /* Start outer loop over neighborlists */
+     for(iidx=0; iidx<nri; iidx++)
+     {
+         /* Load shift vector for this list */
+         i_shift_offset   = DIM*shiftidx[iidx];
+         /* Load limits for loop over neighbors */
+         j_index_start    = jindex[iidx];
+         j_index_end      = jindex[iidx+1];
+         /* Get outer coordinate index */
+         inr              = iinr[iidx];
+         i_coord_offset   = DIM*inr;
+         /* Load i particle coords and add shift vector */
+         gmx_fjsp_load_shift_and_4rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                  &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+         fix0             = _fjsp_setzero_v2r8();
+         fiy0             = _fjsp_setzero_v2r8();
+         fiz0             = _fjsp_setzero_v2r8();
+         fix1             = _fjsp_setzero_v2r8();
+         fiy1             = _fjsp_setzero_v2r8();
+         fiz1             = _fjsp_setzero_v2r8();
+         fix2             = _fjsp_setzero_v2r8();
+         fiy2             = _fjsp_setzero_v2r8();
+         fiz2             = _fjsp_setzero_v2r8();
+         fix3             = _fjsp_setzero_v2r8();
+         fiy3             = _fjsp_setzero_v2r8();
+         fiz3             = _fjsp_setzero_v2r8();
+         /* Start inner kernel loop */
+         for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+         {
+             /* Get j neighbor index, and coordinate index */
+             jnrA             = jjnr[jidx];
+             jnrB             = jjnr[jidx+1];
+             j_coord_offsetA  = DIM*jnrA;
+             j_coord_offsetB  = DIM*jnrB;
+             /* load j atom coordinates */
+             gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                               &jx0,&jy0,&jz0);
+             /* Calculate displacement vector */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             dx10             = _fjsp_sub_v2r8(ix1,jx0);
+             dy10             = _fjsp_sub_v2r8(iy1,jy0);
+             dz10             = _fjsp_sub_v2r8(iz1,jz0);
+             dx20             = _fjsp_sub_v2r8(ix2,jx0);
+             dy20             = _fjsp_sub_v2r8(iy2,jy0);
+             dz20             = _fjsp_sub_v2r8(iz2,jz0);
+             dx30             = _fjsp_sub_v2r8(ix3,jx0);
+             dy30             = _fjsp_sub_v2r8(iy3,jy0);
+             dz30             = _fjsp_sub_v2r8(iz3,jz0);
+             /* Calculate squared distance and things based on it */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+             rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+             rsq30            = gmx_fjsp_calc_rsq_v2r8(dx30,dy30,dz30);
+             rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+             rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+             rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+             rinv30           = gmx_fjsp_invsqrt_v2r8(rsq30);
+             /* Load parameters for j particles */
+             jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+             vdwjidx0A        = 2*vdwtype[jnrA+0];
+             vdwjidx0B        = 2*vdwtype[jnrB+0];
+             fjx0             = _fjsp_setzero_v2r8();
+             fjy0             = _fjsp_setzero_v2r8();
+             fjz0             = _fjsp_setzero_v2r8();
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+             /* Compute parameters for interactions between i and j atoms */
+             gmx_fjsp_load_2pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,
+                                          vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
+             /* Calculate table index by multiplying r with table scale and truncate to integer */
+             rt               = _fjsp_mul_v2r8(r00,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 12;
+             vfconv.i[1]     *= 12;
+             /* CUBIC SPLINE TABLE DISPERSION */
+             vfconv.i[0]       += 4;
+             vfconv.i[1]       += 4;
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 2 );
+             H                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 2 );
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+             FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+             fvdw6            = _fjsp_mul_v2r8(c6_00,FF);
+             /* CUBIC SPLINE TABLE REPULSION */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 4 );
+             F                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 4 );
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 6 );
+             H                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 6 );
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+             FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+             fvdw12           = _fjsp_mul_v2r8(c12_00,FF);
+             fvdw             = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_add_v2r8(fvdw6,fvdw12),_fjsp_mul_v2r8(vftabscale,rinv00)));
+             fscal            = fvdw;
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             
+             fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+             /* Compute parameters for interactions between i and j atoms */
+             qq10             = _fjsp_mul_v2r8(iq1,jq0);
+             /* Calculate table index by multiplying r with table scale and truncate to integer */
+             rt               = _fjsp_mul_v2r8(r10,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 12;
+             vfconv.i[1]     *= 12;
+             /* CUBIC SPLINE TABLE ELECTROSTATICS */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+             H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+             FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+             felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,FF),_fjsp_mul_v2r8(vftabscale,rinv10)));
+             fscal            = felec;
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+             
+             fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+             /* Compute parameters for interactions between i and j atoms */
+             qq20             = _fjsp_mul_v2r8(iq2,jq0);
+             /* Calculate table index by multiplying r with table scale and truncate to integer */
+             rt               = _fjsp_mul_v2r8(r20,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 12;
+             vfconv.i[1]     *= 12;
+             /* CUBIC SPLINE TABLE ELECTROSTATICS */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+             H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+             FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+             felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,FF),_fjsp_mul_v2r8(vftabscale,rinv20)));
+             fscal            = felec;
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+             
+             fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r30              = _fjsp_mul_v2r8(rsq30,rinv30);
+             /* Compute parameters for interactions between i and j atoms */
+             qq30             = _fjsp_mul_v2r8(iq3,jq0);
+             /* Calculate table index by multiplying r with table scale and truncate to integer */
+             rt               = _fjsp_mul_v2r8(r30,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 12;
+             vfconv.i[1]     *= 12;
+             /* CUBIC SPLINE TABLE ELECTROSTATICS */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+             H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+             FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+             felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq30,FF),_fjsp_mul_v2r8(vftabscale,rinv30)));
+             fscal            = felec;
+             /* Update vectorial force */
+             fix3             = _fjsp_madd_v2r8(dx30,fscal,fix3);
+             fiy3             = _fjsp_madd_v2r8(dy30,fscal,fiy3);
+             fiz3             = _fjsp_madd_v2r8(dz30,fscal,fiz3);
+             
+             fjx0             = _fjsp_madd_v2r8(dx30,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy30,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz30,fscal,fjz0);
+             gmx_fjsp_decrement_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0);
+             /* Inner loop uses 180 flops */
+         }
+         if(jidx<j_index_end)
+         {
+             jnrA             = jjnr[jidx];
+             j_coord_offsetA  = DIM*jnrA;
+             /* load j atom coordinates */
+             gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                               &jx0,&jy0,&jz0);
+             /* Calculate displacement vector */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             dx10             = _fjsp_sub_v2r8(ix1,jx0);
+             dy10             = _fjsp_sub_v2r8(iy1,jy0);
+             dz10             = _fjsp_sub_v2r8(iz1,jz0);
+             dx20             = _fjsp_sub_v2r8(ix2,jx0);
+             dy20             = _fjsp_sub_v2r8(iy2,jy0);
+             dz20             = _fjsp_sub_v2r8(iz2,jz0);
+             dx30             = _fjsp_sub_v2r8(ix3,jx0);
+             dy30             = _fjsp_sub_v2r8(iy3,jy0);
+             dz30             = _fjsp_sub_v2r8(iz3,jz0);
+             /* Calculate squared distance and things based on it */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+             rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+             rsq30            = gmx_fjsp_calc_rsq_v2r8(dx30,dy30,dz30);
+             rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+             rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+             rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+             rinv30           = gmx_fjsp_invsqrt_v2r8(rsq30);
+             /* Load parameters for j particles */
+             jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
+             vdwjidx0A        = 2*vdwtype[jnrA+0];
+             fjx0             = _fjsp_setzero_v2r8();
+             fjy0             = _fjsp_setzero_v2r8();
+             fjz0             = _fjsp_setzero_v2r8();
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+             /* Compute parameters for interactions between i and j atoms */
+             gmx_fjsp_load_1pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
+             /* Calculate table index by multiplying r with table scale and truncate to integer */
+             rt               = _fjsp_mul_v2r8(r00,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 12;
+             vfconv.i[1]     *= 12;
+             /* CUBIC SPLINE TABLE DISPERSION */
+             vfconv.i[0]       += 4;
+             vfconv.i[1]       += 4;
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 2 );
+             H                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+             FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+             fvdw6            = _fjsp_mul_v2r8(c6_00,FF);
+             /* CUBIC SPLINE TABLE REPULSION */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 4 );
+             F                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 6 );
+             H                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+             FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+             fvdw12           = _fjsp_mul_v2r8(c12_00,FF);
+             fvdw             = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_add_v2r8(fvdw6,fvdw12),_fjsp_mul_v2r8(vftabscale,rinv00)));
+             fscal            = fvdw;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             
+             fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+             /* Compute parameters for interactions between i and j atoms */
+             qq10             = _fjsp_mul_v2r8(iq1,jq0);
+             /* Calculate table index by multiplying r with table scale and truncate to integer */
+             rt               = _fjsp_mul_v2r8(r10,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 12;
+             vfconv.i[1]     *= 12;
+             /* CUBIC SPLINE TABLE ELECTROSTATICS */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+             H                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+             FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+             felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,FF),_fjsp_mul_v2r8(vftabscale,rinv10)));
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+             
+             fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+             /* Compute parameters for interactions between i and j atoms */
+             qq20             = _fjsp_mul_v2r8(iq2,jq0);
+             /* Calculate table index by multiplying r with table scale and truncate to integer */
+             rt               = _fjsp_mul_v2r8(r20,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 12;
+             vfconv.i[1]     *= 12;
+             /* CUBIC SPLINE TABLE ELECTROSTATICS */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+             H                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+             FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+             felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,FF),_fjsp_mul_v2r8(vftabscale,rinv20)));
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+             
+             fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r30              = _fjsp_mul_v2r8(rsq30,rinv30);
+             /* Compute parameters for interactions between i and j atoms */
+             qq30             = _fjsp_mul_v2r8(iq3,jq0);
+             /* Calculate table index by multiplying r with table scale and truncate to integer */
+             rt               = _fjsp_mul_v2r8(r30,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 12;
+             vfconv.i[1]     *= 12;
+             /* CUBIC SPLINE TABLE ELECTROSTATICS */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+             H                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+             FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+             felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq30,FF),_fjsp_mul_v2r8(vftabscale,rinv30)));
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix3             = _fjsp_madd_v2r8(dx30,fscal,fix3);
+             fiy3             = _fjsp_madd_v2r8(dy30,fscal,fiy3);
+             fiz3             = _fjsp_madd_v2r8(dz30,fscal,fiz3);
+             
+             fjx0             = _fjsp_madd_v2r8(dx30,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy30,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz30,fscal,fjz0);
+             gmx_fjsp_decrement_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0);
+             /* Inner loop uses 180 flops */
+         }
+         /* End of innermost loop */
+         gmx_fjsp_update_iforce_4atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                               f+i_coord_offset,fshift+i_shift_offset);
+         /* Increment number of inner iterations */
+         inneriter                  += j_index_end - j_index_start;
+         /* Outer loop uses 24 flops */
+     }
+     /* Increment number of outer iterations */
+     outeriter        += nri;
+     /* Update outer/inner flops */
+     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_F,outeriter*24 + inneriter*180);
+ }
index 0000000000000000000000000000000000000000,7ef236b1880077c1b380eac331974f691374542d..7ef236b1880077c1b380eac331974f691374542d
mode 000000,100644..100644
--- /dev/null
@@@ -1,0 -1,2479 +1,2479 @@@
+ /*
+  * This file is part of the GROMACS molecular simulation package.
+  *
+  * Copyright (c) 2012, by the GROMACS development team, led by
+  * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+  * others, as listed in the AUTHORS file in the top-level source
+  * directory and at http://www.gromacs.org.
+  *
+  * GROMACS is free software; you can redistribute it and/or
+  * modify it under the terms of the GNU Lesser General Public License
+  * as published by the Free Software Foundation; either version 2.1
+  * of the License, or (at your option) any later version.
+  *
+  * GROMACS is distributed in the hope that it will be useful,
+  * but WITHOUT ANY WARRANTY; without even the implied warranty of
+  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  * Lesser General Public License for more details.
+  *
+  * You should have received a copy of the GNU Lesser General Public
+  * License along with GROMACS; if not, see
+  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+  *
+  * If you want to redistribute modifications to GROMACS, please
+  * consider that scientific software is very special. Version
+  * control is crucial - bugs must be traceable. We will be happy to
+  * consider code for inclusion in the official distribution, but
+  * derived work must not be called official GROMACS. Details are found
+  * in the README & COPYING files - if they are missing, get the
+  * official version at http://www.gromacs.org.
+  *
+  * To help us fund GROMACS development, we humbly ask that you cite
+  * the research papers on the package. Check out http://www.gromacs.org.
+  */
+ /*
+  * Note: this file was generated by the GROMACS sparc64_hpc_ace_double kernel generator.
+  */
+ #ifdef HAVE_CONFIG_H
+ #include <config.h>
+ #endif
+ #include <math.h>
+ #include "../nb_kernel.h"
+ #include "types/simple.h"
+ #include "vec.h"
+ #include "nrnb.h"
+ #include "kernelutil_sparc64_hpc_ace_double.h"
+ /*
+  * Gromacs nonbonded kernel:   nb_kernel_ElecCSTab_VdwCSTab_GeomW4W4_VF_sparc64_hpc_ace_double
+  * Electrostatics interaction: CubicSplineTable
+  * VdW interaction:            CubicSplineTable
+  * Geometry:                   Water4-Water4
+  * Calculate force/pot:        PotentialAndForce
+  */
+ void
+ nb_kernel_ElecCSTab_VdwCSTab_GeomW4W4_VF_sparc64_hpc_ace_double
+                     (t_nblist * gmx_restrict                nlist,
+                      rvec * gmx_restrict                    xx,
+                      rvec * gmx_restrict                    ff,
+                      t_forcerec * gmx_restrict              fr,
+                      t_mdatoms * gmx_restrict               mdatoms,
+                      nb_kernel_data_t * gmx_restrict        kernel_data,
+                      t_nrnb * gmx_restrict                  nrnb)
+ {
+     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+      * just 0 for non-waters.
+      * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+      * jnr indices corresponding to data put in the two positions in the SIMD register.
+      */
+     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+     int              jnrA,jnrB;
+     int              j_coord_offsetA,j_coord_offsetB;
+     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+     real             rcutoff_scalar;
+     real             *shiftvec,*fshift,*x,*f;
+     _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+     int              vdwioffset0;
+     _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+     int              vdwioffset1;
+     _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+     int              vdwioffset2;
+     _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+     int              vdwioffset3;
+     _fjsp_v2r8       ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+     int              vdwjidx0A,vdwjidx0B;
+     _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+     int              vdwjidx1A,vdwjidx1B;
+     _fjsp_v2r8       jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+     int              vdwjidx2A,vdwjidx2B;
+     _fjsp_v2r8       jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+     int              vdwjidx3A,vdwjidx3B;
+     _fjsp_v2r8       jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
+     _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+     _fjsp_v2r8       dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+     _fjsp_v2r8       dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+     _fjsp_v2r8       dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
+     _fjsp_v2r8       dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+     _fjsp_v2r8       dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+     _fjsp_v2r8       dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
+     _fjsp_v2r8       dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
+     _fjsp_v2r8       dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
+     _fjsp_v2r8       dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
+     _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+     real             *charge;
+     int              nvdwtype;
+     _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+     int              *vdwtype;
+     real             *vdwparam;
+     _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+     _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+     _fjsp_v2r8       rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF,twovfeps;
+     real             *vftab;
+     _fjsp_v2r8       itab_tmp;
+     _fjsp_v2r8       dummy_mask,cutoff_mask;
+     _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+     _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+     union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+     x                = xx[0];
+     f                = ff[0];
+     nri              = nlist->nri;
+     iinr             = nlist->iinr;
+     jindex           = nlist->jindex;
+     jjnr             = nlist->jjnr;
+     shiftidx         = nlist->shift;
+     gid              = nlist->gid;
+     shiftvec         = fr->shift_vec[0];
+     fshift           = fr->fshift[0];
+     facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+     charge           = mdatoms->chargeA;
+     nvdwtype         = fr->ntype;
+     vdwparam         = fr->nbfp;
+     vdwtype          = mdatoms->typeA;
+     vftab            = kernel_data->table_elec_vdw->data;
+     vftabscale       = gmx_fjsp_set1_v2r8(kernel_data->table_elec_vdw->scale);
+     /* Setup water-specific parameters */
+     inr              = nlist->iinr[0];
+     iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+     iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+     iq3              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+3]));
+     vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+     jq1              = gmx_fjsp_set1_v2r8(charge[inr+1]);
+     jq2              = gmx_fjsp_set1_v2r8(charge[inr+2]);
+     jq3              = gmx_fjsp_set1_v2r8(charge[inr+3]);
+     vdwjidx0A        = 2*vdwtype[inr+0];
+     c6_00            = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset0+vdwjidx0A]);
+     c12_00           = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset0+vdwjidx0A+1]);
+     qq11             = _fjsp_mul_v2r8(iq1,jq1);
+     qq12             = _fjsp_mul_v2r8(iq1,jq2);
+     qq13             = _fjsp_mul_v2r8(iq1,jq3);
+     qq21             = _fjsp_mul_v2r8(iq2,jq1);
+     qq22             = _fjsp_mul_v2r8(iq2,jq2);
+     qq23             = _fjsp_mul_v2r8(iq2,jq3);
+     qq31             = _fjsp_mul_v2r8(iq3,jq1);
+     qq32             = _fjsp_mul_v2r8(iq3,jq2);
+     qq33             = _fjsp_mul_v2r8(iq3,jq3);
+     /* Avoid stupid compiler warnings */
+     jnrA = jnrB = 0;
+     j_coord_offsetA = 0;
+     j_coord_offsetB = 0;
+     outeriter        = 0;
+     inneriter        = 0;
+     /* Start outer loop over neighborlists */
+     for(iidx=0; iidx<nri; iidx++)
+     {
+         /* Load shift vector for this list */
+         i_shift_offset   = DIM*shiftidx[iidx];
+         /* Load limits for loop over neighbors */
+         j_index_start    = jindex[iidx];
+         j_index_end      = jindex[iidx+1];
+         /* Get outer coordinate index */
+         inr              = iinr[iidx];
+         i_coord_offset   = DIM*inr;
+         /* Load i particle coords and add shift vector */
+         gmx_fjsp_load_shift_and_4rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                  &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+         fix0             = _fjsp_setzero_v2r8();
+         fiy0             = _fjsp_setzero_v2r8();
+         fiz0             = _fjsp_setzero_v2r8();
+         fix1             = _fjsp_setzero_v2r8();
+         fiy1             = _fjsp_setzero_v2r8();
+         fiz1             = _fjsp_setzero_v2r8();
+         fix2             = _fjsp_setzero_v2r8();
+         fiy2             = _fjsp_setzero_v2r8();
+         fiz2             = _fjsp_setzero_v2r8();
+         fix3             = _fjsp_setzero_v2r8();
+         fiy3             = _fjsp_setzero_v2r8();
+         fiz3             = _fjsp_setzero_v2r8();
+         /* Reset potential sums */
+         velecsum         = _fjsp_setzero_v2r8();
+         vvdwsum          = _fjsp_setzero_v2r8();
+         /* Start inner kernel loop */
+         for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+         {
+             /* Get j neighbor index, and coordinate index */
+             jnrA             = jjnr[jidx];
+             jnrB             = jjnr[jidx+1];
+             j_coord_offsetA  = DIM*jnrA;
+             j_coord_offsetB  = DIM*jnrB;
+             /* load j atom coordinates */
+             gmx_fjsp_load_4rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
+                                               &jy2,&jz2,&jx3,&jy3,&jz3);
+             /* Calculate displacement vector */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             dx11             = _fjsp_sub_v2r8(ix1,jx1);
+             dy11             = _fjsp_sub_v2r8(iy1,jy1);
+             dz11             = _fjsp_sub_v2r8(iz1,jz1);
+             dx12             = _fjsp_sub_v2r8(ix1,jx2);
+             dy12             = _fjsp_sub_v2r8(iy1,jy2);
+             dz12             = _fjsp_sub_v2r8(iz1,jz2);
+             dx13             = _fjsp_sub_v2r8(ix1,jx3);
+             dy13             = _fjsp_sub_v2r8(iy1,jy3);
+             dz13             = _fjsp_sub_v2r8(iz1,jz3);
+             dx21             = _fjsp_sub_v2r8(ix2,jx1);
+             dy21             = _fjsp_sub_v2r8(iy2,jy1);
+             dz21             = _fjsp_sub_v2r8(iz2,jz1);
+             dx22             = _fjsp_sub_v2r8(ix2,jx2);
+             dy22             = _fjsp_sub_v2r8(iy2,jy2);
+             dz22             = _fjsp_sub_v2r8(iz2,jz2);
+             dx23             = _fjsp_sub_v2r8(ix2,jx3);
+             dy23             = _fjsp_sub_v2r8(iy2,jy3);
+             dz23             = _fjsp_sub_v2r8(iz2,jz3);
+             dx31             = _fjsp_sub_v2r8(ix3,jx1);
+             dy31             = _fjsp_sub_v2r8(iy3,jy1);
+             dz31             = _fjsp_sub_v2r8(iz3,jz1);
+             dx32             = _fjsp_sub_v2r8(ix3,jx2);
+             dy32             = _fjsp_sub_v2r8(iy3,jy2);
+             dz32             = _fjsp_sub_v2r8(iz3,jz2);
+             dx33             = _fjsp_sub_v2r8(ix3,jx3);
+             dy33             = _fjsp_sub_v2r8(iy3,jy3);
+             dz33             = _fjsp_sub_v2r8(iz3,jz3);
+             /* Calculate squared distance and things based on it */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+             rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+             rsq13            = gmx_fjsp_calc_rsq_v2r8(dx13,dy13,dz13);
+             rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+             rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+             rsq23            = gmx_fjsp_calc_rsq_v2r8(dx23,dy23,dz23);
+             rsq31            = gmx_fjsp_calc_rsq_v2r8(dx31,dy31,dz31);
+             rsq32            = gmx_fjsp_calc_rsq_v2r8(dx32,dy32,dz32);
+             rsq33            = gmx_fjsp_calc_rsq_v2r8(dx33,dy33,dz33);
+             rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+             rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+             rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+             rinv13           = gmx_fjsp_invsqrt_v2r8(rsq13);
+             rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+             rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+             rinv23           = gmx_fjsp_invsqrt_v2r8(rsq23);
+             rinv31           = gmx_fjsp_invsqrt_v2r8(rsq31);
+             rinv32           = gmx_fjsp_invsqrt_v2r8(rsq32);
+             rinv33           = gmx_fjsp_invsqrt_v2r8(rsq33);
+             fjx0             = _fjsp_setzero_v2r8();
+             fjy0             = _fjsp_setzero_v2r8();
+             fjz0             = _fjsp_setzero_v2r8();
+             fjx1             = _fjsp_setzero_v2r8();
+             fjy1             = _fjsp_setzero_v2r8();
+             fjz1             = _fjsp_setzero_v2r8();
+             fjx2             = _fjsp_setzero_v2r8();
+             fjy2             = _fjsp_setzero_v2r8();
+             fjz2             = _fjsp_setzero_v2r8();
+             fjx3             = _fjsp_setzero_v2r8();
+             fjy3             = _fjsp_setzero_v2r8();
+             fjz3             = _fjsp_setzero_v2r8();
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+             /* Calculate table index by multiplying r with table scale and truncate to integer */
+             rt               = _fjsp_mul_v2r8(r00,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 12;
+             vfconv.i[1]     *= 12;
+             /* CUBIC SPLINE TABLE DISPERSION */
+             vfconv.i[0]       += 4;
+             vfconv.i[1]       += 4;
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 2 );
+             H                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 2 );
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+             VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+             vvdw6            = _fjsp_mul_v2r8(c6_00,VV);
+             FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+             fvdw6            = _fjsp_mul_v2r8(c6_00,FF);
+             /* CUBIC SPLINE TABLE REPULSION */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 4 );
+             F                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 4 );
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 6 );
+             H                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 6 );
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+             VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+             vvdw12           = _fjsp_mul_v2r8(c12_00,VV);
+             FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+             fvdw12           = _fjsp_mul_v2r8(c12_00,FF);
+             vvdw             = _fjsp_add_v2r8(vvdw12,vvdw6);
+             fvdw             = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_add_v2r8(fvdw6,fvdw12),_fjsp_mul_v2r8(vftabscale,rinv00)));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+             fscal            = fvdw;
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             
+             fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r11              = _fjsp_mul_v2r8(rsq11,rinv11);
+             /* Calculate table index by multiplying r with table scale and truncate to integer */
+             rt               = _fjsp_mul_v2r8(r11,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 12;
+             vfconv.i[1]     *= 12;
+             /* CUBIC SPLINE TABLE ELECTROSTATICS */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+             H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+             VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+             velec            = _fjsp_mul_v2r8(qq11,VV);
+             FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+             felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq11,FF),_fjsp_mul_v2r8(vftabscale,rinv11)));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+             
+             fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r12              = _fjsp_mul_v2r8(rsq12,rinv12);
+             /* Calculate table index by multiplying r with table scale and truncate to integer */
+             rt               = _fjsp_mul_v2r8(r12,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 12;
+             vfconv.i[1]     *= 12;
+             /* CUBIC SPLINE TABLE ELECTROSTATICS */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+             H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+             VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+             velec            = _fjsp_mul_v2r8(qq12,VV);
+             FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+             felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq12,FF),_fjsp_mul_v2r8(vftabscale,rinv12)));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+             
+             fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r13              = _fjsp_mul_v2r8(rsq13,rinv13);
+             /* Calculate table index by multiplying r with table scale and truncate to integer */
+             rt               = _fjsp_mul_v2r8(r13,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 12;
+             vfconv.i[1]     *= 12;
+             /* CUBIC SPLINE TABLE ELECTROSTATICS */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+             H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+             VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+             velec            = _fjsp_mul_v2r8(qq13,VV);
+             FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+             felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq13,FF),_fjsp_mul_v2r8(vftabscale,rinv13)));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx13,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy13,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz13,fscal,fiz1);
+             
+             fjx3             = _fjsp_madd_v2r8(dx13,fscal,fjx3);
+             fjy3             = _fjsp_madd_v2r8(dy13,fscal,fjy3);
+             fjz3             = _fjsp_madd_v2r8(dz13,fscal,fjz3);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r21              = _fjsp_mul_v2r8(rsq21,rinv21);
+             /* Calculate table index by multiplying r with table scale and truncate to integer */
+             rt               = _fjsp_mul_v2r8(r21,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 12;
+             vfconv.i[1]     *= 12;
+             /* CUBIC SPLINE TABLE ELECTROSTATICS */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+             H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+             VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+             velec            = _fjsp_mul_v2r8(qq21,VV);
+             FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+             felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq21,FF),_fjsp_mul_v2r8(vftabscale,rinv21)));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+             
+             fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r22              = _fjsp_mul_v2r8(rsq22,rinv22);
+             /* Calculate table index by multiplying r with table scale and truncate to integer */
+             rt               = _fjsp_mul_v2r8(r22,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 12;
+             vfconv.i[1]     *= 12;
+             /* CUBIC SPLINE TABLE ELECTROSTATICS */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+             H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+             VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+             velec            = _fjsp_mul_v2r8(qq22,VV);
+             FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+             felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq22,FF),_fjsp_mul_v2r8(vftabscale,rinv22)));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+             
+             fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r23              = _fjsp_mul_v2r8(rsq23,rinv23);
+             /* Calculate table index by multiplying r with table scale and truncate to integer */
+             rt               = _fjsp_mul_v2r8(r23,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 12;
+             vfconv.i[1]     *= 12;
+             /* CUBIC SPLINE TABLE ELECTROSTATICS */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+             H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+             VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+             velec            = _fjsp_mul_v2r8(qq23,VV);
+             FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+             felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq23,FF),_fjsp_mul_v2r8(vftabscale,rinv23)));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx23,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy23,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz23,fscal,fiz2);
+             
+             fjx3             = _fjsp_madd_v2r8(dx23,fscal,fjx3);
+             fjy3             = _fjsp_madd_v2r8(dy23,fscal,fjy3);
+             fjz3             = _fjsp_madd_v2r8(dz23,fscal,fjz3);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r31              = _fjsp_mul_v2r8(rsq31,rinv31);
+             /* Calculate table index by multiplying r with table scale and truncate to integer */
+             rt               = _fjsp_mul_v2r8(r31,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 12;
+             vfconv.i[1]     *= 12;
+             /* CUBIC SPLINE TABLE ELECTROSTATICS */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+             H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+             VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+             velec            = _fjsp_mul_v2r8(qq31,VV);
+             FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+             felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq31,FF),_fjsp_mul_v2r8(vftabscale,rinv31)));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             /* Update vectorial force */
+             fix3             = _fjsp_madd_v2r8(dx31,fscal,fix3);
+             fiy3             = _fjsp_madd_v2r8(dy31,fscal,fiy3);
+             fiz3             = _fjsp_madd_v2r8(dz31,fscal,fiz3);
+             
+             fjx1             = _fjsp_madd_v2r8(dx31,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy31,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz31,fscal,fjz1);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r32              = _fjsp_mul_v2r8(rsq32,rinv32);
+             /* Calculate table index by multiplying r with table scale and truncate to integer */
+             rt               = _fjsp_mul_v2r8(r32,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 12;
+             vfconv.i[1]     *= 12;
+             /* CUBIC SPLINE TABLE ELECTROSTATICS */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+             H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+             VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+             velec            = _fjsp_mul_v2r8(qq32,VV);
+             FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+             felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq32,FF),_fjsp_mul_v2r8(vftabscale,rinv32)));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             /* Update vectorial force */
+             fix3             = _fjsp_madd_v2r8(dx32,fscal,fix3);
+             fiy3             = _fjsp_madd_v2r8(dy32,fscal,fiy3);
+             fiz3             = _fjsp_madd_v2r8(dz32,fscal,fiz3);
+             
+             fjx2             = _fjsp_madd_v2r8(dx32,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy32,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz32,fscal,fjz2);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r33              = _fjsp_mul_v2r8(rsq33,rinv33);
+             /* Calculate table index by multiplying r with table scale and truncate to integer */
+             rt               = _fjsp_mul_v2r8(r33,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 12;
+             vfconv.i[1]     *= 12;
+             /* CUBIC SPLINE TABLE ELECTROSTATICS */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+             H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+             VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+             velec            = _fjsp_mul_v2r8(qq33,VV);
+             FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+             felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq33,FF),_fjsp_mul_v2r8(vftabscale,rinv33)));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             /* Update vectorial force */
+             fix3             = _fjsp_madd_v2r8(dx33,fscal,fix3);
+             fiy3             = _fjsp_madd_v2r8(dy33,fscal,fiy3);
+             fiz3             = _fjsp_madd_v2r8(dz33,fscal,fiz3);
+             
+             fjx3             = _fjsp_madd_v2r8(dx33,fscal,fjx3);
+             fjy3             = _fjsp_madd_v2r8(dy33,fscal,fjy3);
+             fjz3             = _fjsp_madd_v2r8(dz33,fscal,fjz3);
+             gmx_fjsp_decrement_4rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+             /* Inner loop uses 476 flops */
+         }
+         if(jidx<j_index_end)
+         {
+             jnrA             = jjnr[jidx];
+             j_coord_offsetA  = DIM*jnrA;
+             /* load j atom coordinates */
+             gmx_fjsp_load_4rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
+                                               &jy2,&jz2,&jx3,&jy3,&jz3);
+             /* Calculate displacement vector */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             dx11             = _fjsp_sub_v2r8(ix1,jx1);
+             dy11             = _fjsp_sub_v2r8(iy1,jy1);
+             dz11             = _fjsp_sub_v2r8(iz1,jz1);
+             dx12             = _fjsp_sub_v2r8(ix1,jx2);
+             dy12             = _fjsp_sub_v2r8(iy1,jy2);
+             dz12             = _fjsp_sub_v2r8(iz1,jz2);
+             dx13             = _fjsp_sub_v2r8(ix1,jx3);
+             dy13             = _fjsp_sub_v2r8(iy1,jy3);
+             dz13             = _fjsp_sub_v2r8(iz1,jz3);
+             dx21             = _fjsp_sub_v2r8(ix2,jx1);
+             dy21             = _fjsp_sub_v2r8(iy2,jy1);
+             dz21             = _fjsp_sub_v2r8(iz2,jz1);
+             dx22             = _fjsp_sub_v2r8(ix2,jx2);
+             dy22             = _fjsp_sub_v2r8(iy2,jy2);
+             dz22             = _fjsp_sub_v2r8(iz2,jz2);
+             dx23             = _fjsp_sub_v2r8(ix2,jx3);
+             dy23             = _fjsp_sub_v2r8(iy2,jy3);
+             dz23             = _fjsp_sub_v2r8(iz2,jz3);
+             dx31             = _fjsp_sub_v2r8(ix3,jx1);
+             dy31             = _fjsp_sub_v2r8(iy3,jy1);
+             dz31             = _fjsp_sub_v2r8(iz3,jz1);
+             dx32             = _fjsp_sub_v2r8(ix3,jx2);
+             dy32             = _fjsp_sub_v2r8(iy3,jy2);
+             dz32             = _fjsp_sub_v2r8(iz3,jz2);
+             dx33             = _fjsp_sub_v2r8(ix3,jx3);
+             dy33             = _fjsp_sub_v2r8(iy3,jy3);
+             dz33             = _fjsp_sub_v2r8(iz3,jz3);
+             /* Calculate squared distance and things based on it */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+             rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+             rsq13            = gmx_fjsp_calc_rsq_v2r8(dx13,dy13,dz13);
+             rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+             rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+             rsq23            = gmx_fjsp_calc_rsq_v2r8(dx23,dy23,dz23);
+             rsq31            = gmx_fjsp_calc_rsq_v2r8(dx31,dy31,dz31);
+             rsq32            = gmx_fjsp_calc_rsq_v2r8(dx32,dy32,dz32);
+             rsq33            = gmx_fjsp_calc_rsq_v2r8(dx33,dy33,dz33);
+             rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+             rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+             rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+             rinv13           = gmx_fjsp_invsqrt_v2r8(rsq13);
+             rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+             rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+             rinv23           = gmx_fjsp_invsqrt_v2r8(rsq23);
+             rinv31           = gmx_fjsp_invsqrt_v2r8(rsq31);
+             rinv32           = gmx_fjsp_invsqrt_v2r8(rsq32);
+             rinv33           = gmx_fjsp_invsqrt_v2r8(rsq33);
+             fjx0             = _fjsp_setzero_v2r8();
+             fjy0             = _fjsp_setzero_v2r8();
+             fjz0             = _fjsp_setzero_v2r8();
+             fjx1             = _fjsp_setzero_v2r8();
+             fjy1             = _fjsp_setzero_v2r8();
+             fjz1             = _fjsp_setzero_v2r8();
+             fjx2             = _fjsp_setzero_v2r8();
+             fjy2             = _fjsp_setzero_v2r8();
+             fjz2             = _fjsp_setzero_v2r8();
+             fjx3             = _fjsp_setzero_v2r8();
+             fjy3             = _fjsp_setzero_v2r8();
+             fjz3             = _fjsp_setzero_v2r8();
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+             /* Calculate table index by multiplying r with table scale and truncate to integer */
+             rt               = _fjsp_mul_v2r8(r00,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 12;
+             vfconv.i[1]     *= 12;
+             /* CUBIC SPLINE TABLE DISPERSION */
+             vfconv.i[0]       += 4;
+             vfconv.i[1]       += 4;
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 2 );
+             H                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+             VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+             vvdw6            = _fjsp_mul_v2r8(c6_00,VV);
+             FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+             fvdw6            = _fjsp_mul_v2r8(c6_00,FF);
+             /* CUBIC SPLINE TABLE REPULSION */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 4 );
+             F                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 6 );
+             H                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+             VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+             vvdw12           = _fjsp_mul_v2r8(c12_00,VV);
+             FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+             fvdw12           = _fjsp_mul_v2r8(c12_00,FF);
+             vvdw             = _fjsp_add_v2r8(vvdw12,vvdw6);
+             fvdw             = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_add_v2r8(fvdw6,fvdw12),_fjsp_mul_v2r8(vftabscale,rinv00)));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             vvdw             = _fjsp_unpacklo_v2r8(vvdw,_fjsp_setzero_v2r8());
+             vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+             fscal            = fvdw;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             
+             fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r11              = _fjsp_mul_v2r8(rsq11,rinv11);
+             /* Calculate table index by multiplying r with table scale and truncate to integer */
+             rt               = _fjsp_mul_v2r8(r11,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 12;
+             vfconv.i[1]     *= 12;
+             /* CUBIC SPLINE TABLE ELECTROSTATICS */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+             H                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+             VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+             velec            = _fjsp_mul_v2r8(qq11,VV);
+             FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+             felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq11,FF),_fjsp_mul_v2r8(vftabscale,rinv11)));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+             
+             fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r12              = _fjsp_mul_v2r8(rsq12,rinv12);
+             /* Calculate table index by multiplying r with table scale and truncate to integer */
+             rt               = _fjsp_mul_v2r8(r12,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 12;
+             vfconv.i[1]     *= 12;
+             /* CUBIC SPLINE TABLE ELECTROSTATICS */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+             H                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+             VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+             velec            = _fjsp_mul_v2r8(qq12,VV);
+             FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+             felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq12,FF),_fjsp_mul_v2r8(vftabscale,rinv12)));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+             
+             fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r13              = _fjsp_mul_v2r8(rsq13,rinv13);
+             /* Calculate table index by multiplying r with table scale and truncate to integer */
+             rt               = _fjsp_mul_v2r8(r13,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 12;
+             vfconv.i[1]     *= 12;
+             /* CUBIC SPLINE TABLE ELECTROSTATICS */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+             H                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+             VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+             velec            = _fjsp_mul_v2r8(qq13,VV);
+             FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+             felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq13,FF),_fjsp_mul_v2r8(vftabscale,rinv13)));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx13,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy13,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz13,fscal,fiz1);
+             
+             fjx3             = _fjsp_madd_v2r8(dx13,fscal,fjx3);
+             fjy3             = _fjsp_madd_v2r8(dy13,fscal,fjy3);
+             fjz3             = _fjsp_madd_v2r8(dz13,fscal,fjz3);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r21              = _fjsp_mul_v2r8(rsq21,rinv21);
+             /* Calculate table index by multiplying r with table scale and truncate to integer */
+             rt               = _fjsp_mul_v2r8(r21,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 12;
+             vfconv.i[1]     *= 12;
+             /* CUBIC SPLINE TABLE ELECTROSTATICS */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+             H                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+             VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+             velec            = _fjsp_mul_v2r8(qq21,VV);
+             FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+             felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq21,FF),_fjsp_mul_v2r8(vftabscale,rinv21)));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+             
+             fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r22              = _fjsp_mul_v2r8(rsq22,rinv22);
+             /* Calculate table index by multiplying r with table scale and truncate to integer */
+             rt               = _fjsp_mul_v2r8(r22,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 12;
+             vfconv.i[1]     *= 12;
+             /* CUBIC SPLINE TABLE ELECTROSTATICS */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+             H                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+             VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+             velec            = _fjsp_mul_v2r8(qq22,VV);
+             FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+             felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq22,FF),_fjsp_mul_v2r8(vftabscale,rinv22)));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+             
+             fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r23              = _fjsp_mul_v2r8(rsq23,rinv23);
+             /* Calculate table index by multiplying r with table scale and truncate to integer */
+             rt               = _fjsp_mul_v2r8(r23,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 12;
+             vfconv.i[1]     *= 12;
+             /* CUBIC SPLINE TABLE ELECTROSTATICS */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+             H                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+             VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+             velec            = _fjsp_mul_v2r8(qq23,VV);
+             FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+             felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq23,FF),_fjsp_mul_v2r8(vftabscale,rinv23)));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx23,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy23,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz23,fscal,fiz2);
+             
+             fjx3             = _fjsp_madd_v2r8(dx23,fscal,fjx3);
+             fjy3             = _fjsp_madd_v2r8(dy23,fscal,fjy3);
+             fjz3             = _fjsp_madd_v2r8(dz23,fscal,fjz3);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r31              = _fjsp_mul_v2r8(rsq31,rinv31);
+             /* Calculate table index by multiplying r with table scale and truncate to integer */
+             rt               = _fjsp_mul_v2r8(r31,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 12;
+             vfconv.i[1]     *= 12;
+             /* CUBIC SPLINE TABLE ELECTROSTATICS */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+             H                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+             VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+             velec            = _fjsp_mul_v2r8(qq31,VV);
+             FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+             felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq31,FF),_fjsp_mul_v2r8(vftabscale,rinv31)));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix3             = _fjsp_madd_v2r8(dx31,fscal,fix3);
+             fiy3             = _fjsp_madd_v2r8(dy31,fscal,fiy3);
+             fiz3             = _fjsp_madd_v2r8(dz31,fscal,fiz3);
+             
+             fjx1             = _fjsp_madd_v2r8(dx31,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy31,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz31,fscal,fjz1);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r32              = _fjsp_mul_v2r8(rsq32,rinv32);
+             /* Calculate table index by multiplying r with table scale and truncate to integer */
+             rt               = _fjsp_mul_v2r8(r32,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 12;
+             vfconv.i[1]     *= 12;
+             /* CUBIC SPLINE TABLE ELECTROSTATICS */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+             H                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+             VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+             velec            = _fjsp_mul_v2r8(qq32,VV);
+             FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+             felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq32,FF),_fjsp_mul_v2r8(vftabscale,rinv32)));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix3             = _fjsp_madd_v2r8(dx32,fscal,fix3);
+             fiy3             = _fjsp_madd_v2r8(dy32,fscal,fiy3);
+             fiz3             = _fjsp_madd_v2r8(dz32,fscal,fiz3);
+             
+             fjx2             = _fjsp_madd_v2r8(dx32,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy32,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz32,fscal,fjz2);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r33              = _fjsp_mul_v2r8(rsq33,rinv33);
+             /* Calculate table index by multiplying r with table scale and truncate to integer */
+             rt               = _fjsp_mul_v2r8(r33,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 12;
+             vfconv.i[1]     *= 12;
+             /* CUBIC SPLINE TABLE ELECTROSTATICS */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+             H                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+             VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+             velec            = _fjsp_mul_v2r8(qq33,VV);
+             FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+             felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq33,FF),_fjsp_mul_v2r8(vftabscale,rinv33)));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix3             = _fjsp_madd_v2r8(dx33,fscal,fix3);
+             fiy3             = _fjsp_madd_v2r8(dy33,fscal,fiy3);
+             fiz3             = _fjsp_madd_v2r8(dz33,fscal,fiz3);
+             
+             fjx3             = _fjsp_madd_v2r8(dx33,fscal,fjx3);
+             fjy3             = _fjsp_madd_v2r8(dy33,fscal,fjy3);
+             fjz3             = _fjsp_madd_v2r8(dz33,fscal,fjz3);
+             gmx_fjsp_decrement_4rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+             /* Inner loop uses 476 flops */
+         }
+         /* End of innermost loop */
+         gmx_fjsp_update_iforce_4atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                               f+i_coord_offset,fshift+i_shift_offset);
+         ggid                        = gid[iidx];
+         /* Update potential energies */
+         gmx_fjsp_update_1pot_v2r8(velecsum,kernel_data->energygrp_elec+ggid);
+         gmx_fjsp_update_1pot_v2r8(vvdwsum,kernel_data->energygrp_vdw+ggid);
+         /* Increment number of inner iterations */
+         inneriter                  += j_index_end - j_index_start;
+         /* Outer loop uses 26 flops */
+     }
+     /* Increment number of outer iterations */
+     outeriter        += nri;
+     /* Update outer/inner flops */
+     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_VF,outeriter*26 + inneriter*476);
+ }
+ /*
+  * Gromacs nonbonded kernel:   nb_kernel_ElecCSTab_VdwCSTab_GeomW4W4_F_sparc64_hpc_ace_double
+  * Electrostatics interaction: CubicSplineTable
+  * VdW interaction:            CubicSplineTable
+  * Geometry:                   Water4-Water4
+  * Calculate force/pot:        Force
+  */
+ void
+ nb_kernel_ElecCSTab_VdwCSTab_GeomW4W4_F_sparc64_hpc_ace_double
+                     (t_nblist * gmx_restrict                nlist,
+                      rvec * gmx_restrict                    xx,
+                      rvec * gmx_restrict                    ff,
+                      t_forcerec * gmx_restrict              fr,
+                      t_mdatoms * gmx_restrict               mdatoms,
+                      nb_kernel_data_t * gmx_restrict        kernel_data,
+                      t_nrnb * gmx_restrict                  nrnb)
+ {
+     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+      * just 0 for non-waters.
+      * Suffixes A,B refer to j loop unrolling done with double precision SIMD, i.e. for the two different
+      * jnr indices corresponding to data put in the two positions in the SIMD register.
+      */
+     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+     int              jnrA,jnrB;
+     int              j_coord_offsetA,j_coord_offsetB;
+     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+     real             rcutoff_scalar;
+     real             *shiftvec,*fshift,*x,*f;
+     _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+     int              vdwioffset0;
+     _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+     int              vdwioffset1;
+     _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+     int              vdwioffset2;
+     _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+     int              vdwioffset3;
+     _fjsp_v2r8       ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+     int              vdwjidx0A,vdwjidx0B;
+     _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+     int              vdwjidx1A,vdwjidx1B;
+     _fjsp_v2r8       jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+     int              vdwjidx2A,vdwjidx2B;
+     _fjsp_v2r8       jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+     int              vdwjidx3A,vdwjidx3B;
+     _fjsp_v2r8       jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
+     _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+     _fjsp_v2r8       dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+     _fjsp_v2r8       dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+     _fjsp_v2r8       dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
+     _fjsp_v2r8       dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+     _fjsp_v2r8       dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+     _fjsp_v2r8       dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
+     _fjsp_v2r8       dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
+     _fjsp_v2r8       dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
+     _fjsp_v2r8       dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
+     _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+     real             *charge;
+     int              nvdwtype;
+     _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+     int              *vdwtype;
+     real             *vdwparam;
+     _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+     _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+     _fjsp_v2r8       rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF,twovfeps;
+     real             *vftab;
+     _fjsp_v2r8       itab_tmp;
+     _fjsp_v2r8       dummy_mask,cutoff_mask;
+     _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+     _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+     union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+     x                = xx[0];
+     f                = ff[0];
+     nri              = nlist->nri;
+     iinr             = nlist->iinr;
+     jindex           = nlist->jindex;
+     jjnr             = nlist->jjnr;
+     shiftidx         = nlist->shift;
+     gid              = nlist->gid;
+     shiftvec         = fr->shift_vec[0];
+     fshift           = fr->fshift[0];
+     facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+     charge           = mdatoms->chargeA;
+     nvdwtype         = fr->ntype;
+     vdwparam         = fr->nbfp;
+     vdwtype          = mdatoms->typeA;
+     vftab            = kernel_data->table_elec_vdw->data;
+     vftabscale       = gmx_fjsp_set1_v2r8(kernel_data->table_elec_vdw->scale);
+     /* Setup water-specific parameters */
+     inr              = nlist->iinr[0];
+     iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+     iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+     iq3              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+3]));
+     vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+     jq1              = gmx_fjsp_set1_v2r8(charge[inr+1]);
+     jq2              = gmx_fjsp_set1_v2r8(charge[inr+2]);
+     jq3              = gmx_fjsp_set1_v2r8(charge[inr+3]);
+     vdwjidx0A        = 2*vdwtype[inr+0];
+     c6_00            = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset0+vdwjidx0A]);
+     c12_00           = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset0+vdwjidx0A+1]);
+     qq11             = _fjsp_mul_v2r8(iq1,jq1);
+     qq12             = _fjsp_mul_v2r8(iq1,jq2);
+     qq13             = _fjsp_mul_v2r8(iq1,jq3);
+     qq21             = _fjsp_mul_v2r8(iq2,jq1);
+     qq22             = _fjsp_mul_v2r8(iq2,jq2);
+     qq23             = _fjsp_mul_v2r8(iq2,jq3);
+     qq31             = _fjsp_mul_v2r8(iq3,jq1);
+     qq32             = _fjsp_mul_v2r8(iq3,jq2);
+     qq33             = _fjsp_mul_v2r8(iq3,jq3);
+     /* Avoid stupid compiler warnings */
+     jnrA = jnrB = 0;
+     j_coord_offsetA = 0;
+     j_coord_offsetB = 0;
+     outeriter        = 0;
+     inneriter        = 0;
+     /* Start outer loop over neighborlists */
+     for(iidx=0; iidx<nri; iidx++)
+     {
+         /* Load shift vector for this list */
+         i_shift_offset   = DIM*shiftidx[iidx];
+         /* Load limits for loop over neighbors */
+         j_index_start    = jindex[iidx];
+         j_index_end      = jindex[iidx+1];
+         /* Get outer coordinate index */
+         inr              = iinr[iidx];
+         i_coord_offset   = DIM*inr;
+         /* Load i particle coords and add shift vector */
+         gmx_fjsp_load_shift_and_4rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                  &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+         fix0             = _fjsp_setzero_v2r8();
+         fiy0             = _fjsp_setzero_v2r8();
+         fiz0             = _fjsp_setzero_v2r8();
+         fix1             = _fjsp_setzero_v2r8();
+         fiy1             = _fjsp_setzero_v2r8();
+         fiz1             = _fjsp_setzero_v2r8();
+         fix2             = _fjsp_setzero_v2r8();
+         fiy2             = _fjsp_setzero_v2r8();
+         fiz2             = _fjsp_setzero_v2r8();
+         fix3             = _fjsp_setzero_v2r8();
+         fiy3             = _fjsp_setzero_v2r8();
+         fiz3             = _fjsp_setzero_v2r8();
+         /* Start inner kernel loop */
+         for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+         {
+             /* Get j neighbor index, and coordinate index */
+             jnrA             = jjnr[jidx];
+             jnrB             = jjnr[jidx+1];
+             j_coord_offsetA  = DIM*jnrA;
+             j_coord_offsetB  = DIM*jnrB;
+             /* load j atom coordinates */
+             gmx_fjsp_load_4rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
+                                               &jy2,&jz2,&jx3,&jy3,&jz3);
+             /* Calculate displacement vector */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             dx11             = _fjsp_sub_v2r8(ix1,jx1);
+             dy11             = _fjsp_sub_v2r8(iy1,jy1);
+             dz11             = _fjsp_sub_v2r8(iz1,jz1);
+             dx12             = _fjsp_sub_v2r8(ix1,jx2);
+             dy12             = _fjsp_sub_v2r8(iy1,jy2);
+             dz12             = _fjsp_sub_v2r8(iz1,jz2);
+             dx13             = _fjsp_sub_v2r8(ix1,jx3);
+             dy13             = _fjsp_sub_v2r8(iy1,jy3);
+             dz13             = _fjsp_sub_v2r8(iz1,jz3);
+             dx21             = _fjsp_sub_v2r8(ix2,jx1);
+             dy21             = _fjsp_sub_v2r8(iy2,jy1);
+             dz21             = _fjsp_sub_v2r8(iz2,jz1);
+             dx22             = _fjsp_sub_v2r8(ix2,jx2);
+             dy22             = _fjsp_sub_v2r8(iy2,jy2);
+             dz22             = _fjsp_sub_v2r8(iz2,jz2);
+             dx23             = _fjsp_sub_v2r8(ix2,jx3);
+             dy23             = _fjsp_sub_v2r8(iy2,jy3);
+             dz23             = _fjsp_sub_v2r8(iz2,jz3);
+             dx31             = _fjsp_sub_v2r8(ix3,jx1);
+             dy31             = _fjsp_sub_v2r8(iy3,jy1);
+             dz31             = _fjsp_sub_v2r8(iz3,jz1);
+             dx32             = _fjsp_sub_v2r8(ix3,jx2);
+             dy32             = _fjsp_sub_v2r8(iy3,jy2);
+             dz32             = _fjsp_sub_v2r8(iz3,jz2);
+             dx33             = _fjsp_sub_v2r8(ix3,jx3);
+             dy33             = _fjsp_sub_v2r8(iy3,jy3);
+             dz33             = _fjsp_sub_v2r8(iz3,jz3);
+             /* Calculate squared distance and things based on it */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+             rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+             rsq13            = gmx_fjsp_calc_rsq_v2r8(dx13,dy13,dz13);
+             rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+             rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+             rsq23            = gmx_fjsp_calc_rsq_v2r8(dx23,dy23,dz23);
+             rsq31            = gmx_fjsp_calc_rsq_v2r8(dx31,dy31,dz31);
+             rsq32            = gmx_fjsp_calc_rsq_v2r8(dx32,dy32,dz32);
+             rsq33            = gmx_fjsp_calc_rsq_v2r8(dx33,dy33,dz33);
+             rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+             rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+             rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+             rinv13           = gmx_fjsp_invsqrt_v2r8(rsq13);
+             rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+             rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+             rinv23           = gmx_fjsp_invsqrt_v2r8(rsq23);
+             rinv31           = gmx_fjsp_invsqrt_v2r8(rsq31);
+             rinv32           = gmx_fjsp_invsqrt_v2r8(rsq32);
+             rinv33           = gmx_fjsp_invsqrt_v2r8(rsq33);
+             fjx0             = _fjsp_setzero_v2r8();
+             fjy0             = _fjsp_setzero_v2r8();
+             fjz0             = _fjsp_setzero_v2r8();
+             fjx1             = _fjsp_setzero_v2r8();
+             fjy1             = _fjsp_setzero_v2r8();
+             fjz1             = _fjsp_setzero_v2r8();
+             fjx2             = _fjsp_setzero_v2r8();
+             fjy2             = _fjsp_setzero_v2r8();
+             fjz2             = _fjsp_setzero_v2r8();
+             fjx3             = _fjsp_setzero_v2r8();
+             fjy3             = _fjsp_setzero_v2r8();
+             fjz3             = _fjsp_setzero_v2r8();
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+             /* Calculate table index by multiplying r with table scale and truncate to integer */
+             rt               = _fjsp_mul_v2r8(r00,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 12;
+             vfconv.i[1]     *= 12;
+             /* CUBIC SPLINE TABLE DISPERSION */
+             vfconv.i[0]       += 4;
+             vfconv.i[1]       += 4;
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 2 );
+             H                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 2 );
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+             FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+             fvdw6            = _fjsp_mul_v2r8(c6_00,FF);
+             /* CUBIC SPLINE TABLE REPULSION */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 4 );
+             F                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 4 );
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 6 );
+             H                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 6 );
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+             FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+             fvdw12           = _fjsp_mul_v2r8(c12_00,FF);
+             fvdw             = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_add_v2r8(fvdw6,fvdw12),_fjsp_mul_v2r8(vftabscale,rinv00)));
+             fscal            = fvdw;
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             
+             fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r11              = _fjsp_mul_v2r8(rsq11,rinv11);
+             /* Calculate table index by multiplying r with table scale and truncate to integer */
+             rt               = _fjsp_mul_v2r8(r11,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 12;
+             vfconv.i[1]     *= 12;
+             /* CUBIC SPLINE TABLE ELECTROSTATICS */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+             H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+             FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+             felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq11,FF),_fjsp_mul_v2r8(vftabscale,rinv11)));
+             fscal            = felec;
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+             
+             fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r12              = _fjsp_mul_v2r8(rsq12,rinv12);
+             /* Calculate table index by multiplying r with table scale and truncate to integer */
+             rt               = _fjsp_mul_v2r8(r12,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 12;
+             vfconv.i[1]     *= 12;
+             /* CUBIC SPLINE TABLE ELECTROSTATICS */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+             H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+             FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+             felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq12,FF),_fjsp_mul_v2r8(vftabscale,rinv12)));
+             fscal            = felec;
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+             
+             fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r13              = _fjsp_mul_v2r8(rsq13,rinv13);
+             /* Calculate table index by multiplying r with table scale and truncate to integer */
+             rt               = _fjsp_mul_v2r8(r13,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 12;
+             vfconv.i[1]     *= 12;
+             /* CUBIC SPLINE TABLE ELECTROSTATICS */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+             H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+             FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+             felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq13,FF),_fjsp_mul_v2r8(vftabscale,rinv13)));
+             fscal            = felec;
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx13,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy13,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz13,fscal,fiz1);
+             
+             fjx3             = _fjsp_madd_v2r8(dx13,fscal,fjx3);
+             fjy3             = _fjsp_madd_v2r8(dy13,fscal,fjy3);
+             fjz3             = _fjsp_madd_v2r8(dz13,fscal,fjz3);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r21              = _fjsp_mul_v2r8(rsq21,rinv21);
+             /* Calculate table index by multiplying r with table scale and truncate to integer */
+             rt               = _fjsp_mul_v2r8(r21,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 12;
+             vfconv.i[1]     *= 12;
+             /* CUBIC SPLINE TABLE ELECTROSTATICS */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+             H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+             FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+             felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq21,FF),_fjsp_mul_v2r8(vftabscale,rinv21)));
+             fscal            = felec;
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+             
+             fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r22              = _fjsp_mul_v2r8(rsq22,rinv22);
+             /* Calculate table index by multiplying r with table scale and truncate to integer */
+             rt               = _fjsp_mul_v2r8(r22,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 12;
+             vfconv.i[1]     *= 12;
+             /* CUBIC SPLINE TABLE ELECTROSTATICS */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+             H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+             FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+             felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq22,FF),_fjsp_mul_v2r8(vftabscale,rinv22)));
+             fscal            = felec;
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+             
+             fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r23              = _fjsp_mul_v2r8(rsq23,rinv23);
+             /* Calculate table index by multiplying r with table scale and truncate to integer */
+             rt               = _fjsp_mul_v2r8(r23,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 12;
+             vfconv.i[1]     *= 12;
+             /* CUBIC SPLINE TABLE ELECTROSTATICS */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+             H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+             FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+             felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq23,FF),_fjsp_mul_v2r8(vftabscale,rinv23)));
+             fscal            = felec;
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx23,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy23,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz23,fscal,fiz2);
+             
+             fjx3             = _fjsp_madd_v2r8(dx23,fscal,fjx3);
+             fjy3             = _fjsp_madd_v2r8(dy23,fscal,fjy3);
+             fjz3             = _fjsp_madd_v2r8(dz23,fscal,fjz3);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r31              = _fjsp_mul_v2r8(rsq31,rinv31);
+             /* Calculate table index by multiplying r with table scale and truncate to integer */
+             rt               = _fjsp_mul_v2r8(r31,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 12;
+             vfconv.i[1]     *= 12;
+             /* CUBIC SPLINE TABLE ELECTROSTATICS */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+             H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+             FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+             felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq31,FF),_fjsp_mul_v2r8(vftabscale,rinv31)));
+             fscal            = felec;
+             /* Update vectorial force */
+             fix3             = _fjsp_madd_v2r8(dx31,fscal,fix3);
+             fiy3             = _fjsp_madd_v2r8(dy31,fscal,fiy3);
+             fiz3             = _fjsp_madd_v2r8(dz31,fscal,fiz3);
+             
+             fjx1             = _fjsp_madd_v2r8(dx31,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy31,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz31,fscal,fjz1);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r32              = _fjsp_mul_v2r8(rsq32,rinv32);
+             /* Calculate table index by multiplying r with table scale and truncate to integer */
+             rt               = _fjsp_mul_v2r8(r32,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 12;
+             vfconv.i[1]     *= 12;
+             /* CUBIC SPLINE TABLE ELECTROSTATICS */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+             H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+             FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+             felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq32,FF),_fjsp_mul_v2r8(vftabscale,rinv32)));
+             fscal            = felec;
+             /* Update vectorial force */
+             fix3             = _fjsp_madd_v2r8(dx32,fscal,fix3);
+             fiy3             = _fjsp_madd_v2r8(dy32,fscal,fiy3);
+             fiz3             = _fjsp_madd_v2r8(dz32,fscal,fiz3);
+             
+             fjx2             = _fjsp_madd_v2r8(dx32,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy32,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz32,fscal,fjz2);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r33              = _fjsp_mul_v2r8(rsq33,rinv33);
+             /* Calculate table index by multiplying r with table scale and truncate to integer */
+             rt               = _fjsp_mul_v2r8(r33,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 12;
+             vfconv.i[1]     *= 12;
+             /* CUBIC SPLINE TABLE ELECTROSTATICS */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+             H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+             FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+             felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq33,FF),_fjsp_mul_v2r8(vftabscale,rinv33)));
+             fscal            = felec;
+             /* Update vectorial force */
+             fix3             = _fjsp_madd_v2r8(dx33,fscal,fix3);
+             fiy3             = _fjsp_madd_v2r8(dy33,fscal,fiy3);
+             fiz3             = _fjsp_madd_v2r8(dz33,fscal,fiz3);
+             
+             fjx3             = _fjsp_madd_v2r8(dx33,fscal,fjx3);
+             fjy3             = _fjsp_madd_v2r8(dy33,fscal,fjy3);
+             fjz3             = _fjsp_madd_v2r8(dz33,fscal,fjz3);
+             gmx_fjsp_decrement_4rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+             /* Inner loop uses 432 flops */
+         }
+         if(jidx<j_index_end)
+         {
+             jnrA             = jjnr[jidx];
+             j_coord_offsetA  = DIM*jnrA;
+             /* load j atom coordinates */
+             gmx_fjsp_load_4rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
+                                               &jy2,&jz2,&jx3,&jy3,&jz3);
+             /* Calculate displacement vector */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             dx11             = _fjsp_sub_v2r8(ix1,jx1);
+             dy11             = _fjsp_sub_v2r8(iy1,jy1);
+             dz11             = _fjsp_sub_v2r8(iz1,jz1);
+             dx12             = _fjsp_sub_v2r8(ix1,jx2);
+             dy12             = _fjsp_sub_v2r8(iy1,jy2);
+             dz12             = _fjsp_sub_v2r8(iz1,jz2);
+             dx13             = _fjsp_sub_v2r8(ix1,jx3);
+             dy13             = _fjsp_sub_v2r8(iy1,jy3);
+             dz13             = _fjsp_sub_v2r8(iz1,jz3);
+             dx21             = _fjsp_sub_v2r8(ix2,jx1);
+             dy21             = _fjsp_sub_v2r8(iy2,jy1);
+             dz21             = _fjsp_sub_v2r8(iz2,jz1);
+             dx22             = _fjsp_sub_v2r8(ix2,jx2);
+             dy22             = _fjsp_sub_v2r8(iy2,jy2);
+             dz22             = _fjsp_sub_v2r8(iz2,jz2);
+             dx23             = _fjsp_sub_v2r8(ix2,jx3);
+             dy23             = _fjsp_sub_v2r8(iy2,jy3);
+             dz23             = _fjsp_sub_v2r8(iz2,jz3);
+             dx31             = _fjsp_sub_v2r8(ix3,jx1);
+             dy31             = _fjsp_sub_v2r8(iy3,jy1);
+             dz31             = _fjsp_sub_v2r8(iz3,jz1);
+             dx32             = _fjsp_sub_v2r8(ix3,jx2);
+             dy32             = _fjsp_sub_v2r8(iy3,jy2);
+             dz32             = _fjsp_sub_v2r8(iz3,jz2);
+             dx33             = _fjsp_sub_v2r8(ix3,jx3);
+             dy33             = _fjsp_sub_v2r8(iy3,jy3);
+             dz33             = _fjsp_sub_v2r8(iz3,jz3);
+             /* Calculate squared distance and things based on it */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+             rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+             rsq13            = gmx_fjsp_calc_rsq_v2r8(dx13,dy13,dz13);
+             rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+             rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+             rsq23            = gmx_fjsp_calc_rsq_v2r8(dx23,dy23,dz23);
+             rsq31            = gmx_fjsp_calc_rsq_v2r8(dx31,dy31,dz31);
+             rsq32            = gmx_fjsp_calc_rsq_v2r8(dx32,dy32,dz32);
+             rsq33            = gmx_fjsp_calc_rsq_v2r8(dx33,dy33,dz33);
+             rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+             rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+             rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+             rinv13           = gmx_fjsp_invsqrt_v2r8(rsq13);
+             rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+             rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+             rinv23           = gmx_fjsp_invsqrt_v2r8(rsq23);
+             rinv31           = gmx_fjsp_invsqrt_v2r8(rsq31);
+             rinv32           = gmx_fjsp_invsqrt_v2r8(rsq32);
+             rinv33           = gmx_fjsp_invsqrt_v2r8(rsq33);
+             fjx0             = _fjsp_setzero_v2r8();
+             fjy0             = _fjsp_setzero_v2r8();
+             fjz0             = _fjsp_setzero_v2r8();
+             fjx1             = _fjsp_setzero_v2r8();
+             fjy1             = _fjsp_setzero_v2r8();
+             fjz1             = _fjsp_setzero_v2r8();
+             fjx2             = _fjsp_setzero_v2r8();
+             fjy2             = _fjsp_setzero_v2r8();
+             fjz2             = _fjsp_setzero_v2r8();
+             fjx3             = _fjsp_setzero_v2r8();
+             fjy3             = _fjsp_setzero_v2r8();
+             fjz3             = _fjsp_setzero_v2r8();
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+             /* Calculate table index by multiplying r with table scale and truncate to integer */
+             rt               = _fjsp_mul_v2r8(r00,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 12;
+             vfconv.i[1]     *= 12;
+             /* CUBIC SPLINE TABLE DISPERSION */
+             vfconv.i[0]       += 4;
+             vfconv.i[1]       += 4;
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 2 );
+             H                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+             FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+             fvdw6            = _fjsp_mul_v2r8(c6_00,FF);
+             /* CUBIC SPLINE TABLE REPULSION */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 4 );
+             F                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 6 );
+             H                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+             FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+             fvdw12           = _fjsp_mul_v2r8(c12_00,FF);
+             fvdw             = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_add_v2r8(fvdw6,fvdw12),_fjsp_mul_v2r8(vftabscale,rinv00)));
+             fscal            = fvdw;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             
+             fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r11              = _fjsp_mul_v2r8(rsq11,rinv11);
+             /* Calculate table index by multiplying r with table scale and truncate to integer */
+             rt               = _fjsp_mul_v2r8(r11,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 12;
+             vfconv.i[1]     *= 12;
+             /* CUBIC SPLINE TABLE ELECTROSTATICS */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+             H                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+             FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+             felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq11,FF),_fjsp_mul_v2r8(vftabscale,rinv11)));
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+             
+             fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r12              = _fjsp_mul_v2r8(rsq12,rinv12);
+             /* Calculate table index by multiplying r with table scale and truncate to integer */
+             rt               = _fjsp_mul_v2r8(r12,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 12;
+             vfconv.i[1]     *= 12;
+             /* CUBIC SPLINE TABLE ELECTROSTATICS */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+             H                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+             FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+             felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq12,FF),_fjsp_mul_v2r8(vftabscale,rinv12)));
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+             
+             fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r13              = _fjsp_mul_v2r8(rsq13,rinv13);
+             /* Calculate table index by multiplying r with table scale and truncate to integer */
+             rt               = _fjsp_mul_v2r8(r13,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 12;
+             vfconv.i[1]     *= 12;
+             /* CUBIC SPLINE TABLE ELECTROSTATICS */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+             H                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+             FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+             felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq13,FF),_fjsp_mul_v2r8(vftabscale,rinv13)));
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx13,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy13,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz13,fscal,fiz1);
+             
+             fjx3             = _fjsp_madd_v2r8(dx13,fscal,fjx3);
+             fjy3             = _fjsp_madd_v2r8(dy13,fscal,fjy3);
+             fjz3             = _fjsp_madd_v2r8(dz13,fscal,fjz3);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r21              = _fjsp_mul_v2r8(rsq21,rinv21);
+             /* Calculate table index by multiplying r with table scale and truncate to integer */
+             rt               = _fjsp_mul_v2r8(r21,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 12;
+             vfconv.i[1]     *= 12;
+             /* CUBIC SPLINE TABLE ELECTROSTATICS */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+             H                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+             FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+             felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq21,FF),_fjsp_mul_v2r8(vftabscale,rinv21)));
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+             
+             fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r22              = _fjsp_mul_v2r8(rsq22,rinv22);
+             /* Calculate table index by multiplying r with table scale and truncate to integer */
+             rt               = _fjsp_mul_v2r8(r22,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 12;
+             vfconv.i[1]     *= 12;
+             /* CUBIC SPLINE TABLE ELECTROSTATICS */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+             H                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+             FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+             felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq22,FF),_fjsp_mul_v2r8(vftabscale,rinv22)));
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+             
+             fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r23              = _fjsp_mul_v2r8(rsq23,rinv23);
+             /* Calculate table index by multiplying r with table scale and truncate to integer */
+             rt               = _fjsp_mul_v2r8(r23,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 12;
+             vfconv.i[1]     *= 12;
+             /* CUBIC SPLINE TABLE ELECTROSTATICS */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+             H                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+             FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+             felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq23,FF),_fjsp_mul_v2r8(vftabscale,rinv23)));
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx23,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy23,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz23,fscal,fiz2);
+             
+             fjx3             = _fjsp_madd_v2r8(dx23,fscal,fjx3);
+             fjy3             = _fjsp_madd_v2r8(dy23,fscal,fjy3);
+             fjz3             = _fjsp_madd_v2r8(dz23,fscal,fjz3);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r31              = _fjsp_mul_v2r8(rsq31,rinv31);
+             /* Calculate table index by multiplying r with table scale and truncate to integer */
+             rt               = _fjsp_mul_v2r8(r31,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 12;
+             vfconv.i[1]     *= 12;
+             /* CUBIC SPLINE TABLE ELECTROSTATICS */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+             H                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+             FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+             felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq31,FF),_fjsp_mul_v2r8(vftabscale,rinv31)));
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix3             = _fjsp_madd_v2r8(dx31,fscal,fix3);
+             fiy3             = _fjsp_madd_v2r8(dy31,fscal,fiy3);
+             fiz3             = _fjsp_madd_v2r8(dz31,fscal,fiz3);
+             
+             fjx1             = _fjsp_madd_v2r8(dx31,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy31,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz31,fscal,fjz1);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r32              = _fjsp_mul_v2r8(rsq32,rinv32);
+             /* Calculate table index by multiplying r with table scale and truncate to integer */
+             rt               = _fjsp_mul_v2r8(r32,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 12;
+             vfconv.i[1]     *= 12;
+             /* CUBIC SPLINE TABLE ELECTROSTATICS */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+             H                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+             FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+             felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq32,FF),_fjsp_mul_v2r8(vftabscale,rinv32)));
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix3             = _fjsp_madd_v2r8(dx32,fscal,fix3);
+             fiy3             = _fjsp_madd_v2r8(dy32,fscal,fiy3);
+             fiz3             = _fjsp_madd_v2r8(dz32,fscal,fiz3);
+             
+             fjx2             = _fjsp_madd_v2r8(dx32,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy32,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz32,fscal,fjz2);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r33              = _fjsp_mul_v2r8(rsq33,rinv33);
+             /* Calculate table index by multiplying r with table scale and truncate to integer */
+             rt               = _fjsp_mul_v2r8(r33,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 12;
+             vfconv.i[1]     *= 12;
+             /* CUBIC SPLINE TABLE ELECTROSTATICS */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+             H                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+             FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+             felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq33,FF),_fjsp_mul_v2r8(vftabscale,rinv33)));
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix3             = _fjsp_madd_v2r8(dx33,fscal,fix3);
+             fiy3             = _fjsp_madd_v2r8(dy33,fscal,fiy3);
+             fiz3             = _fjsp_madd_v2r8(dz33,fscal,fiz3);
+             
+             fjx3             = _fjsp_madd_v2r8(dx33,fscal,fjx3);
+             fjy3             = _fjsp_madd_v2r8(dy33,fscal,fjy3);
+             fjz3             = _fjsp_madd_v2r8(dz33,fscal,fjz3);
+             gmx_fjsp_decrement_4rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+             /* Inner loop uses 432 flops */
+         }
+         /* End of innermost loop */
+         gmx_fjsp_update_iforce_4atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                               f+i_coord_offset,fshift+i_shift_offset);
+         /* Increment number of inner iterations */
+         inneriter                  += j_index_end - j_index_start;
+         /* Outer loop uses 24 flops */
+     }
+     /* Increment number of outer iterations */
+     outeriter        += nri;
+     /* Update outer/inner flops */
+     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_F,outeriter*24 + inneriter*432);
+ }
index 0000000000000000000000000000000000000000,c78c355be4d58ff5b8349e2ece71cf108ded16ea..c78c355be4d58ff5b8349e2ece71cf108ded16ea
mode 000000,100644..100644
--- /dev/null
@@@ -1,0 -1,635 +1,635 @@@
+ /*
+  * This file is part of the GROMACS molecular simulation package.
+  *
+  * Copyright (c) 2012, by the GROMACS development team, led by
+  * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+  * others, as listed in the AUTHORS file in the top-level source
+  * directory and at http://www.gromacs.org.
+  *
+  * GROMACS is free software; you can redistribute it and/or
+  * modify it under the terms of the GNU Lesser General Public License
+  * as published by the Free Software Foundation; either version 2.1
+  * of the License, or (at your option) any later version.
+  *
+  * GROMACS is distributed in the hope that it will be useful,
+  * but WITHOUT ANY WARRANTY; without even the implied warranty of
+  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  * Lesser General Public License for more details.
+  *
+  * You should have received a copy of the GNU Lesser General Public
+  * License along with GROMACS; if not, see
+  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+  *
+  * If you want to redistribute modifications to GROMACS, please
+  * consider that scientific software is very special. Version
+  * control is crucial - bugs must be traceable. We will be happy to
+  * consider code for inclusion in the official distribution, but
+  * derived work must not be called official GROMACS. Details are found
+  * in the README & COPYING files - if they are missing, get the
+  * official version at http://www.gromacs.org.
+  *
+  * To help us fund GROMACS development, we humbly ask that you cite
+  * the research papers on the package. Check out http://www.gromacs.org.
+  */
+ /*
+  * Note: this file was generated by the GROMACS sparc64_hpc_ace_double kernel generator.
+  */
+ #ifdef HAVE_CONFIG_H
+ #include <config.h>
+ #endif
+ #include <math.h>
+ #include "../nb_kernel.h"
+ #include "types/simple.h"
+ #include "vec.h"
+ #include "nrnb.h"
+ #include "kernelutil_sparc64_hpc_ace_double.h"
+ /*
+  * Gromacs nonbonded kernel:   nb_kernel_ElecCSTab_VdwLJ_GeomP1P1_VF_sparc64_hpc_ace_double
+  * Electrostatics interaction: CubicSplineTable
+  * VdW interaction:            LennardJones
+  * Geometry:                   Particle-Particle
+  * Calculate force/pot:        PotentialAndForce
+  */
+ void
+ nb_kernel_ElecCSTab_VdwLJ_GeomP1P1_VF_sparc64_hpc_ace_double
+                     (t_nblist * gmx_restrict                nlist,
+                      rvec * gmx_restrict                    xx,
+                      rvec * gmx_restrict                    ff,
+                      t_forcerec * gmx_restrict              fr,
+                      t_mdatoms * gmx_restrict               mdatoms,
+                      nb_kernel_data_t * gmx_restrict        kernel_data,
+                      t_nrnb * gmx_restrict                  nrnb)
+ {
+     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+      * just 0 for non-waters.
+      * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+      * jnr indices corresponding to data put in the two positions in the SIMD register.
+      */
+     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+     int              jnrA,jnrB;
+     int              j_coord_offsetA,j_coord_offsetB;
+     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+     real             rcutoff_scalar;
+     real             *shiftvec,*fshift,*x,*f;
+     _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+     int              vdwioffset0;
+     _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+     int              vdwjidx0A,vdwjidx0B;
+     _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+     _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+     _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+     real             *charge;
+     int              nvdwtype;
+     _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+     int              *vdwtype;
+     real             *vdwparam;
+     _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+     _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+     _fjsp_v2r8       rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF,twovfeps;
+     real             *vftab;
+     _fjsp_v2r8       itab_tmp;
+     _fjsp_v2r8       dummy_mask,cutoff_mask;
+     _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+     _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+     union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+     x                = xx[0];
+     f                = ff[0];
+     nri              = nlist->nri;
+     iinr             = nlist->iinr;
+     jindex           = nlist->jindex;
+     jjnr             = nlist->jjnr;
+     shiftidx         = nlist->shift;
+     gid              = nlist->gid;
+     shiftvec         = fr->shift_vec[0];
+     fshift           = fr->fshift[0];
+     facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+     charge           = mdatoms->chargeA;
+     nvdwtype         = fr->ntype;
+     vdwparam         = fr->nbfp;
+     vdwtype          = mdatoms->typeA;
+     vftab            = kernel_data->table_elec->data;
+     vftabscale       = gmx_fjsp_set1_v2r8(kernel_data->table_elec->scale);
+     /* Avoid stupid compiler warnings */
+     jnrA = jnrB = 0;
+     j_coord_offsetA = 0;
+     j_coord_offsetB = 0;
+     outeriter        = 0;
+     inneriter        = 0;
+     /* Start outer loop over neighborlists */
+     for(iidx=0; iidx<nri; iidx++)
+     {
+         /* Load shift vector for this list */
+         i_shift_offset   = DIM*shiftidx[iidx];
+         /* Load limits for loop over neighbors */
+         j_index_start    = jindex[iidx];
+         j_index_end      = jindex[iidx+1];
+         /* Get outer coordinate index */
+         inr              = iinr[iidx];
+         i_coord_offset   = DIM*inr;
+         /* Load i particle coords and add shift vector */
+         gmx_fjsp_load_shift_and_1rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
+         fix0             = _fjsp_setzero_v2r8();
+         fiy0             = _fjsp_setzero_v2r8();
+         fiz0             = _fjsp_setzero_v2r8();
+         /* Load parameters for i particles */
+         iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_load1_v2r8(charge+inr+0));
+         vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+         /* Reset potential sums */
+         velecsum         = _fjsp_setzero_v2r8();
+         vvdwsum          = _fjsp_setzero_v2r8();
+         /* Start inner kernel loop */
+         for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+         {
+             /* Get j neighbor index, and coordinate index */
+             jnrA             = jjnr[jidx];
+             jnrB             = jjnr[jidx+1];
+             j_coord_offsetA  = DIM*jnrA;
+             j_coord_offsetB  = DIM*jnrB;
+             /* load j atom coordinates */
+             gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                               &jx0,&jy0,&jz0);
+             /* Calculate displacement vector */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             /* Calculate squared distance and things based on it */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+             rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+             /* Load parameters for j particles */
+             jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+             vdwjidx0A        = 2*vdwtype[jnrA+0];
+             vdwjidx0B        = 2*vdwtype[jnrB+0];
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+             /* Compute parameters for interactions between i and j atoms */
+             qq00             = _fjsp_mul_v2r8(iq0,jq0);
+             gmx_fjsp_load_2pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,
+                                          vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
+             /* Calculate table index by multiplying r with table scale and truncate to integer */
+             rt               = _fjsp_mul_v2r8(r00,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 4;
+             vfconv.i[1]     *= 4;
+             /* CUBIC SPLINE TABLE ELECTROSTATICS */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+             H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+             VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+             velec            = _fjsp_mul_v2r8(qq00,VV);
+             FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+             felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,FF),_fjsp_mul_v2r8(vftabscale,rinv00)));
+             /* LENNARD-JONES DISPERSION/REPULSION */
+             rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+             vvdw6            = _fjsp_mul_v2r8(c6_00,rinvsix);
+             vvdw12           = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
+             vvdw             = _fjsp_msub_v2r8( vvdw12,one_twelfth, _fjsp_mul_v2r8(vvdw6,one_sixth) );
+             fvdw             = _fjsp_mul_v2r8(_fjsp_sub_v2r8(vvdw12,vvdw6),rinvsq00);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+             fscal            = _fjsp_add_v2r8(felec,fvdw);
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             
+             gmx_fjsp_decrement_fma_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fscal,dx00,dy00,dz00);
+             /* Inner loop uses 59 flops */
+         }
+         if(jidx<j_index_end)
+         {
+             jnrA             = jjnr[jidx];
+             j_coord_offsetA  = DIM*jnrA;
+             /* load j atom coordinates */
+             gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                               &jx0,&jy0,&jz0);
+             /* Calculate displacement vector */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             /* Calculate squared distance and things based on it */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+             rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+             /* Load parameters for j particles */
+             jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
+             vdwjidx0A        = 2*vdwtype[jnrA+0];
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+             /* Compute parameters for interactions between i and j atoms */
+             qq00             = _fjsp_mul_v2r8(iq0,jq0);
+             gmx_fjsp_load_1pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
+             /* Calculate table index by multiplying r with table scale and truncate to integer */
+             rt               = _fjsp_mul_v2r8(r00,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 4;
+             vfconv.i[1]     *= 4;
+             /* CUBIC SPLINE TABLE ELECTROSTATICS */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+             H                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+             VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+             velec            = _fjsp_mul_v2r8(qq00,VV);
+             FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+             felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,FF),_fjsp_mul_v2r8(vftabscale,rinv00)));
+             /* LENNARD-JONES DISPERSION/REPULSION */
+             rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+             vvdw6            = _fjsp_mul_v2r8(c6_00,rinvsix);
+             vvdw12           = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
+             vvdw             = _fjsp_msub_v2r8( vvdw12,one_twelfth, _fjsp_mul_v2r8(vvdw6,one_sixth) );
+             fvdw             = _fjsp_mul_v2r8(_fjsp_sub_v2r8(vvdw12,vvdw6),rinvsq00);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             vvdw             = _fjsp_unpacklo_v2r8(vvdw,_fjsp_setzero_v2r8());
+             vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+             fscal            = _fjsp_add_v2r8(felec,fvdw);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             
+             gmx_fjsp_decrement_fma_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fscal,dx00,dy00,dz00);
+             /* Inner loop uses 59 flops */
+         }
+         /* End of innermost loop */
+         gmx_fjsp_update_iforce_1atom_swizzle_v2r8(fix0,fiy0,fiz0,
+                                               f+i_coord_offset,fshift+i_shift_offset);
+         ggid                        = gid[iidx];
+         /* Update potential energies */
+         gmx_fjsp_update_1pot_v2r8(velecsum,kernel_data->energygrp_elec+ggid);
+         gmx_fjsp_update_1pot_v2r8(vvdwsum,kernel_data->energygrp_vdw+ggid);
+         /* Increment number of inner iterations */
+         inneriter                  += j_index_end - j_index_start;
+         /* Outer loop uses 9 flops */
+     }
+     /* Increment number of outer iterations */
+     outeriter        += nri;
+     /* Update outer/inner flops */
+     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_VF,outeriter*9 + inneriter*59);
+ }
+ /*
+  * Gromacs nonbonded kernel:   nb_kernel_ElecCSTab_VdwLJ_GeomP1P1_F_sparc64_hpc_ace_double
+  * Electrostatics interaction: CubicSplineTable
+  * VdW interaction:            LennardJones
+  * Geometry:                   Particle-Particle
+  * Calculate force/pot:        Force
+  *
+  * Force-only variant: no potential energy sums are formed in this kernel.
+  * For each of the nlist->nri i entries, the j neighbours
+  * jjnr[jindex[iidx]] .. jjnr[jindex[iidx+1]-1] are processed two at a time,
+  * followed by a separate block for a single leftover neighbour.
+  *
+  * Parameters (all accessed through the fields listed, nothing else is touched here):
+  *   nlist       - neighbour list; nri, iinr, jindex, jjnr, shift and gid are read
+  *                 (gid is fetched but not used further in this variant).
+  *   xx          - coordinates, viewed as a flat real array through xx[0].
+  *   ff          - forces, viewed as a flat real array through ff[0]; this is the
+  *                 array handed to the force update helpers.
+  *   fr          - supplies shift_vec, fshift, epsfac, ntype and nbfp.
+  *   mdatoms     - supplies chargeA and typeA.
+  *   kernel_data - supplies table_elec (data and scale) for the electrostatics table.
+  *   nrnb        - flop accounting, passed to inc_nrnb() at the end.
+  */
+ void
+ nb_kernel_ElecCSTab_VdwLJ_GeomP1P1_F_sparc64_hpc_ace_double
+                     (t_nblist * gmx_restrict                nlist,
+                      rvec * gmx_restrict                    xx,
+                      rvec * gmx_restrict                    ff,
+                      t_forcerec * gmx_restrict              fr,
+                      t_mdatoms * gmx_restrict               mdatoms,
+                      nb_kernel_data_t * gmx_restrict        kernel_data,
+                      t_nrnb * gmx_restrict                  nrnb)
+ {
+     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+      * just 0 for non-waters.
+      * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+      * jnr indices corresponding to data put in the two positions in the SIMD register.
+      * Several locals declared below (for example ggid, one_sixth, one_twelfth, gbconv and ewconv)
+      * are never referenced in this kernel; presumably they come from the generator's shared template.
+      */
+     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+     int              jnrA,jnrB;
+     int              j_coord_offsetA,j_coord_offsetB;
+     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+     real             rcutoff_scalar;
+     real             *shiftvec,*fshift,*x,*f;
+     _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+     int              vdwioffset0;
+     _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+     int              vdwjidx0A,vdwjidx0B;
+     _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+     _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+     _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+     real             *charge;
+     int              nvdwtype;
+     _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+     int              *vdwtype;
+     real             *vdwparam;
+     _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+     _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+     _fjsp_v2r8       rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF,twovfeps;
+     real             *vftab;
+     _fjsp_v2r8       itab_tmp;
+     _fjsp_v2r8       dummy_mask,cutoff_mask;
+     _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+     _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+     union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv; /* vfconv: integer access to the two stored table indices */
+     x                = xx[0]; /* flat real view of the coordinate array */
+     f                = ff[0]; /* flat real view of the force array */
+     nri              = nlist->nri;
+     iinr             = nlist->iinr;
+     jindex           = nlist->jindex;
+     jjnr             = nlist->jjnr;
+     shiftidx         = nlist->shift;
+     gid              = nlist->gid;
+     shiftvec         = fr->shift_vec[0];
+     fshift           = fr->fshift[0];
+     facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+     charge           = mdatoms->chargeA;
+     nvdwtype         = fr->ntype;
+     vdwparam         = fr->nbfp;
+     vdwtype          = mdatoms->typeA;
+     vftab            = kernel_data->table_elec->data;
+     vftabscale       = gmx_fjsp_set1_v2r8(kernel_data->table_elec->scale);
+     /* Avoid stupid compiler warnings: give the j indices and offsets a defined value
+      * before the loops (presumably to silence "may be used uninitialised" diagnostics). */
+     jnrA = jnrB = 0;
+     j_coord_offsetA = 0;
+     j_coord_offsetB = 0;
+     outeriter        = 0;
+     inneriter        = 0;
+     /* Start outer loop over neighborlists */
+     for(iidx=0; iidx<nri; iidx++)
+     {
+         /* Offset of this list's shift vector in the flat shift arrays (DIM reals per vector) */
+         i_shift_offset   = DIM*shiftidx[iidx];
+         /* Load limits for loop over neighbors */
+         j_index_start    = jindex[iidx];
+         j_index_end      = jindex[iidx+1];
+         /* Get outer coordinate index */
+         inr              = iinr[iidx];
+         i_coord_offset   = DIM*inr;
+         /* Load i particle coords and add shift vector */
+         gmx_fjsp_load_shift_and_1rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
+         fix0             = _fjsp_setzero_v2r8();
+         fiy0             = _fjsp_setzero_v2r8();
+         fiz0             = _fjsp_setzero_v2r8();
+         /* Load parameters for i particles: the i charge is pre-multiplied by facel (fr->epsfac),
+          * and vdwioffset0 is the start of the i type's row in vdwparam, where each row holds
+          * 2 reals per j type (see the 2*vdwtype[jnr] offsets used below). */
+         iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_load1_v2r8(charge+inr+0));
+         vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+         /* Start inner kernel loop: j neighbours are taken two at a time (suffixes A and B);
+          * an odd leftover entry is handled by the block after this loop. */
+         for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+         {
+             /* Get j neighbor index, and coordinate index */
+             jnrA             = jjnr[jidx];
+             jnrB             = jjnr[jidx+1];
+             j_coord_offsetA  = DIM*jnrA;
+             j_coord_offsetB  = DIM*jnrB;
+             /* load j atom coordinates */
+             gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                               &jx0,&jy0,&jz0);
+             /* Calculate displacement vector */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             /* Calculate squared distance and things based on it */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+             rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+             /* Load parameters for j particles */
+             jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+             vdwjidx0A        = 2*vdwtype[jnrA+0];
+             vdwjidx0B        = 2*vdwtype[jnrB+0];
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+             /* Compute parameters for interactions between i and j atoms */
+             qq00             = _fjsp_mul_v2r8(iq0,jq0);
+             gmx_fjsp_load_2pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,
+                                          vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
+             /* Calculate table index by multiplying r with table scale and truncate to integer.
+              * The stored indices are scaled by 4 because four reals are read per table point
+              * below (two at offset +0 and two at offset +2). */
+             rt               = _fjsp_mul_v2r8(r00,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 4;
+             vfconv.i[1]     *= 4;
+             /* CUBIC SPLINE TABLE ELECTROSTATICS
+              * One table row is loaded per neighbour and transposed; judging by the variable
+              * names, Y,F,G,H then each hold one coefficient for both neighbours (the transpose
+              * macro itself is defined elsewhere). Y is not used after the transpose since no
+              * potential is computed. Assuming _fjsp_madd_v2r8(a,b,c) evaluates a*b+c:
+              *   Fp    = F + eps*(G + eps*H)
+              *   FF    = Fp + eps*(G + 2*eps*H)
+              *   felec = -(qq*FF)*(vftabscale*rinv) */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+             H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+             FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+             felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,FF),_fjsp_mul_v2r8(vftabscale,rinv00)));
+             /* LENNARD-JONES DISPERSION/REPULSION
+              * Assuming _fjsp_msub_v2r8(a,b,c) evaluates a*b-c:
+              *   fvdw = (c12*rinv^6 - c6) * rinv^6 * rinv^2 */
+             rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+             fvdw             = _fjsp_mul_v2r8(_fjsp_msub_v2r8(c12_00,rinvsix,c6_00),_fjsp_mul_v2r8(rinvsix,rinvsq00));
+             fscal            = _fjsp_add_v2r8(felec,fvdw);
+             /* Update vectorial force: accumulate fscal times the displacement on the i atom */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             
+             gmx_fjsp_decrement_fma_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fscal,dx00,dy00,dz00);
+             /* Inner loop uses 50 flops */
+         }
+         if(jidx<j_index_end) /* one j entry is left over when the neighbour count is odd */
+         {
+             jnrA             = jjnr[jidx];
+             j_coord_offsetA  = DIM*jnrA;
+             /* load j atom coordinates */
+             gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                               &jx0,&jy0,&jz0);
+             /* Calculate displacement vector */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             /* Calculate squared distance and things based on it */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+             rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+             /* Load parameters for j particles (only neighbour A exists here) */
+             jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
+             vdwjidx0A        = 2*vdwtype[jnrA+0];
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+             /* Compute parameters for interactions between i and j atoms */
+             qq00             = _fjsp_mul_v2r8(iq0,jq0);
+             gmx_fjsp_load_1pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
+             /* Calculate table index by multiplying r with table scale and truncate to integer.
+              * Both stored indices are scaled by 4, but only vfconv.i[0] is used for loads below. */
+             rt               = _fjsp_mul_v2r8(r00,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 4;
+             vfconv.i[1]     *= 4;
+             /* CUBIC SPLINE TABLE ELECTROSTATICS
+              * Same evaluation as in the paired loop, except that only the table row for
+              * vfconv.i[0] is loaded; zeros take the place of the second neighbour's row. */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+             H                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+             FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+             felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,FF),_fjsp_mul_v2r8(vftabscale,rinv00)));
+             /* LENNARD-JONES DISPERSION/REPULSION (same expression as in the paired loop) */
+             rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+             fvdw             = _fjsp_mul_v2r8(_fjsp_msub_v2r8(c12_00,rinvsix,c6_00),_fjsp_mul_v2r8(rinvsix,rinvsq00));
+             fscal            = _fjsp_add_v2r8(felec,fvdw);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8()); /* presumably zeroes the unused second position so it adds no force */
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             
+             gmx_fjsp_decrement_fma_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fscal,dx00,dy00,dz00);
+             /* Inner loop uses 50 flops */
+         }
+         /* End of innermost loop: pass the accumulated i force to the helper together with the
+          * force and shift-force destinations (presumably it sums the two positions and adds
+          * the result to both arrays; the helper is defined elsewhere). */
+         gmx_fjsp_update_iforce_1atom_swizzle_v2r8(fix0,fiy0,fiz0,
+                                               f+i_coord_offset,fshift+i_shift_offset);
+         /* Increment number of inner iterations (counted per j entry, not per pair) */
+         inneriter                  += j_index_end - j_index_start;
+         /* Outer loop uses 7 flops */
+     }
+     /* Increment number of outer iterations */
+     outeriter        += nri;
+     /* Update outer/inner flops: 7 are charged per i entry and 50 per j entry */
+     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_F,outeriter*7 + inneriter*50);
+ }
index 0000000000000000000000000000000000000000,602513b71edc292a04b7dc115ee47b151da59d3d..602513b71edc292a04b7dc115ee47b151da59d3d
mode 000000,100644..100644
--- /dev/null
@@@ -1,0 -1,1097 +1,1097 @@@
+ /*
+  * This file is part of the GROMACS molecular simulation package.
+  *
+  * Copyright (c) 2012, by the GROMACS development team, led by
+  * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+  * others, as listed in the AUTHORS file in the top-level source
+  * directory and at http://www.gromacs.org.
+  *
+  * GROMACS is free software; you can redistribute it and/or
+  * modify it under the terms of the GNU Lesser General Public License
+  * as published by the Free Software Foundation; either version 2.1
+  * of the License, or (at your option) any later version.
+  *
+  * GROMACS is distributed in the hope that it will be useful,
+  * but WITHOUT ANY WARRANTY; without even the implied warranty of
+  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  * Lesser General Public License for more details.
+  *
+  * You should have received a copy of the GNU Lesser General Public
+  * License along with GROMACS; if not, see
+  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+  *
+  * If you want to redistribute modifications to GROMACS, please
+  * consider that scientific software is very special. Version
+  * control is crucial - bugs must be traceable. We will be happy to
+  * consider code for inclusion in the official distribution, but
+  * derived work must not be called official GROMACS. Details are found
+  * in the README & COPYING files - if they are missing, get the
+  * official version at http://www.gromacs.org.
+  *
+  * To help us fund GROMACS development, we humbly ask that you cite
+  * the research papers on the package. Check out http://www.gromacs.org.
+  */
+ /*
+  * Note: this file was generated by the GROMACS sparc64_hpc_ace_double kernel generator.
+  */
+ #ifdef HAVE_CONFIG_H
+ #include <config.h>
+ #endif
+ #include <math.h>
+ #include "../nb_kernel.h"
+ #include "types/simple.h"
+ #include "vec.h"
+ #include "nrnb.h"
+ #include "kernelutil_sparc64_hpc_ace_double.h"
+ /*
+  * Gromacs nonbonded kernel:   nb_kernel_ElecCSTab_VdwLJ_GeomW3P1_VF_sparc64_hpc_ace_double
+  * Electrostatics interaction: CubicSplineTable
+  * VdW interaction:            LennardJones
+  * Geometry:                   Water3-Particle
+  * Calculate force/pot:        PotentialAndForce
+  */
+ void
+ nb_kernel_ElecCSTab_VdwLJ_GeomW3P1_VF_sparc64_hpc_ace_double
+                     (t_nblist * gmx_restrict                nlist,
+                      rvec * gmx_restrict                    xx,
+                      rvec * gmx_restrict                    ff,
+                      t_forcerec * gmx_restrict              fr,
+                      t_mdatoms * gmx_restrict               mdatoms,
+                      nb_kernel_data_t * gmx_restrict        kernel_data,
+                      t_nrnb * gmx_restrict                  nrnb)
+ {
+     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+      * just 0 for non-waters.
+      * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+      * jnr indices corresponding to data put in the four positions in the SIMD register.
+      */
+     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+     int              jnrA,jnrB;
+     int              j_coord_offsetA,j_coord_offsetB;
+     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+     real             rcutoff_scalar;
+     real             *shiftvec,*fshift,*x,*f;
+     _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+     int              vdwioffset0;
+     _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+     int              vdwioffset1;
+     _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+     int              vdwioffset2;
+     _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+     int              vdwjidx0A,vdwjidx0B;
+     _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+     _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+     _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+     _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+     _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+     real             *charge;
+     int              nvdwtype;
+     _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+     int              *vdwtype;
+     real             *vdwparam;
+     _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+     _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+     _fjsp_v2r8       rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF,twovfeps;
+     real             *vftab;
+     _fjsp_v2r8       itab_tmp;
+     _fjsp_v2r8       dummy_mask,cutoff_mask;
+     _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+     _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+     union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+     x                = xx[0];
+     f                = ff[0];
+     nri              = nlist->nri;
+     iinr             = nlist->iinr;
+     jindex           = nlist->jindex;
+     jjnr             = nlist->jjnr;
+     shiftidx         = nlist->shift;
+     gid              = nlist->gid;
+     shiftvec         = fr->shift_vec[0];
+     fshift           = fr->fshift[0];
+     facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+     charge           = mdatoms->chargeA;
+     nvdwtype         = fr->ntype;
+     vdwparam         = fr->nbfp;
+     vdwtype          = mdatoms->typeA;
+     vftab            = kernel_data->table_elec->data;
+     vftabscale       = gmx_fjsp_set1_v2r8(kernel_data->table_elec->scale);
+     /* Setup water-specific parameters */
+     inr              = nlist->iinr[0];
+     iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+0]));
+     iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+     iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+     vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+     /* Avoid stupid compiler warnings */
+     jnrA = jnrB = 0;
+     j_coord_offsetA = 0;
+     j_coord_offsetB = 0;
+     outeriter        = 0;
+     inneriter        = 0;
+     /* Start outer loop over neighborlists */
+     for(iidx=0; iidx<nri; iidx++)
+     {
+         /* Load shift vector for this list */
+         i_shift_offset   = DIM*shiftidx[iidx];
+         /* Load limits for loop over neighbors */
+         j_index_start    = jindex[iidx];
+         j_index_end      = jindex[iidx+1];
+         /* Get outer coordinate index */
+         inr              = iinr[iidx];
+         i_coord_offset   = DIM*inr;
+         /* Load i particle coords and add shift vector */
+         gmx_fjsp_load_shift_and_3rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                  &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+         fix0             = _fjsp_setzero_v2r8();
+         fiy0             = _fjsp_setzero_v2r8();
+         fiz0             = _fjsp_setzero_v2r8();
+         fix1             = _fjsp_setzero_v2r8();
+         fiy1             = _fjsp_setzero_v2r8();
+         fiz1             = _fjsp_setzero_v2r8();
+         fix2             = _fjsp_setzero_v2r8();
+         fiy2             = _fjsp_setzero_v2r8();
+         fiz2             = _fjsp_setzero_v2r8();
+         /* Reset potential sums */
+         velecsum         = _fjsp_setzero_v2r8();
+         vvdwsum          = _fjsp_setzero_v2r8();
+         /* Start inner kernel loop */
+         for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+         {
+             /* Get j neighbor index, and coordinate index */
+             jnrA             = jjnr[jidx];
+             jnrB             = jjnr[jidx+1];
+             j_coord_offsetA  = DIM*jnrA;
+             j_coord_offsetB  = DIM*jnrB;
+             /* load j atom coordinates */
+             gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                               &jx0,&jy0,&jz0);
+             /* Calculate displacement vector */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             dx10             = _fjsp_sub_v2r8(ix1,jx0);
+             dy10             = _fjsp_sub_v2r8(iy1,jy0);
+             dz10             = _fjsp_sub_v2r8(iz1,jz0);
+             dx20             = _fjsp_sub_v2r8(ix2,jx0);
+             dy20             = _fjsp_sub_v2r8(iy2,jy0);
+             dz20             = _fjsp_sub_v2r8(iz2,jz0);
+             /* Calculate squared distance and things based on it */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+             rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+             rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+             rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+             rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+             rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+             /* Load parameters for j particles */
+             jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+             vdwjidx0A        = 2*vdwtype[jnrA+0];
+             vdwjidx0B        = 2*vdwtype[jnrB+0];
+             fjx0             = _fjsp_setzero_v2r8();
+             fjy0             = _fjsp_setzero_v2r8();
+             fjz0             = _fjsp_setzero_v2r8();
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+             /* Compute parameters for interactions between i and j atoms */
+             qq00             = _fjsp_mul_v2r8(iq0,jq0);
+             gmx_fjsp_load_2pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,
+                                          vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
+             /* Calculate table index by multiplying r with table scale and truncate to integer */
+             rt               = _fjsp_mul_v2r8(r00,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 4;
+             vfconv.i[1]     *= 4;
+             /* CUBIC SPLINE TABLE ELECTROSTATICS */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+             H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+             VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+             velec            = _fjsp_mul_v2r8(qq00,VV);
+             FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+             felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,FF),_fjsp_mul_v2r8(vftabscale,rinv00)));
+             /* LENNARD-JONES DISPERSION/REPULSION */
+             rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+             vvdw6            = _fjsp_mul_v2r8(c6_00,rinvsix);
+             vvdw12           = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
+             vvdw             = _fjsp_msub_v2r8( vvdw12,one_twelfth, _fjsp_mul_v2r8(vvdw6,one_sixth) );
+             fvdw             = _fjsp_mul_v2r8(_fjsp_sub_v2r8(vvdw12,vvdw6),rinvsq00);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+             fscal            = _fjsp_add_v2r8(felec,fvdw);
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             
+             fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+             /* Compute parameters for interactions between i and j atoms */
+             qq10             = _fjsp_mul_v2r8(iq1,jq0);
+             /* Calculate table index by multiplying r with table scale and truncate to integer */
+             rt               = _fjsp_mul_v2r8(r10,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 4;
+             vfconv.i[1]     *= 4;
+             /* CUBIC SPLINE TABLE ELECTROSTATICS */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+             H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+             VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+             velec            = _fjsp_mul_v2r8(qq10,VV);
+             FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+             felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,FF),_fjsp_mul_v2r8(vftabscale,rinv10)));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+             
+             fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+             /* Compute parameters for interactions between i and j atoms */
+             qq20             = _fjsp_mul_v2r8(iq2,jq0);
+             /* Calculate table index by multiplying r with table scale and truncate to integer */
+             rt               = _fjsp_mul_v2r8(r20,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 4;
+             vfconv.i[1]     *= 4;
+             /* CUBIC SPLINE TABLE ELECTROSTATICS */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+             H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+             VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+             velec            = _fjsp_mul_v2r8(qq20,VV);
+             FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+             felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,FF),_fjsp_mul_v2r8(vftabscale,rinv20)));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+             
+             fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+             gmx_fjsp_decrement_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0);
+             /* Inner loop uses 154 flops */
+         }
+         if(jidx<j_index_end)
+         {
+             jnrA             = jjnr[jidx];
+             j_coord_offsetA  = DIM*jnrA;
+             /* load j atom coordinates */
+             gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                               &jx0,&jy0,&jz0);
+             /* Calculate displacement vector */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             dx10             = _fjsp_sub_v2r8(ix1,jx0);
+             dy10             = _fjsp_sub_v2r8(iy1,jy0);
+             dz10             = _fjsp_sub_v2r8(iz1,jz0);
+             dx20             = _fjsp_sub_v2r8(ix2,jx0);
+             dy20             = _fjsp_sub_v2r8(iy2,jy0);
+             dz20             = _fjsp_sub_v2r8(iz2,jz0);
+             /* Calculate squared distance and things based on it */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+             rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+             rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+             rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+             rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+             rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+             /* Load parameters for j particles */
+             jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
+             vdwjidx0A        = 2*vdwtype[jnrA+0];
+             fjx0             = _fjsp_setzero_v2r8();
+             fjy0             = _fjsp_setzero_v2r8();
+             fjz0             = _fjsp_setzero_v2r8();
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+             /* Compute parameters for interactions between i and j atoms */
+             qq00             = _fjsp_mul_v2r8(iq0,jq0);
+             gmx_fjsp_load_1pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
+             /* Calculate table index by multiplying r with table scale and truncate to integer */
+             rt               = _fjsp_mul_v2r8(r00,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 4;
+             vfconv.i[1]     *= 4;
+             /* CUBIC SPLINE TABLE ELECTROSTATICS */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+             H                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+             VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+             velec            = _fjsp_mul_v2r8(qq00,VV);
+             FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+             felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,FF),_fjsp_mul_v2r8(vftabscale,rinv00)));
+             /* LENNARD-JONES DISPERSION/REPULSION */
+             rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+             vvdw6            = _fjsp_mul_v2r8(c6_00,rinvsix);
+             vvdw12           = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
+             vvdw             = _fjsp_msub_v2r8( vvdw12,one_twelfth, _fjsp_mul_v2r8(vvdw6,one_sixth) );
+             fvdw             = _fjsp_mul_v2r8(_fjsp_sub_v2r8(vvdw12,vvdw6),rinvsq00);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             vvdw             = _fjsp_unpacklo_v2r8(vvdw,_fjsp_setzero_v2r8());
+             vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+             fscal            = _fjsp_add_v2r8(felec,fvdw);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             
+             fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+             /* Compute parameters for interactions between i and j atoms */
+             qq10             = _fjsp_mul_v2r8(iq1,jq0);
+             /* Calculate table index by multiplying r with table scale and truncate to integer */
+             rt               = _fjsp_mul_v2r8(r10,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 4;
+             vfconv.i[1]     *= 4;
+             /* CUBIC SPLINE TABLE ELECTROSTATICS */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+             H                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+             VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+             velec            = _fjsp_mul_v2r8(qq10,VV);
+             FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+             felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,FF),_fjsp_mul_v2r8(vftabscale,rinv10)));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+             
+             fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+             /* Compute parameters for interactions between i and j atoms */
+             qq20             = _fjsp_mul_v2r8(iq2,jq0);
+             /* Calculate table index by multiplying r with table scale and truncate to integer */
+             rt               = _fjsp_mul_v2r8(r20,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 4;
+             vfconv.i[1]     *= 4;
+             /* CUBIC SPLINE TABLE ELECTROSTATICS */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+             H                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+             VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+             velec            = _fjsp_mul_v2r8(qq20,VV);
+             FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+             felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,FF),_fjsp_mul_v2r8(vftabscale,rinv20)));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+             
+             fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+             gmx_fjsp_decrement_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0);
+             /* Inner loop uses 154 flops */
+         }
+         /* End of innermost loop */
+         gmx_fjsp_update_iforce_3atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                               f+i_coord_offset,fshift+i_shift_offset);
+         ggid                        = gid[iidx];
+         /* Update potential energies */
+         gmx_fjsp_update_1pot_v2r8(velecsum,kernel_data->energygrp_elec+ggid);
+         gmx_fjsp_update_1pot_v2r8(vvdwsum,kernel_data->energygrp_vdw+ggid);
+         /* Increment number of inner iterations */
+         inneriter                  += j_index_end - j_index_start;
+         /* Outer loop uses 20 flops */
+     }
+     /* Increment number of outer iterations */
+     outeriter        += nri;
+     /* Update outer/inner flops */
+     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3_VF,outeriter*20 + inneriter*154);
+ }
+ /*
+  * Gromacs nonbonded kernel:   nb_kernel_ElecCSTab_VdwLJ_GeomW3P1_F_sparc64_hpc_ace_double
+  * Electrostatics interaction: CubicSplineTable
+  * VdW interaction:            LennardJones
+  * Geometry:                   Water3-Particle
+  * Calculate force/pot:        Force
+  *
+  * Force-only variant: no potential energies are accumulated or written here;
+  * only f, fshift and the flop counter in nrnb are updated.
+  *
+  * Inputs as used by this function:
+  *   nlist        neighbor list; reads nri, iinr, jindex, jjnr, shift and gid
+  *   xx           coordinates, accessed as a flat real array through xx[0]
+  *   ff           forces, accessed as a flat real array through ff[0]; i and j
+  *                contributions are applied by the gmx_fjsp_* helper routines
+  *   fr           reads shift_vec, fshift, epsfac, ntype and nbfp
+  *   mdatoms      reads chargeA and typeA
+  *   kernel_data  reads table_elec->data and table_elec->scale
+  *   nrnb         flop accounting, updated once at the end through inc_nrnb()
+  *
+  * For each i entry the three i atoms (suffixes 0,1,2) interact with every j
+  * atom in the list. All three pairs get tabulated electrostatics; only the
+  * pair involving i atom 0 additionally gets a Lennard-Jones term (only
+  * c6_00/c12_00 are ever loaded).
+  */
+ void
+ nb_kernel_ElecCSTab_VdwLJ_GeomW3P1_F_sparc64_hpc_ace_double
+                     (t_nblist * gmx_restrict                nlist,
+                      rvec * gmx_restrict                    xx,
+                      rvec * gmx_restrict                    ff,
+                      t_forcerec * gmx_restrict              fr,
+                      t_mdatoms * gmx_restrict               mdatoms,
+                      nb_kernel_data_t * gmx_restrict        kernel_data,
+                      t_nrnb * gmx_restrict                  nrnb)
+ {
+     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+      * just 0 for non-waters.
+      * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+      * jnr indices corresponding to data put in the two positions in the SIMD register.
+      *
+      * Note: this declaration list is shared boilerplate; a number of the locals
+      * below (e.g. velecsum, vvdwsum, VV, ggid, one_sixth, one_twelfth, gbconv,
+      * ewconv) are never referenced in this force-only kernel.
+      */
+     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+     int              jnrA,jnrB;
+     int              j_coord_offsetA,j_coord_offsetB;
+     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+     real             rcutoff_scalar;
+     real             *shiftvec,*fshift,*x,*f;
+     _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+     int              vdwioffset0;
+     _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+     int              vdwioffset1;
+     _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+     int              vdwioffset2;
+     _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+     int              vdwjidx0A,vdwjidx0B;
+     _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+     _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+     _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+     _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+     _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+     real             *charge;
+     int              nvdwtype;
+     _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+     int              *vdwtype;
+     real             *vdwparam;
+     _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+     _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+     _fjsp_v2r8       rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF,twovfeps;
+     real             *vftab;
+     _fjsp_v2r8       itab_tmp;
+     _fjsp_v2r8       dummy_mask,cutoff_mask;
+     _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+     _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+     union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+     x                = xx[0];
+     f                = ff[0];
+     nri              = nlist->nri;
+     iinr             = nlist->iinr;
+     jindex           = nlist->jindex;
+     jjnr             = nlist->jjnr;
+     shiftidx         = nlist->shift;
+     gid              = nlist->gid;
+     shiftvec         = fr->shift_vec[0];
+     fshift           = fr->fshift[0];
+     facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+     charge           = mdatoms->chargeA;
+     nvdwtype         = fr->ntype;
+     vdwparam         = fr->nbfp;
+     vdwtype          = mdatoms->typeA;
+     vftab            = kernel_data->table_elec->data;
+     vftabscale       = gmx_fjsp_set1_v2r8(kernel_data->table_elec->scale);
+     /* Setup water-specific parameters.
+      * The i-atom charges (pre-multiplied by facel) and the LJ row offset are
+      * read once here, from the first entry nlist->iinr[0], and reused for
+      * every i entry in the outer loop. NOTE(review): this presumably relies on
+      * all i molecules in the list having identical parameters, and reads
+      * iinr[0] even when nri is 0 - confirm against the list construction.
+      */
+     inr              = nlist->iinr[0];
+     iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+0]));
+     iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+     iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+     vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+     /* Avoid stupid compiler warnings */
+     jnrA = jnrB = 0;
+     j_coord_offsetA = 0;
+     j_coord_offsetB = 0;
+     outeriter        = 0;
+     inneriter        = 0;
+     /* Start outer loop over neighborlists */
+     for(iidx=0; iidx<nri; iidx++)
+     {
+         /* Load shift vector for this list */
+         i_shift_offset   = DIM*shiftidx[iidx];
+         /* Load limits for loop over neighbors */
+         j_index_start    = jindex[iidx];
+         j_index_end      = jindex[iidx+1];
+         /* Get outer coordinate index */
+         inr              = iinr[iidx];
+         i_coord_offset   = DIM*inr;
+         /* Load i particle coords and add shift vector */
+         gmx_fjsp_load_shift_and_3rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                  &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+         fix0             = _fjsp_setzero_v2r8();
+         fiy0             = _fjsp_setzero_v2r8();
+         fiz0             = _fjsp_setzero_v2r8();
+         fix1             = _fjsp_setzero_v2r8();
+         fiy1             = _fjsp_setzero_v2r8();
+         fiz1             = _fjsp_setzero_v2r8();
+         fix2             = _fjsp_setzero_v2r8();
+         fiy2             = _fjsp_setzero_v2r8();
+         fiz2             = _fjsp_setzero_v2r8();
+         /* Start inner kernel loop.
+          * Two j entries (A and B) are handled per iteration; the loop stops
+          * while at least two entries remain, so an odd leftover entry is
+          * handled by the separate block after the loop.
+          */
+         for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+         {
+             /* Get j neighbor index, and coordinate index */
+             jnrA             = jjnr[jidx];
+             jnrB             = jjnr[jidx+1];
+             j_coord_offsetA  = DIM*jnrA;
+             j_coord_offsetB  = DIM*jnrB;
+             /* load j atom coordinates */
+             gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                               &jx0,&jy0,&jz0);
+             /* Calculate displacement vector */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             dx10             = _fjsp_sub_v2r8(ix1,jx0);
+             dy10             = _fjsp_sub_v2r8(iy1,jy0);
+             dz10             = _fjsp_sub_v2r8(iz1,jz0);
+             dx20             = _fjsp_sub_v2r8(ix2,jx0);
+             dy20             = _fjsp_sub_v2r8(iy2,jy0);
+             dz20             = _fjsp_sub_v2r8(iz2,jz0);
+             /* Calculate squared distance and things based on it */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+             rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+             rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+             rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+             rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+             rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+             /* Load parameters for j particles */
+             jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+             vdwjidx0A        = 2*vdwtype[jnrA+0];
+             vdwjidx0B        = 2*vdwtype[jnrB+0];
+             fjx0             = _fjsp_setzero_v2r8();
+             fjy0             = _fjsp_setzero_v2r8();
+             fjz0             = _fjsp_setzero_v2r8();
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+             /* Compute parameters for interactions between i and j atoms */
+             qq00             = _fjsp_mul_v2r8(iq0,jq0);
+             gmx_fjsp_load_2pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,
+                                          vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
+             /* Calculate table index by multiplying r with table scale and truncate to integer.
+              * vfeps is the remainder rt minus the converted-back index. The two
+              * indices are extracted through the vfconv union and scaled by 4,
+              * matching the four values (Y,F,G,H) read per table point below.
+              */
+             rt               = _fjsp_mul_v2r8(r00,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 4;
+             vfconv.i[1]     *= 4;
+             /* CUBIC SPLINE TABLE ELECTROSTATICS
+              * Loads are done per j entry (offsets +0 and +2 from each table
+              * index) and then rearranged with GMX_FJSP_TRANSPOSE2_V2R8 so that
+              * Y,F,G,H each hold one coefficient for both entries (presumably a
+              * 2x2 transpose - confirm against the macro definition).
+              * Only FF (the force part) is evaluated; VV is not computed here.
+              */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+             H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+             FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+             felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,FF),_fjsp_mul_v2r8(vftabscale,rinv00)));
+             /* LENNARD-JONES DISPERSION/REPULSION
+              * Applied to the 00 pair only; the 10 and 20 pairs below have
+              * electrostatics alone.
+              */
+             rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+             fvdw             = _fjsp_mul_v2r8(_fjsp_msub_v2r8(c12_00,rinvsix,c6_00),_fjsp_mul_v2r8(rinvsix,rinvsq00));
+             fscal            = _fjsp_add_v2r8(felec,fvdw);
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             
+             fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+             /* Compute parameters for interactions between i and j atoms */
+             qq10             = _fjsp_mul_v2r8(iq1,jq0);
+             /* Calculate table index by multiplying r with table scale and truncate to integer */
+             rt               = _fjsp_mul_v2r8(r10,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 4;
+             vfconv.i[1]     *= 4;
+             /* CUBIC SPLINE TABLE ELECTROSTATICS */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+             H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+             FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+             felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,FF),_fjsp_mul_v2r8(vftabscale,rinv10)));
+             fscal            = felec;
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+             
+             fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+             /* Compute parameters for interactions between i and j atoms */
+             qq20             = _fjsp_mul_v2r8(iq2,jq0);
+             /* Calculate table index by multiplying r with table scale and truncate to integer */
+             rt               = _fjsp_mul_v2r8(r20,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 4;
+             vfconv.i[1]     *= 4;
+             /* CUBIC SPLINE TABLE ELECTROSTATICS */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+             H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+             FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+             felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,FF),_fjsp_mul_v2r8(vftabscale,rinv20)));
+             fscal            = felec;
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+             
+             fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+             gmx_fjsp_decrement_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0);
+             /* Inner loop uses 137 flops */
+         }
+         if(jidx<j_index_end)
+         {
+             jnrA             = jjnr[jidx];
+             j_coord_offsetA  = DIM*jnrA;
+             /* Remainder case: exactly one j entry is left over after the
+              * two-at-a-time loop above (odd number of neighbors). Only the A
+              * entry is real; the single-pointer helper variants are used.
+              */
+             /* load j atom coordinates */
+             gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                               &jx0,&jy0,&jz0);
+             /* Calculate displacement vector */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             dx10             = _fjsp_sub_v2r8(ix1,jx0);
+             dy10             = _fjsp_sub_v2r8(iy1,jy0);
+             dz10             = _fjsp_sub_v2r8(iz1,jz0);
+             dx20             = _fjsp_sub_v2r8(ix2,jx0);
+             dy20             = _fjsp_sub_v2r8(iy2,jy0);
+             dz20             = _fjsp_sub_v2r8(iz2,jz0);
+             /* Calculate squared distance and things based on it */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+             rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+             rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+             rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+             rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+             rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+             /* Load parameters for j particles */
+             jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
+             vdwjidx0A        = 2*vdwtype[jnrA+0];
+             fjx0             = _fjsp_setzero_v2r8();
+             fjy0             = _fjsp_setzero_v2r8();
+             fjz0             = _fjsp_setzero_v2r8();
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+             /* Compute parameters for interactions between i and j atoms */
+             qq00             = _fjsp_mul_v2r8(iq0,jq0);
+             gmx_fjsp_load_1pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
+             /* Calculate table index by multiplying r with table scale and truncate to integer */
+             rt               = _fjsp_mul_v2r8(r00,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 4;
+             vfconv.i[1]     *= 4;
+             /* CUBIC SPLINE TABLE ELECTROSTATICS
+              * Only vfconv.i[0] is used for table loads in this remainder block;
+              * the second operand of each transpose is a zero vector instead of
+              * a load for a B entry.
+              */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+             H                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+             FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+             felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,FF),_fjsp_mul_v2r8(vftabscale,rinv00)));
+             /* LENNARD-JONES DISPERSION/REPULSION */
+             rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+             fvdw             = _fjsp_mul_v2r8(_fjsp_msub_v2r8(c12_00,rinvsix,c6_00),_fjsp_mul_v2r8(rinvsix,rinvsq00));
+             fscal            = _fjsp_add_v2r8(felec,fvdw);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force.
+              * fscal has just been combined with a zero vector through
+              * _fjsp_unpacklo_v2r8, presumably so that the unused second lane
+              * contributes nothing to the i-force sums - TODO confirm against
+              * the intrinsic's definition. The same pattern repeats for the
+              * 10 and 20 pairs below.
+              */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             
+             fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+             /* Compute parameters for interactions between i and j atoms */
+             qq10             = _fjsp_mul_v2r8(iq1,jq0);
+             /* Calculate table index by multiplying r with table scale and truncate to integer */
+             rt               = _fjsp_mul_v2r8(r10,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 4;
+             vfconv.i[1]     *= 4;
+             /* CUBIC SPLINE TABLE ELECTROSTATICS */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+             H                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+             FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+             felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,FF),_fjsp_mul_v2r8(vftabscale,rinv10)));
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+             
+             fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+             /* Compute parameters for interactions between i and j atoms */
+             qq20             = _fjsp_mul_v2r8(iq2,jq0);
+             /* Calculate table index by multiplying r with table scale and truncate to integer */
+             rt               = _fjsp_mul_v2r8(r20,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 4;
+             vfconv.i[1]     *= 4;
+             /* CUBIC SPLINE TABLE ELECTROSTATICS */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+             H                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+             FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+             felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,FF),_fjsp_mul_v2r8(vftabscale,rinv20)));
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+             
+             fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+             gmx_fjsp_decrement_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0);
+             /* Inner loop uses 137 flops */
+         }
+         /* End of innermost loop.
+          * Hand the accumulated i-atom force vectors for all three i atoms to
+          * the helper, together with the f and fshift destinations for this
+          * i entry.
+          */
+         gmx_fjsp_update_iforce_3atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                               f+i_coord_offset,fshift+i_shift_offset);
+         /* Increment number of inner iterations (counts j entries, not SIMD iterations) */
+         inneriter                  += j_index_end - j_index_start;
+         /* Outer loop uses 18 flops */
+     }
+     /* Increment number of outer iterations */
+     outeriter        += nri;
+     /* Update outer/inner flops: 18 per i entry plus 137 per j entry, matching
+      * the per-loop flop counts noted in the comments above.
+      */
+     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3_F,outeriter*18 + inneriter*137);
+ }
index 0000000000000000000000000000000000000000,18e4b9a830ccf1d15d62b3f30da8155577add38b..18e4b9a830ccf1d15d62b3f30da8155577add38b
mode 000000,100644..100644
--- /dev/null
@@@ -1,0 -1,2235 +1,2235 @@@
+ /*
+  * This file is part of the GROMACS molecular simulation package.
+  *
+  * Copyright (c) 2012, by the GROMACS development team, led by
+  * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+  * others, as listed in the AUTHORS file in the top-level source
+  * directory and at http://www.gromacs.org.
+  *
+  * GROMACS is free software; you can redistribute it and/or
+  * modify it under the terms of the GNU Lesser General Public License
+  * as published by the Free Software Foundation; either version 2.1
+  * of the License, or (at your option) any later version.
+  *
+  * GROMACS is distributed in the hope that it will be useful,
+  * but WITHOUT ANY WARRANTY; without even the implied warranty of
+  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  * Lesser General Public License for more details.
+  *
+  * You should have received a copy of the GNU Lesser General Public
+  * License along with GROMACS; if not, see
+  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+  *
+  * If you want to redistribute modifications to GROMACS, please
+  * consider that scientific software is very special. Version
+  * control is crucial - bugs must be traceable. We will be happy to
+  * consider code for inclusion in the official distribution, but
+  * derived work must not be called official GROMACS. Details are found
+  * in the README & COPYING files - if they are missing, get the
+  * official version at http://www.gromacs.org.
+  *
+  * To help us fund GROMACS development, we humbly ask that you cite
+  * the research papers on the package. Check out http://www.gromacs.org.
+  */
+ /*
+  * Note: this file was generated by the GROMACS sparc64_hpc_ace_double kernel generator.
+  */
+ #ifdef HAVE_CONFIG_H
+ #include <config.h>
+ #endif
+ #include <math.h>
+ #include "../nb_kernel.h"
+ #include "types/simple.h"
+ #include "vec.h"
+ #include "nrnb.h"
+ #include "kernelutil_sparc64_hpc_ace_double.h"
+ /*
+  * Gromacs nonbonded kernel:   nb_kernel_ElecCSTab_VdwLJ_GeomW3W3_VF_sparc64_hpc_ace_double
+  * Electrostatics interaction: CubicSplineTable
+  * VdW interaction:            LennardJones
+  * Geometry:                   Water3-Water3
+  * Calculate force/pot:        PotentialAndForce
+  */
+ void
+ nb_kernel_ElecCSTab_VdwLJ_GeomW3W3_VF_sparc64_hpc_ace_double
+                     (t_nblist * gmx_restrict                nlist,
+                      rvec * gmx_restrict                    xx,
+                      rvec * gmx_restrict                    ff,
+                      t_forcerec * gmx_restrict              fr,
+                      t_mdatoms * gmx_restrict               mdatoms,
+                      nb_kernel_data_t * gmx_restrict        kernel_data,
+                      t_nrnb * gmx_restrict                  nrnb)
+ {
+     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+      * just 0 for non-waters.
+      * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+      * jnr indices corresponding to data put in the two positions in the SIMD register.
+      */
+     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+     int              jnrA,jnrB;
+     int              j_coord_offsetA,j_coord_offsetB;
+     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+     real             rcutoff_scalar;
+     real             *shiftvec,*fshift,*x,*f;
+     _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+     int              vdwioffset0;
+     _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+     int              vdwioffset1;
+     _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+     int              vdwioffset2;
+     _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+     int              vdwjidx0A,vdwjidx0B;
+     _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+     int              vdwjidx1A,vdwjidx1B;
+     _fjsp_v2r8       jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+     int              vdwjidx2A,vdwjidx2B;
+     _fjsp_v2r8       jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+     _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+     _fjsp_v2r8       dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
+     _fjsp_v2r8       dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
+     _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+     _fjsp_v2r8       dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+     _fjsp_v2r8       dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+     _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+     _fjsp_v2r8       dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+     _fjsp_v2r8       dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+     _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+     real             *charge;
+     int              nvdwtype;
+     _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+     int              *vdwtype;
+     real             *vdwparam;
+     _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+     _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+     _fjsp_v2r8       rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF,twovfeps;
+     real             *vftab;
+     _fjsp_v2r8       itab_tmp;
+     _fjsp_v2r8       dummy_mask,cutoff_mask;
+     _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+     _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+     union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+     x                = xx[0];
+     f                = ff[0];
+     nri              = nlist->nri;
+     iinr             = nlist->iinr;
+     jindex           = nlist->jindex;
+     jjnr             = nlist->jjnr;
+     shiftidx         = nlist->shift;
+     gid              = nlist->gid;
+     shiftvec         = fr->shift_vec[0];
+     fshift           = fr->fshift[0];
+     facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+     charge           = mdatoms->chargeA;
+     nvdwtype         = fr->ntype;
+     vdwparam         = fr->nbfp;
+     vdwtype          = mdatoms->typeA;
+     vftab            = kernel_data->table_elec->data;
+     vftabscale       = gmx_fjsp_set1_v2r8(kernel_data->table_elec->scale);
+     /* Setup water-specific parameters */
+     inr              = nlist->iinr[0];
+     iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+0]));
+     iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+     iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+     vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+     jq0              = gmx_fjsp_set1_v2r8(charge[inr+0]);
+     jq1              = gmx_fjsp_set1_v2r8(charge[inr+1]);
+     jq2              = gmx_fjsp_set1_v2r8(charge[inr+2]);
+     vdwjidx0A        = 2*vdwtype[inr+0];
+     qq00             = _fjsp_mul_v2r8(iq0,jq0);
+     c6_00            = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset0+vdwjidx0A]);
+     c12_00           = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset0+vdwjidx0A+1]);
+     qq01             = _fjsp_mul_v2r8(iq0,jq1);
+     qq02             = _fjsp_mul_v2r8(iq0,jq2);
+     qq10             = _fjsp_mul_v2r8(iq1,jq0);
+     qq11             = _fjsp_mul_v2r8(iq1,jq1);
+     qq12             = _fjsp_mul_v2r8(iq1,jq2);
+     qq20             = _fjsp_mul_v2r8(iq2,jq0);
+     qq21             = _fjsp_mul_v2r8(iq2,jq1);
+     qq22             = _fjsp_mul_v2r8(iq2,jq2);
+     /* Avoid stupid compiler warnings */
+     jnrA = jnrB = 0;
+     j_coord_offsetA = 0;
+     j_coord_offsetB = 0;
+     outeriter        = 0;
+     inneriter        = 0;
+     /* Start outer loop over neighborlists */
+     for(iidx=0; iidx<nri; iidx++)
+     {
+         /* Load shift vector for this list */
+         i_shift_offset   = DIM*shiftidx[iidx];
+         /* Load limits for loop over neighbors */
+         j_index_start    = jindex[iidx];
+         j_index_end      = jindex[iidx+1];
+         /* Get outer coordinate index */
+         inr              = iinr[iidx];
+         i_coord_offset   = DIM*inr;
+         /* Load i particle coords and add shift vector */
+         gmx_fjsp_load_shift_and_3rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                  &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+         fix0             = _fjsp_setzero_v2r8();
+         fiy0             = _fjsp_setzero_v2r8();
+         fiz0             = _fjsp_setzero_v2r8();
+         fix1             = _fjsp_setzero_v2r8();
+         fiy1             = _fjsp_setzero_v2r8();
+         fiz1             = _fjsp_setzero_v2r8();
+         fix2             = _fjsp_setzero_v2r8();
+         fiy2             = _fjsp_setzero_v2r8();
+         fiz2             = _fjsp_setzero_v2r8();
+         /* Reset potential sums */
+         velecsum         = _fjsp_setzero_v2r8();
+         vvdwsum          = _fjsp_setzero_v2r8();
+         /* Start inner kernel loop */
+         for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+         {
+             /* Get j neighbor index, and coordinate index */
+             jnrA             = jjnr[jidx];
+             jnrB             = jjnr[jidx+1];
+             j_coord_offsetA  = DIM*jnrA;
+             j_coord_offsetB  = DIM*jnrB;
+             /* load j atom coordinates */
+             gmx_fjsp_load_3rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+             /* Calculate displacement vector */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             dx01             = _fjsp_sub_v2r8(ix0,jx1);
+             dy01             = _fjsp_sub_v2r8(iy0,jy1);
+             dz01             = _fjsp_sub_v2r8(iz0,jz1);
+             dx02             = _fjsp_sub_v2r8(ix0,jx2);
+             dy02             = _fjsp_sub_v2r8(iy0,jy2);
+             dz02             = _fjsp_sub_v2r8(iz0,jz2);
+             dx10             = _fjsp_sub_v2r8(ix1,jx0);
+             dy10             = _fjsp_sub_v2r8(iy1,jy0);
+             dz10             = _fjsp_sub_v2r8(iz1,jz0);
+             dx11             = _fjsp_sub_v2r8(ix1,jx1);
+             dy11             = _fjsp_sub_v2r8(iy1,jy1);
+             dz11             = _fjsp_sub_v2r8(iz1,jz1);
+             dx12             = _fjsp_sub_v2r8(ix1,jx2);
+             dy12             = _fjsp_sub_v2r8(iy1,jy2);
+             dz12             = _fjsp_sub_v2r8(iz1,jz2);
+             dx20             = _fjsp_sub_v2r8(ix2,jx0);
+             dy20             = _fjsp_sub_v2r8(iy2,jy0);
+             dz20             = _fjsp_sub_v2r8(iz2,jz0);
+             dx21             = _fjsp_sub_v2r8(ix2,jx1);
+             dy21             = _fjsp_sub_v2r8(iy2,jy1);
+             dz21             = _fjsp_sub_v2r8(iz2,jz1);
+             dx22             = _fjsp_sub_v2r8(ix2,jx2);
+             dy22             = _fjsp_sub_v2r8(iy2,jy2);
+             dz22             = _fjsp_sub_v2r8(iz2,jz2);
+             /* Calculate squared distance and things based on it */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rsq01            = gmx_fjsp_calc_rsq_v2r8(dx01,dy01,dz01);
+             rsq02            = gmx_fjsp_calc_rsq_v2r8(dx02,dy02,dz02);
+             rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+             rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+             rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+             rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+             rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+             rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+             rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+             rinv01           = gmx_fjsp_invsqrt_v2r8(rsq01);
+             rinv02           = gmx_fjsp_invsqrt_v2r8(rsq02);
+             rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+             rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+             rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+             rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+             rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+             rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+             rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+             fjx0             = _fjsp_setzero_v2r8();
+             fjy0             = _fjsp_setzero_v2r8();
+             fjz0             = _fjsp_setzero_v2r8();
+             fjx1             = _fjsp_setzero_v2r8();
+             fjy1             = _fjsp_setzero_v2r8();
+             fjz1             = _fjsp_setzero_v2r8();
+             fjx2             = _fjsp_setzero_v2r8();
+             fjy2             = _fjsp_setzero_v2r8();
+             fjz2             = _fjsp_setzero_v2r8();
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+             /* Calculate table index by multiplying r with table scale and truncate to integer */
+             rt               = _fjsp_mul_v2r8(r00,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 4;
+             vfconv.i[1]     *= 4;
+             /* CUBIC SPLINE TABLE ELECTROSTATICS */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+             H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+             VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+             velec            = _fjsp_mul_v2r8(qq00,VV);
+             FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+             felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,FF),_fjsp_mul_v2r8(vftabscale,rinv00)));
+             /* LENNARD-JONES DISPERSION/REPULSION */
+             rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+             vvdw6            = _fjsp_mul_v2r8(c6_00,rinvsix);
+             vvdw12           = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
+             vvdw             = _fjsp_msub_v2r8( vvdw12,one_twelfth, _fjsp_mul_v2r8(vvdw6,one_sixth) );
+             fvdw             = _fjsp_mul_v2r8(_fjsp_sub_v2r8(vvdw12,vvdw6),rinvsq00);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+             fscal            = _fjsp_add_v2r8(felec,fvdw);
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             
+             fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r01              = _fjsp_mul_v2r8(rsq01,rinv01);
+             /* Calculate table index by multiplying r with table scale and truncate to integer */
+             rt               = _fjsp_mul_v2r8(r01,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 4;
+             vfconv.i[1]     *= 4;
+             /* CUBIC SPLINE TABLE ELECTROSTATICS */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+             H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+             VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+             velec            = _fjsp_mul_v2r8(qq01,VV);
+             FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+             felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq01,FF),_fjsp_mul_v2r8(vftabscale,rinv01)));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx01,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy01,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz01,fscal,fiz0);
+             
+             fjx1             = _fjsp_madd_v2r8(dx01,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy01,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz01,fscal,fjz1);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r02              = _fjsp_mul_v2r8(rsq02,rinv02);
+             /* Calculate table index by multiplying r with table scale and truncate to integer */
+             rt               = _fjsp_mul_v2r8(r02,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 4;
+             vfconv.i[1]     *= 4;
+             /* CUBIC SPLINE TABLE ELECTROSTATICS */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+             H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+             VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+             velec            = _fjsp_mul_v2r8(qq02,VV);
+             FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+             felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq02,FF),_fjsp_mul_v2r8(vftabscale,rinv02)));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx02,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy02,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz02,fscal,fiz0);
+             
+             fjx2             = _fjsp_madd_v2r8(dx02,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy02,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz02,fscal,fjz2);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+             /* Calculate table index by multiplying r with table scale and truncate to integer */
+             rt               = _fjsp_mul_v2r8(r10,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 4;
+             vfconv.i[1]     *= 4;
+             /* CUBIC SPLINE TABLE ELECTROSTATICS */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+             H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+             VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+             velec            = _fjsp_mul_v2r8(qq10,VV);
+             FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+             felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,FF),_fjsp_mul_v2r8(vftabscale,rinv10)));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+             
+             fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r11              = _fjsp_mul_v2r8(rsq11,rinv11);
+             /* Calculate table index by multiplying r with table scale and truncate to integer */
+             rt               = _fjsp_mul_v2r8(r11,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 4;
+             vfconv.i[1]     *= 4;
+             /* CUBIC SPLINE TABLE ELECTROSTATICS */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+             H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+             VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+             velec            = _fjsp_mul_v2r8(qq11,VV);
+             FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+             felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq11,FF),_fjsp_mul_v2r8(vftabscale,rinv11)));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+             
+             fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r12              = _fjsp_mul_v2r8(rsq12,rinv12);
+             /* Calculate table index by multiplying r with table scale and truncate to integer */
+             rt               = _fjsp_mul_v2r8(r12,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 4;
+             vfconv.i[1]     *= 4;
+             /* CUBIC SPLINE TABLE ELECTROSTATICS */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+             H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+             VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+             velec            = _fjsp_mul_v2r8(qq12,VV);
+             FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+             felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq12,FF),_fjsp_mul_v2r8(vftabscale,rinv12)));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+             
+             fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+             /* Calculate table index by multiplying r with table scale and truncate to integer */
+             rt               = _fjsp_mul_v2r8(r20,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 4;
+             vfconv.i[1]     *= 4;
+             /* CUBIC SPLINE TABLE ELECTROSTATICS */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+             H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+             VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+             velec            = _fjsp_mul_v2r8(qq20,VV);
+             FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+             felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,FF),_fjsp_mul_v2r8(vftabscale,rinv20)));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+             
+             fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r21              = _fjsp_mul_v2r8(rsq21,rinv21);
+             /* Calculate table index by multiplying r with table scale and truncate to integer */
+             rt               = _fjsp_mul_v2r8(r21,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 4;
+             vfconv.i[1]     *= 4;
+             /* CUBIC SPLINE TABLE ELECTROSTATICS */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+             H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+             VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+             velec            = _fjsp_mul_v2r8(qq21,VV);
+             FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+             felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq21,FF),_fjsp_mul_v2r8(vftabscale,rinv21)));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+             
+             fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r22              = _fjsp_mul_v2r8(rsq22,rinv22);
+             /* Calculate table index by multiplying r with table scale and truncate to integer */
+             rt               = _fjsp_mul_v2r8(r22,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 4;
+             vfconv.i[1]     *= 4;
+             /* CUBIC SPLINE TABLE ELECTROSTATICS */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+             H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+             VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+             velec            = _fjsp_mul_v2r8(qq22,VV);
+             FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+             felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq22,FF),_fjsp_mul_v2r8(vftabscale,rinv22)));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+             
+             fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+             gmx_fjsp_decrement_3rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+             /* Inner loop uses 427 flops */
+         }
+         if(jidx<j_index_end)
+         {
+             jnrA             = jjnr[jidx];
+             j_coord_offsetA  = DIM*jnrA;
+             /* load j atom coordinates */
+             gmx_fjsp_load_3rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+             /* Calculate displacement vector */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             dx01             = _fjsp_sub_v2r8(ix0,jx1);
+             dy01             = _fjsp_sub_v2r8(iy0,jy1);
+             dz01             = _fjsp_sub_v2r8(iz0,jz1);
+             dx02             = _fjsp_sub_v2r8(ix0,jx2);
+             dy02             = _fjsp_sub_v2r8(iy0,jy2);
+             dz02             = _fjsp_sub_v2r8(iz0,jz2);
+             dx10             = _fjsp_sub_v2r8(ix1,jx0);
+             dy10             = _fjsp_sub_v2r8(iy1,jy0);
+             dz10             = _fjsp_sub_v2r8(iz1,jz0);
+             dx11             = _fjsp_sub_v2r8(ix1,jx1);
+             dy11             = _fjsp_sub_v2r8(iy1,jy1);
+             dz11             = _fjsp_sub_v2r8(iz1,jz1);
+             dx12             = _fjsp_sub_v2r8(ix1,jx2);
+             dy12             = _fjsp_sub_v2r8(iy1,jy2);
+             dz12             = _fjsp_sub_v2r8(iz1,jz2);
+             dx20             = _fjsp_sub_v2r8(ix2,jx0);
+             dy20             = _fjsp_sub_v2r8(iy2,jy0);
+             dz20             = _fjsp_sub_v2r8(iz2,jz0);
+             dx21             = _fjsp_sub_v2r8(ix2,jx1);
+             dy21             = _fjsp_sub_v2r8(iy2,jy1);
+             dz21             = _fjsp_sub_v2r8(iz2,jz1);
+             dx22             = _fjsp_sub_v2r8(ix2,jx2);
+             dy22             = _fjsp_sub_v2r8(iy2,jy2);
+             dz22             = _fjsp_sub_v2r8(iz2,jz2);
+             /* Calculate squared distance and things based on it */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rsq01            = gmx_fjsp_calc_rsq_v2r8(dx01,dy01,dz01);
+             rsq02            = gmx_fjsp_calc_rsq_v2r8(dx02,dy02,dz02);
+             rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+             rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+             rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+             rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+             rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+             rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+             rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+             rinv01           = gmx_fjsp_invsqrt_v2r8(rsq01);
+             rinv02           = gmx_fjsp_invsqrt_v2r8(rsq02);
+             rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+             rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+             rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+             rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+             rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+             rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+             rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+             fjx0             = _fjsp_setzero_v2r8();
+             fjy0             = _fjsp_setzero_v2r8();
+             fjz0             = _fjsp_setzero_v2r8();
+             fjx1             = _fjsp_setzero_v2r8();
+             fjy1             = _fjsp_setzero_v2r8();
+             fjz1             = _fjsp_setzero_v2r8();
+             fjx2             = _fjsp_setzero_v2r8();
+             fjy2             = _fjsp_setzero_v2r8();
+             fjz2             = _fjsp_setzero_v2r8();
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+             /* Calculate table index by multiplying r with table scale and truncate to integer */
+             rt               = _fjsp_mul_v2r8(r00,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 4;
+             vfconv.i[1]     *= 4;
+             /* CUBIC SPLINE TABLE ELECTROSTATICS */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+             H                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+             VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+             velec            = _fjsp_mul_v2r8(qq00,VV);
+             FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+             felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,FF),_fjsp_mul_v2r8(vftabscale,rinv00)));
+             /* LENNARD-JONES DISPERSION/REPULSION */
+             rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+             vvdw6            = _fjsp_mul_v2r8(c6_00,rinvsix);
+             vvdw12           = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
+             vvdw             = _fjsp_msub_v2r8( vvdw12,one_twelfth, _fjsp_mul_v2r8(vvdw6,one_sixth) );
+             fvdw             = _fjsp_mul_v2r8(_fjsp_sub_v2r8(vvdw12,vvdw6),rinvsq00);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             vvdw             = _fjsp_unpacklo_v2r8(vvdw,_fjsp_setzero_v2r8());
+             vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+             fscal            = _fjsp_add_v2r8(felec,fvdw);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             
+             fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r01              = _fjsp_mul_v2r8(rsq01,rinv01);
+             /* Calculate table index by multiplying r with table scale and truncate to integer */
+             rt               = _fjsp_mul_v2r8(r01,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 4;
+             vfconv.i[1]     *= 4;
+             /* CUBIC SPLINE TABLE ELECTROSTATICS */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+             H                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+             VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+             velec            = _fjsp_mul_v2r8(qq01,VV);
+             FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+             felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq01,FF),_fjsp_mul_v2r8(vftabscale,rinv01)));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx01,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy01,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz01,fscal,fiz0);
+             
+             fjx1             = _fjsp_madd_v2r8(dx01,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy01,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz01,fscal,fjz1);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r02              = _fjsp_mul_v2r8(rsq02,rinv02);
+             /* Calculate table index by multiplying r with table scale and truncate to integer */
+             rt               = _fjsp_mul_v2r8(r02,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 4;
+             vfconv.i[1]     *= 4;
+             /* CUBIC SPLINE TABLE ELECTROSTATICS */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+             H                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+             VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+             velec            = _fjsp_mul_v2r8(qq02,VV);
+             FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+             felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq02,FF),_fjsp_mul_v2r8(vftabscale,rinv02)));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx02,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy02,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz02,fscal,fiz0);
+             
+             fjx2             = _fjsp_madd_v2r8(dx02,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy02,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz02,fscal,fjz2);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+             /* Calculate table index by multiplying r with table scale and truncate to integer */
+             rt               = _fjsp_mul_v2r8(r10,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 4;
+             vfconv.i[1]     *= 4;
+             /* CUBIC SPLINE TABLE ELECTROSTATICS */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+             H                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+             VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+             velec            = _fjsp_mul_v2r8(qq10,VV);
+             FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+             felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,FF),_fjsp_mul_v2r8(vftabscale,rinv10)));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+             
+             fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r11              = _fjsp_mul_v2r8(rsq11,rinv11);
+             /* Calculate table index by multiplying r with table scale and truncate to integer */
+             rt               = _fjsp_mul_v2r8(r11,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 4;
+             vfconv.i[1]     *= 4;
+             /* CUBIC SPLINE TABLE ELECTROSTATICS */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+             H                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+             VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+             velec            = _fjsp_mul_v2r8(qq11,VV);
+             FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+             felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq11,FF),_fjsp_mul_v2r8(vftabscale,rinv11)));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+             
+             fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r12              = _fjsp_mul_v2r8(rsq12,rinv12);
+             /* Calculate table index by multiplying r with table scale and truncate to integer */
+             rt               = _fjsp_mul_v2r8(r12,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 4;
+             vfconv.i[1]     *= 4;
+             /* CUBIC SPLINE TABLE ELECTROSTATICS */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+             H                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+             VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+             velec            = _fjsp_mul_v2r8(qq12,VV);
+             FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+             felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq12,FF),_fjsp_mul_v2r8(vftabscale,rinv12)));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+             
+             fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+             /* Calculate table index by multiplying r with table scale and truncate to integer */
+             rt               = _fjsp_mul_v2r8(r20,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 4;
+             vfconv.i[1]     *= 4;
+             /* CUBIC SPLINE TABLE ELECTROSTATICS */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+             H                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+             VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+             velec            = _fjsp_mul_v2r8(qq20,VV);
+             FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+             felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,FF),_fjsp_mul_v2r8(vftabscale,rinv20)));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+             
+             fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r21              = _fjsp_mul_v2r8(rsq21,rinv21);
+             /* Calculate table index by multiplying r with table scale and truncate to integer */
+             rt               = _fjsp_mul_v2r8(r21,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 4;
+             vfconv.i[1]     *= 4;
+             /* CUBIC SPLINE TABLE ELECTROSTATICS */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+             H                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+             VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+             velec            = _fjsp_mul_v2r8(qq21,VV);
+             FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+             felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq21,FF),_fjsp_mul_v2r8(vftabscale,rinv21)));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+             
+             fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r22              = _fjsp_mul_v2r8(rsq22,rinv22);
+             /* Calculate table index by multiplying r with table scale and truncate to integer */
+             rt               = _fjsp_mul_v2r8(r22,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 4;
+             vfconv.i[1]     *= 4;
+             /* CUBIC SPLINE TABLE ELECTROSTATICS */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+             H                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+             VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+             velec            = _fjsp_mul_v2r8(qq22,VV);
+             FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+             felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq22,FF),_fjsp_mul_v2r8(vftabscale,rinv22)));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+             
+             fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+             gmx_fjsp_decrement_3rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+             /* Inner loop uses 427 flops */
+         }
+         /* End of innermost loop */
+         gmx_fjsp_update_iforce_3atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                               f+i_coord_offset,fshift+i_shift_offset);
+         ggid                        = gid[iidx];
+         /* Update potential energies */
+         gmx_fjsp_update_1pot_v2r8(velecsum,kernel_data->energygrp_elec+ggid);
+         gmx_fjsp_update_1pot_v2r8(vvdwsum,kernel_data->energygrp_vdw+ggid);
+         /* Increment number of inner iterations */
+         inneriter                  += j_index_end - j_index_start;
+         /* Outer loop uses 20 flops */
+     }
+     /* Increment number of outer iterations */
+     outeriter        += nri;
+     /* Update outer/inner flops */
+     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_VF,outeriter*20 + inneriter*427);
+ }
+ /*
+  * Gromacs nonbonded kernel:   nb_kernel_ElecCSTab_VdwLJ_GeomW3W3_F_sparc64_hpc_ace_double
+  * Electrostatics interaction: CubicSplineTable
+  * VdW interaction:            LennardJones
+  * Geometry:                   Water3-Water3
+  * Calculate force/pot:        Force
+  */
+ void
+ nb_kernel_ElecCSTab_VdwLJ_GeomW3W3_F_sparc64_hpc_ace_double
+                     (t_nblist * gmx_restrict                nlist,
+                      rvec * gmx_restrict                    xx,
+                      rvec * gmx_restrict                    ff,
+                      t_forcerec * gmx_restrict              fr,
+                      t_mdatoms * gmx_restrict               mdatoms,
+                      nb_kernel_data_t * gmx_restrict        kernel_data,
+                      t_nrnb * gmx_restrict                  nrnb)
+ {
+     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+      * just 0 for non-waters.
+      * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+      * jnr indices corresponding to data put in the two positions in the SIMD register.
+      */
+     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+     int              jnrA,jnrB;
+     int              j_coord_offsetA,j_coord_offsetB;
+     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+     real             rcutoff_scalar;
+     real             *shiftvec,*fshift,*x,*f;
+     _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+     int              vdwioffset0;
+     _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+     int              vdwioffset1;
+     _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+     int              vdwioffset2;
+     _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+     int              vdwjidx0A,vdwjidx0B;
+     _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+     int              vdwjidx1A,vdwjidx1B;
+     _fjsp_v2r8       jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+     int              vdwjidx2A,vdwjidx2B;
+     _fjsp_v2r8       jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+     _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+     _fjsp_v2r8       dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
+     _fjsp_v2r8       dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
+     _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+     _fjsp_v2r8       dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+     _fjsp_v2r8       dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+     _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+     _fjsp_v2r8       dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+     _fjsp_v2r8       dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+     _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+     real             *charge;
+     int              nvdwtype;
+     _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+     int              *vdwtype;
+     real             *vdwparam;
+     _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+     _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+     _fjsp_v2r8       rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF,twovfeps;
+     real             *vftab;
+     _fjsp_v2r8       itab_tmp;
+     _fjsp_v2r8       dummy_mask,cutoff_mask;
+     _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+     _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+     union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+     x                = xx[0];
+     f                = ff[0];
+     nri              = nlist->nri;
+     iinr             = nlist->iinr;
+     jindex           = nlist->jindex;
+     jjnr             = nlist->jjnr;
+     shiftidx         = nlist->shift;
+     gid              = nlist->gid;
+     shiftvec         = fr->shift_vec[0];
+     fshift           = fr->fshift[0];
+     facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+     charge           = mdatoms->chargeA;
+     nvdwtype         = fr->ntype;
+     vdwparam         = fr->nbfp;
+     vdwtype          = mdatoms->typeA;
+     vftab            = kernel_data->table_elec->data;
+     vftabscale       = gmx_fjsp_set1_v2r8(kernel_data->table_elec->scale);
+     /* Setup water-specific parameters */
+     inr              = nlist->iinr[0];
+     iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+0]));
+     iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+     iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+     vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+     jq0              = gmx_fjsp_set1_v2r8(charge[inr+0]);
+     jq1              = gmx_fjsp_set1_v2r8(charge[inr+1]);
+     jq2              = gmx_fjsp_set1_v2r8(charge[inr+2]);
+     vdwjidx0A        = 2*vdwtype[inr+0];
+     qq00             = _fjsp_mul_v2r8(iq0,jq0);
+     c6_00            = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset0+vdwjidx0A]);
+     c12_00           = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset0+vdwjidx0A+1]);
+     qq01             = _fjsp_mul_v2r8(iq0,jq1);
+     qq02             = _fjsp_mul_v2r8(iq0,jq2);
+     qq10             = _fjsp_mul_v2r8(iq1,jq0);
+     qq11             = _fjsp_mul_v2r8(iq1,jq1);
+     qq12             = _fjsp_mul_v2r8(iq1,jq2);
+     qq20             = _fjsp_mul_v2r8(iq2,jq0);
+     qq21             = _fjsp_mul_v2r8(iq2,jq1);
+     qq22             = _fjsp_mul_v2r8(iq2,jq2);
+     /* Avoid stupid compiler warnings */
+     jnrA = jnrB = 0;
+     j_coord_offsetA = 0;
+     j_coord_offsetB = 0;
+     outeriter        = 0;
+     inneriter        = 0;
+     /* Start outer loop over neighborlists */
+     for(iidx=0; iidx<nri; iidx++)
+     {
+         /* Load shift vector for this list */
+         i_shift_offset   = DIM*shiftidx[iidx];
+         /* Load limits for loop over neighbors */
+         j_index_start    = jindex[iidx];
+         j_index_end      = jindex[iidx+1];
+         /* Get outer coordinate index */
+         inr              = iinr[iidx];
+         i_coord_offset   = DIM*inr;
+         /* Load i particle coords and add shift vector */
+         gmx_fjsp_load_shift_and_3rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                  &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+         fix0             = _fjsp_setzero_v2r8();
+         fiy0             = _fjsp_setzero_v2r8();
+         fiz0             = _fjsp_setzero_v2r8();
+         fix1             = _fjsp_setzero_v2r8();
+         fiy1             = _fjsp_setzero_v2r8();
+         fiz1             = _fjsp_setzero_v2r8();
+         fix2             = _fjsp_setzero_v2r8();
+         fiy2             = _fjsp_setzero_v2r8();
+         fiz2             = _fjsp_setzero_v2r8();
+         /* Start inner kernel loop */
+         for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+         {
+             /* Get j neighbor index, and coordinate index */
+             jnrA             = jjnr[jidx];
+             jnrB             = jjnr[jidx+1];
+             j_coord_offsetA  = DIM*jnrA;
+             j_coord_offsetB  = DIM*jnrB;
+             /* load j atom coordinates */
+             gmx_fjsp_load_3rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+             /* Calculate displacement vector */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             dx01             = _fjsp_sub_v2r8(ix0,jx1);
+             dy01             = _fjsp_sub_v2r8(iy0,jy1);
+             dz01             = _fjsp_sub_v2r8(iz0,jz1);
+             dx02             = _fjsp_sub_v2r8(ix0,jx2);
+             dy02             = _fjsp_sub_v2r8(iy0,jy2);
+             dz02             = _fjsp_sub_v2r8(iz0,jz2);
+             dx10             = _fjsp_sub_v2r8(ix1,jx0);
+             dy10             = _fjsp_sub_v2r8(iy1,jy0);
+             dz10             = _fjsp_sub_v2r8(iz1,jz0);
+             dx11             = _fjsp_sub_v2r8(ix1,jx1);
+             dy11             = _fjsp_sub_v2r8(iy1,jy1);
+             dz11             = _fjsp_sub_v2r8(iz1,jz1);
+             dx12             = _fjsp_sub_v2r8(ix1,jx2);
+             dy12             = _fjsp_sub_v2r8(iy1,jy2);
+             dz12             = _fjsp_sub_v2r8(iz1,jz2);
+             dx20             = _fjsp_sub_v2r8(ix2,jx0);
+             dy20             = _fjsp_sub_v2r8(iy2,jy0);
+             dz20             = _fjsp_sub_v2r8(iz2,jz0);
+             dx21             = _fjsp_sub_v2r8(ix2,jx1);
+             dy21             = _fjsp_sub_v2r8(iy2,jy1);
+             dz21             = _fjsp_sub_v2r8(iz2,jz1);
+             dx22             = _fjsp_sub_v2r8(ix2,jx2);
+             dy22             = _fjsp_sub_v2r8(iy2,jy2);
+             dz22             = _fjsp_sub_v2r8(iz2,jz2);
+             /* Calculate squared distance and things based on it */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rsq01            = gmx_fjsp_calc_rsq_v2r8(dx01,dy01,dz01);
+             rsq02            = gmx_fjsp_calc_rsq_v2r8(dx02,dy02,dz02);
+             rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+             rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+             rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+             rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+             rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+             rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+             rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+             rinv01           = gmx_fjsp_invsqrt_v2r8(rsq01);
+             rinv02           = gmx_fjsp_invsqrt_v2r8(rsq02);
+             rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+             rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+             rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+             rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+             rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+             rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+             rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+             fjx0             = _fjsp_setzero_v2r8();
+             fjy0             = _fjsp_setzero_v2r8();
+             fjz0             = _fjsp_setzero_v2r8();
+             fjx1             = _fjsp_setzero_v2r8();
+             fjy1             = _fjsp_setzero_v2r8();
+             fjz1             = _fjsp_setzero_v2r8();
+             fjx2             = _fjsp_setzero_v2r8();
+             fjy2             = _fjsp_setzero_v2r8();
+             fjz2             = _fjsp_setzero_v2r8();
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+             /* Calculate table index by multiplying r with table scale and truncate to integer */
+             rt               = _fjsp_mul_v2r8(r00,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 4;
+             vfconv.i[1]     *= 4;
+             /* CUBIC SPLINE TABLE ELECTROSTATICS */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+             H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+             FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+             felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,FF),_fjsp_mul_v2r8(vftabscale,rinv00)));
+             /* LENNARD-JONES DISPERSION/REPULSION */
+             rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+             fvdw             = _fjsp_mul_v2r8(_fjsp_msub_v2r8(c12_00,rinvsix,c6_00),_fjsp_mul_v2r8(rinvsix,rinvsq00));
+             fscal            = _fjsp_add_v2r8(felec,fvdw);
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             
+             fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r01              = _fjsp_mul_v2r8(rsq01,rinv01);
+             /* Calculate table index by multiplying r with table scale and truncate to integer */
+             rt               = _fjsp_mul_v2r8(r01,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 4;
+             vfconv.i[1]     *= 4;
+             /* CUBIC SPLINE TABLE ELECTROSTATICS */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+             H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+             FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+             felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq01,FF),_fjsp_mul_v2r8(vftabscale,rinv01)));
+             fscal            = felec;
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx01,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy01,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz01,fscal,fiz0);
+             
+             fjx1             = _fjsp_madd_v2r8(dx01,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy01,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz01,fscal,fjz1);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r02              = _fjsp_mul_v2r8(rsq02,rinv02);
+             /* Calculate table index by multiplying r with table scale and truncate to integer */
+             rt               = _fjsp_mul_v2r8(r02,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 4;
+             vfconv.i[1]     *= 4;
+             /* CUBIC SPLINE TABLE ELECTROSTATICS */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+             H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+             FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+             felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq02,FF),_fjsp_mul_v2r8(vftabscale,rinv02)));
+             fscal            = felec;
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx02,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy02,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz02,fscal,fiz0);
+             
+             fjx2             = _fjsp_madd_v2r8(dx02,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy02,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz02,fscal,fjz2);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+             /* Calculate table index by multiplying r with table scale and truncate to integer */
+             rt               = _fjsp_mul_v2r8(r10,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 4;
+             vfconv.i[1]     *= 4;
+             /* CUBIC SPLINE TABLE ELECTROSTATICS */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+             H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+             FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+             felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,FF),_fjsp_mul_v2r8(vftabscale,rinv10)));
+             fscal            = felec;
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+             
+             fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r11              = _fjsp_mul_v2r8(rsq11,rinv11);
+             /* Calculate table index by multiplying r with table scale and truncate to integer */
+             rt               = _fjsp_mul_v2r8(r11,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 4;
+             vfconv.i[1]     *= 4;
+             /* CUBIC SPLINE TABLE ELECTROSTATICS */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+             H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+             FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+             felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq11,FF),_fjsp_mul_v2r8(vftabscale,rinv11)));
+             fscal            = felec;
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+             
+             fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r12              = _fjsp_mul_v2r8(rsq12,rinv12);
+             /* Calculate table index by multiplying r with table scale and truncate to integer */
+             rt               = _fjsp_mul_v2r8(r12,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 4;
+             vfconv.i[1]     *= 4;
+             /* CUBIC SPLINE TABLE ELECTROSTATICS */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+             H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+             FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+             felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq12,FF),_fjsp_mul_v2r8(vftabscale,rinv12)));
+             fscal            = felec;
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+             
+             fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+             /* Calculate table index by multiplying r with table scale and truncate to integer */
+             rt               = _fjsp_mul_v2r8(r20,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 4;
+             vfconv.i[1]     *= 4;
+             /* CUBIC SPLINE TABLE ELECTROSTATICS */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+             H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+             FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+             felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,FF),_fjsp_mul_v2r8(vftabscale,rinv20)));
+             fscal            = felec;
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+             
+             fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r21              = _fjsp_mul_v2r8(rsq21,rinv21);
+             /* Calculate table index by multiplying r with table scale and truncate to integer */
+             rt               = _fjsp_mul_v2r8(r21,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 4;
+             vfconv.i[1]     *= 4;
+             /* CUBIC SPLINE TABLE ELECTROSTATICS */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+             H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+             FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+             felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq21,FF),_fjsp_mul_v2r8(vftabscale,rinv21)));
+             fscal            = felec;
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+             
+             fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r22              = _fjsp_mul_v2r8(rsq22,rinv22);
+             /* Calculate table index by multiplying r with table scale and truncate to integer */
+             rt               = _fjsp_mul_v2r8(r22,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 4;
+             vfconv.i[1]     *= 4;
+             /* CUBIC SPLINE TABLE ELECTROSTATICS */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+             H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+             FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+             felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq22,FF),_fjsp_mul_v2r8(vftabscale,rinv22)));
+             fscal            = felec;
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+             
+             fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+             gmx_fjsp_decrement_3rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+             /* Inner loop uses 386 flops */
+         }
+         if(jidx<j_index_end)
+         {
+             jnrA             = jjnr[jidx];
+             j_coord_offsetA  = DIM*jnrA;
+             /* load j atom coordinates */
+             gmx_fjsp_load_3rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+             /* Calculate displacement vector */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             dx01             = _fjsp_sub_v2r8(ix0,jx1);
+             dy01             = _fjsp_sub_v2r8(iy0,jy1);
+             dz01             = _fjsp_sub_v2r8(iz0,jz1);
+             dx02             = _fjsp_sub_v2r8(ix0,jx2);
+             dy02             = _fjsp_sub_v2r8(iy0,jy2);
+             dz02             = _fjsp_sub_v2r8(iz0,jz2);
+             dx10             = _fjsp_sub_v2r8(ix1,jx0);
+             dy10             = _fjsp_sub_v2r8(iy1,jy0);
+             dz10             = _fjsp_sub_v2r8(iz1,jz0);
+             dx11             = _fjsp_sub_v2r8(ix1,jx1);
+             dy11             = _fjsp_sub_v2r8(iy1,jy1);
+             dz11             = _fjsp_sub_v2r8(iz1,jz1);
+             dx12             = _fjsp_sub_v2r8(ix1,jx2);
+             dy12             = _fjsp_sub_v2r8(iy1,jy2);
+             dz12             = _fjsp_sub_v2r8(iz1,jz2);
+             dx20             = _fjsp_sub_v2r8(ix2,jx0);
+             dy20             = _fjsp_sub_v2r8(iy2,jy0);
+             dz20             = _fjsp_sub_v2r8(iz2,jz0);
+             dx21             = _fjsp_sub_v2r8(ix2,jx1);
+             dy21             = _fjsp_sub_v2r8(iy2,jy1);
+             dz21             = _fjsp_sub_v2r8(iz2,jz1);
+             dx22             = _fjsp_sub_v2r8(ix2,jx2);
+             dy22             = _fjsp_sub_v2r8(iy2,jy2);
+             dz22             = _fjsp_sub_v2r8(iz2,jz2);
+             /* Calculate squared distance and things based on it */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rsq01            = gmx_fjsp_calc_rsq_v2r8(dx01,dy01,dz01);
+             rsq02            = gmx_fjsp_calc_rsq_v2r8(dx02,dy02,dz02);
+             rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+             rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+             rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+             rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+             rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+             rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+             rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+             rinv01           = gmx_fjsp_invsqrt_v2r8(rsq01);
+             rinv02           = gmx_fjsp_invsqrt_v2r8(rsq02);
+             rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+             rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+             rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+             rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+             rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+             rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+             rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+             fjx0             = _fjsp_setzero_v2r8();
+             fjy0             = _fjsp_setzero_v2r8();
+             fjz0             = _fjsp_setzero_v2r8();
+             fjx1             = _fjsp_setzero_v2r8();
+             fjy1             = _fjsp_setzero_v2r8();
+             fjz1             = _fjsp_setzero_v2r8();
+             fjx2             = _fjsp_setzero_v2r8();
+             fjy2             = _fjsp_setzero_v2r8();
+             fjz2             = _fjsp_setzero_v2r8();
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+             /* Calculate table index by multiplying r with table scale and truncate to integer */
+             rt               = _fjsp_mul_v2r8(r00,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 4;
+             vfconv.i[1]     *= 4;
+             /* CUBIC SPLINE TABLE ELECTROSTATICS */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+             H                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+             FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+             felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,FF),_fjsp_mul_v2r8(vftabscale,rinv00)));
+             /* LENNARD-JONES DISPERSION/REPULSION */
+             rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+             fvdw             = _fjsp_mul_v2r8(_fjsp_msub_v2r8(c12_00,rinvsix,c6_00),_fjsp_mul_v2r8(rinvsix,rinvsq00));
+             fscal            = _fjsp_add_v2r8(felec,fvdw);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             
+             fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r01              = _fjsp_mul_v2r8(rsq01,rinv01);
+             /* Calculate table index by multiplying r with table scale and truncate to integer */
+             rt               = _fjsp_mul_v2r8(r01,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 4;
+             vfconv.i[1]     *= 4;
+             /* CUBIC SPLINE TABLE ELECTROSTATICS */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+             H                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+             FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+             felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq01,FF),_fjsp_mul_v2r8(vftabscale,rinv01)));
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx01,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy01,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz01,fscal,fiz0);
+             
+             fjx1             = _fjsp_madd_v2r8(dx01,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy01,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz01,fscal,fjz1);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r02              = _fjsp_mul_v2r8(rsq02,rinv02);
+             /* Calculate table index by multiplying r with table scale and truncate to integer */
+             rt               = _fjsp_mul_v2r8(r02,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 4;
+             vfconv.i[1]     *= 4;
+             /* CUBIC SPLINE TABLE ELECTROSTATICS */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+             H                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+             FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+             felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq02,FF),_fjsp_mul_v2r8(vftabscale,rinv02)));
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx02,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy02,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz02,fscal,fiz0);
+             
+             fjx2             = _fjsp_madd_v2r8(dx02,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy02,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz02,fscal,fjz2);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+             /* Calculate table index by multiplying r with table scale and truncate to integer */
+             rt               = _fjsp_mul_v2r8(r10,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 4;
+             vfconv.i[1]     *= 4;
+             /* CUBIC SPLINE TABLE ELECTROSTATICS */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+             H                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+             FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+             felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,FF),_fjsp_mul_v2r8(vftabscale,rinv10)));
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+             
+             fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r11              = _fjsp_mul_v2r8(rsq11,rinv11);
+             /* Calculate table index by multiplying r with table scale and truncate to integer */
+             rt               = _fjsp_mul_v2r8(r11,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 4;
+             vfconv.i[1]     *= 4;
+             /* CUBIC SPLINE TABLE ELECTROSTATICS */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+             H                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+             FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+             felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq11,FF),_fjsp_mul_v2r8(vftabscale,rinv11)));
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+             
+             fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r12              = _fjsp_mul_v2r8(rsq12,rinv12);
+             /* Calculate table index by multiplying r with table scale and truncate to integer */
+             rt               = _fjsp_mul_v2r8(r12,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 4;
+             vfconv.i[1]     *= 4;
+             /* CUBIC SPLINE TABLE ELECTROSTATICS */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+             H                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+             FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+             felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq12,FF),_fjsp_mul_v2r8(vftabscale,rinv12)));
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+             
+             fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+             /* Calculate table index by multiplying r with table scale and truncate to integer */
+             rt               = _fjsp_mul_v2r8(r20,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 4;
+             vfconv.i[1]     *= 4;
+             /* CUBIC SPLINE TABLE ELECTROSTATICS */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+             H                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+             FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+             felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,FF),_fjsp_mul_v2r8(vftabscale,rinv20)));
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+             
+             fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r21              = _fjsp_mul_v2r8(rsq21,rinv21);
+             /* Calculate table index by multiplying r with table scale and truncate to integer */
+             rt               = _fjsp_mul_v2r8(r21,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 4;
+             vfconv.i[1]     *= 4;
+             /* CUBIC SPLINE TABLE ELECTROSTATICS */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+             H                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+             FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+             felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq21,FF),_fjsp_mul_v2r8(vftabscale,rinv21)));
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+             
+             fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r22              = _fjsp_mul_v2r8(rsq22,rinv22);
+             /* Calculate table index by multiplying r with table scale and truncate to integer */
+             rt               = _fjsp_mul_v2r8(r22,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 4;
+             vfconv.i[1]     *= 4;
+             /* CUBIC SPLINE TABLE ELECTROSTATICS */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+             H                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+             FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+             felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq22,FF),_fjsp_mul_v2r8(vftabscale,rinv22)));
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+             
+             fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+             gmx_fjsp_decrement_3rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+             /* Inner loop uses 386 flops */
+         }
+         /* End of innermost loop */
+         gmx_fjsp_update_iforce_3atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                               f+i_coord_offset,fshift+i_shift_offset);
+         /* Increment number of inner iterations */
+         inneriter                  += j_index_end - j_index_start;
+         /* Outer loop uses 18 flops */
+     }
+     /* Increment number of outer iterations */
+     outeriter        += nri;
+     /* Update outer/inner flops */
+     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_F,outeriter*18 + inneriter*386);
+ }
index 0000000000000000000000000000000000000000,635418cf3edfbde68f7a0c17e8b7c9a538d204bf..635418cf3edfbde68f7a0c17e8b7c9a538d204bf
mode 000000,100644..100644
--- /dev/null
@@@ -1,0 -1,1201 +1,1201 @@@
+ /*
+  * This file is part of the GROMACS molecular simulation package.
+  *
+  * Copyright (c) 2012, by the GROMACS development team, led by
+  * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+  * others, as listed in the AUTHORS file in the top-level source
+  * directory and at http://www.gromacs.org.
+  *
+  * GROMACS is free software; you can redistribute it and/or
+  * modify it under the terms of the GNU Lesser General Public License
+  * as published by the Free Software Foundation; either version 2.1
+  * of the License, or (at your option) any later version.
+  *
+  * GROMACS is distributed in the hope that it will be useful,
+  * but WITHOUT ANY WARRANTY; without even the implied warranty of
+  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  * Lesser General Public License for more details.
+  *
+  * You should have received a copy of the GNU Lesser General Public
+  * License along with GROMACS; if not, see
+  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+  *
+  * If you want to redistribute modifications to GROMACS, please
+  * consider that scientific software is very special. Version
+  * control is crucial - bugs must be traceable. We will be happy to
+  * consider code for inclusion in the official distribution, but
+  * derived work must not be called official GROMACS. Details are found
+  * in the README & COPYING files - if they are missing, get the
+  * official version at http://www.gromacs.org.
+  *
+  * To help us fund GROMACS development, we humbly ask that you cite
+  * the research papers on the package. Check out http://www.gromacs.org.
+  */
+ /*
+  * Note: this file was generated by the GROMACS sparc64_hpc_ace_double kernel generator.
+  */
+ #ifdef HAVE_CONFIG_H
+ #include <config.h>
+ #endif
+ #include <math.h>
+ #include "../nb_kernel.h"
+ #include "types/simple.h"
+ #include "vec.h"
+ #include "nrnb.h"
+ #include "kernelutil_sparc64_hpc_ace_double.h"
+ /*
+  * Gromacs nonbonded kernel:   nb_kernel_ElecCSTab_VdwLJ_GeomW4P1_VF_sparc64_hpc_ace_double
+  * Electrostatics interaction: CubicSplineTable
+  * VdW interaction:            LennardJones
+  * Geometry:                   Water4-Particle
+  * Calculate force/pot:        PotentialAndForce
+  */
+ void
+ nb_kernel_ElecCSTab_VdwLJ_GeomW4P1_VF_sparc64_hpc_ace_double
+                     (t_nblist * gmx_restrict                nlist,
+                      rvec * gmx_restrict                    xx,
+                      rvec * gmx_restrict                    ff,
+                      t_forcerec * gmx_restrict              fr,
+                      t_mdatoms * gmx_restrict               mdatoms,
+                      nb_kernel_data_t * gmx_restrict        kernel_data,
+                      t_nrnb * gmx_restrict                  nrnb)
+ {
+     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+      * just 0 for non-waters.
+      * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+      * jnr indices corresponding to data put in the two positions in the SIMD register.
+      */
+     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+     int              jnrA,jnrB;
+     int              j_coord_offsetA,j_coord_offsetB;
+     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+     real             rcutoff_scalar;
+     real             *shiftvec,*fshift,*x,*f;
+     _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+     int              vdwioffset0;
+     _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+     int              vdwioffset1;
+     _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+     int              vdwioffset2;
+     _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+     int              vdwioffset3;
+     _fjsp_v2r8       ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+     int              vdwjidx0A,vdwjidx0B;
+     _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+     _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+     _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+     _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+     _fjsp_v2r8       dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30;
+     _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+     real             *charge;
+     int              nvdwtype;
+     _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+     int              *vdwtype;
+     real             *vdwparam;
+     _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+     _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+     _fjsp_v2r8       rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF,twovfeps;
+     real             *vftab;
+     _fjsp_v2r8       itab_tmp;
+     _fjsp_v2r8       dummy_mask,cutoff_mask;
+     _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+     _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+     union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+     x                = xx[0];
+     f                = ff[0];
+     nri              = nlist->nri;
+     iinr             = nlist->iinr;
+     jindex           = nlist->jindex;
+     jjnr             = nlist->jjnr;
+     shiftidx         = nlist->shift;
+     gid              = nlist->gid;
+     shiftvec         = fr->shift_vec[0];
+     fshift           = fr->fshift[0];
+     facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+     charge           = mdatoms->chargeA;
+     nvdwtype         = fr->ntype;
+     vdwparam         = fr->nbfp;
+     vdwtype          = mdatoms->typeA;
+     vftab            = kernel_data->table_elec->data;
+     vftabscale       = gmx_fjsp_set1_v2r8(kernel_data->table_elec->scale);
+     /* Setup water-specific parameters */
+     inr              = nlist->iinr[0];
+     iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+     iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+     iq3              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+3]));
+     vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+     /* Avoid stupid compiler warnings */
+     jnrA = jnrB = 0;
+     j_coord_offsetA = 0;
+     j_coord_offsetB = 0;
+     outeriter        = 0;
+     inneriter        = 0;
+     /* Start outer loop over neighborlists */
+     for(iidx=0; iidx<nri; iidx++)
+     {
+         /* Load shift vector for this list */
+         i_shift_offset   = DIM*shiftidx[iidx];
+         /* Load limits for loop over neighbors */
+         j_index_start    = jindex[iidx];
+         j_index_end      = jindex[iidx+1];
+         /* Get outer coordinate index */
+         inr              = iinr[iidx];
+         i_coord_offset   = DIM*inr;
+         /* Load i particle coords and add shift vector */
+         gmx_fjsp_load_shift_and_4rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                  &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+         fix0             = _fjsp_setzero_v2r8();
+         fiy0             = _fjsp_setzero_v2r8();
+         fiz0             = _fjsp_setzero_v2r8();
+         fix1             = _fjsp_setzero_v2r8();
+         fiy1             = _fjsp_setzero_v2r8();
+         fiz1             = _fjsp_setzero_v2r8();
+         fix2             = _fjsp_setzero_v2r8();
+         fiy2             = _fjsp_setzero_v2r8();
+         fiz2             = _fjsp_setzero_v2r8();
+         fix3             = _fjsp_setzero_v2r8();
+         fiy3             = _fjsp_setzero_v2r8();
+         fiz3             = _fjsp_setzero_v2r8();
+         /* Reset potential sums */
+         velecsum         = _fjsp_setzero_v2r8();
+         vvdwsum          = _fjsp_setzero_v2r8();
+         /* Start inner kernel loop */
+         for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+         {
+             /* Get j neighbor index, and coordinate index */
+             jnrA             = jjnr[jidx];
+             jnrB             = jjnr[jidx+1];
+             j_coord_offsetA  = DIM*jnrA;
+             j_coord_offsetB  = DIM*jnrB;
+             /* load j atom coordinates */
+             gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                               &jx0,&jy0,&jz0);
+             /* Calculate displacement vector */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             dx10             = _fjsp_sub_v2r8(ix1,jx0);
+             dy10             = _fjsp_sub_v2r8(iy1,jy0);
+             dz10             = _fjsp_sub_v2r8(iz1,jz0);
+             dx20             = _fjsp_sub_v2r8(ix2,jx0);
+             dy20             = _fjsp_sub_v2r8(iy2,jy0);
+             dz20             = _fjsp_sub_v2r8(iz2,jz0);
+             dx30             = _fjsp_sub_v2r8(ix3,jx0);
+             dy30             = _fjsp_sub_v2r8(iy3,jy0);
+             dz30             = _fjsp_sub_v2r8(iz3,jz0);
+             /* Calculate squared distance and things based on it */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+             rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+             rsq30            = gmx_fjsp_calc_rsq_v2r8(dx30,dy30,dz30);
+             rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+             rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+             rinv30           = gmx_fjsp_invsqrt_v2r8(rsq30);
+             rinvsq00         = gmx_fjsp_inv_v2r8(rsq00);
+             /* Load parameters for j particles */
+             jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+             vdwjidx0A        = 2*vdwtype[jnrA+0];
+             vdwjidx0B        = 2*vdwtype[jnrB+0];
+             fjx0             = _fjsp_setzero_v2r8();
+             fjy0             = _fjsp_setzero_v2r8();
+             fjz0             = _fjsp_setzero_v2r8();
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* Compute parameters for interactions between i and j atoms */
+             gmx_fjsp_load_2pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,
+                                          vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
+             /* LENNARD-JONES DISPERSION/REPULSION */
+             rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+             vvdw6            = _fjsp_mul_v2r8(c6_00,rinvsix);
+             vvdw12           = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
+             vvdw             = _fjsp_msub_v2r8( vvdw12,one_twelfth, _fjsp_mul_v2r8(vvdw6,one_sixth) );
+             fvdw             = _fjsp_mul_v2r8(_fjsp_sub_v2r8(vvdw12,vvdw6),rinvsq00);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+             fscal            = fvdw;
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             
+             fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+             /* Compute parameters for interactions between i and j atoms */
+             qq10             = _fjsp_mul_v2r8(iq1,jq0);
+             /* Calculate table index by multiplying r with table scale and truncate to integer */
+             rt               = _fjsp_mul_v2r8(r10,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 4;
+             vfconv.i[1]     *= 4;
+             /* CUBIC SPLINE TABLE ELECTROSTATICS */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+             H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+             VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+             velec            = _fjsp_mul_v2r8(qq10,VV);
+             FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+             felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,FF),_fjsp_mul_v2r8(vftabscale,rinv10)));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+             
+             fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+             /* Compute parameters for interactions between i and j atoms */
+             qq20             = _fjsp_mul_v2r8(iq2,jq0);
+             /* Calculate table index by multiplying r with table scale and truncate to integer */
+             rt               = _fjsp_mul_v2r8(r20,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 4;
+             vfconv.i[1]     *= 4;
+             /* CUBIC SPLINE TABLE ELECTROSTATICS */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+             H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+             VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+             velec            = _fjsp_mul_v2r8(qq20,VV);
+             FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+             felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,FF),_fjsp_mul_v2r8(vftabscale,rinv20)));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+             
+             fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r30              = _fjsp_mul_v2r8(rsq30,rinv30);
+             /* Compute parameters for interactions between i and j atoms */
+             qq30             = _fjsp_mul_v2r8(iq3,jq0);
+             /* Calculate table index by multiplying r with table scale and truncate to integer */
+             rt               = _fjsp_mul_v2r8(r30,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 4;
+             vfconv.i[1]     *= 4;
+             /* CUBIC SPLINE TABLE ELECTROSTATICS */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+             H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+             VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+             velec            = _fjsp_mul_v2r8(qq30,VV);
+             FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+             felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq30,FF),_fjsp_mul_v2r8(vftabscale,rinv30)));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             /* Update vectorial force */
+             fix3             = _fjsp_madd_v2r8(dx30,fscal,fix3);
+             fiy3             = _fjsp_madd_v2r8(dy30,fscal,fiy3);
+             fiz3             = _fjsp_madd_v2r8(dz30,fscal,fiz3);
+             
+             fjx0             = _fjsp_madd_v2r8(dx30,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy30,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz30,fscal,fjz0);
+             gmx_fjsp_decrement_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0);
+             /* Inner loop uses 176 flops */
+         }
+         if(jidx<j_index_end)
+         {
+             jnrA             = jjnr[jidx];
+             j_coord_offsetA  = DIM*jnrA;
+             /* load j atom coordinates */
+             gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                               &jx0,&jy0,&jz0);
+             /* Calculate displacement vector */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             dx10             = _fjsp_sub_v2r8(ix1,jx0);
+             dy10             = _fjsp_sub_v2r8(iy1,jy0);
+             dz10             = _fjsp_sub_v2r8(iz1,jz0);
+             dx20             = _fjsp_sub_v2r8(ix2,jx0);
+             dy20             = _fjsp_sub_v2r8(iy2,jy0);
+             dz20             = _fjsp_sub_v2r8(iz2,jz0);
+             dx30             = _fjsp_sub_v2r8(ix3,jx0);
+             dy30             = _fjsp_sub_v2r8(iy3,jy0);
+             dz30             = _fjsp_sub_v2r8(iz3,jz0);
+             /* Calculate squared distance and things based on it */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+             rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+             rsq30            = gmx_fjsp_calc_rsq_v2r8(dx30,dy30,dz30);
+             rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+             rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+             rinv30           = gmx_fjsp_invsqrt_v2r8(rsq30);
+             rinvsq00         = gmx_fjsp_inv_v2r8(rsq00);
+             /* Load parameters for j particles */
+             jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
+             vdwjidx0A        = 2*vdwtype[jnrA+0];
+             fjx0             = _fjsp_setzero_v2r8();
+             fjy0             = _fjsp_setzero_v2r8();
+             fjz0             = _fjsp_setzero_v2r8();
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* Compute parameters for interactions between i and j atoms */
+             gmx_fjsp_load_1pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
+             /* LENNARD-JONES DISPERSION/REPULSION */
+             rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+             vvdw6            = _fjsp_mul_v2r8(c6_00,rinvsix);
+             vvdw12           = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
+             vvdw             = _fjsp_msub_v2r8( vvdw12,one_twelfth, _fjsp_mul_v2r8(vvdw6,one_sixth) );
+             fvdw             = _fjsp_mul_v2r8(_fjsp_sub_v2r8(vvdw12,vvdw6),rinvsq00);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             vvdw             = _fjsp_unpacklo_v2r8(vvdw,_fjsp_setzero_v2r8());
+             vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+             fscal            = fvdw;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             
+             fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+             /* Compute parameters for interactions between i and j atoms */
+             qq10             = _fjsp_mul_v2r8(iq1,jq0);
+             /* Calculate table index by multiplying r with table scale and truncate to integer */
+             rt               = _fjsp_mul_v2r8(r10,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 4;
+             vfconv.i[1]     *= 4;
+             /* CUBIC SPLINE TABLE ELECTROSTATICS */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+             H                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+             VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+             velec            = _fjsp_mul_v2r8(qq10,VV);
+             FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+             felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,FF),_fjsp_mul_v2r8(vftabscale,rinv10)));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+             
+             fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+             /* Compute parameters for interactions between i and j atoms */
+             qq20             = _fjsp_mul_v2r8(iq2,jq0);
+             /* Calculate table index by multiplying r with table scale and truncate to integer */
+             rt               = _fjsp_mul_v2r8(r20,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 4;
+             vfconv.i[1]     *= 4;
+             /* CUBIC SPLINE TABLE ELECTROSTATICS */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+             H                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+             VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+             velec            = _fjsp_mul_v2r8(qq20,VV);
+             FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+             felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,FF),_fjsp_mul_v2r8(vftabscale,rinv20)));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+             
+             fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r30              = _fjsp_mul_v2r8(rsq30,rinv30);
+             /* Compute parameters for interactions between i and j atoms */
+             qq30             = _fjsp_mul_v2r8(iq3,jq0);
+             /* Calculate table index by multiplying r with table scale and truncate to integer */
+             rt               = _fjsp_mul_v2r8(r30,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 4;
+             vfconv.i[1]     *= 4;
+             /* CUBIC SPLINE TABLE ELECTROSTATICS */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+             H                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+             VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+             velec            = _fjsp_mul_v2r8(qq30,VV);
+             FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+             felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq30,FF),_fjsp_mul_v2r8(vftabscale,rinv30)));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix3             = _fjsp_madd_v2r8(dx30,fscal,fix3);
+             fiy3             = _fjsp_madd_v2r8(dy30,fscal,fiy3);
+             fiz3             = _fjsp_madd_v2r8(dz30,fscal,fiz3);
+             
+             fjx0             = _fjsp_madd_v2r8(dx30,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy30,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz30,fscal,fjz0);
+             gmx_fjsp_decrement_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0);
+             /* Inner loop uses 176 flops */
+         }
+         /* End of innermost loop */
+         gmx_fjsp_update_iforce_4atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                               f+i_coord_offset,fshift+i_shift_offset);
+         ggid                        = gid[iidx];
+         /* Update potential energies */
+         gmx_fjsp_update_1pot_v2r8(velecsum,kernel_data->energygrp_elec+ggid);
+         gmx_fjsp_update_1pot_v2r8(vvdwsum,kernel_data->energygrp_vdw+ggid);
+         /* Increment number of inner iterations */
+         inneriter                  += j_index_end - j_index_start;
+         /* Outer loop uses 26 flops */
+     }
+     /* Increment number of outer iterations */
+     outeriter        += nri;
+     /* Update outer/inner flops */
+     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_VF,outeriter*26 + inneriter*176);
+ }
+ /*
+  * Gromacs nonbonded kernel:   nb_kernel_ElecCSTab_VdwLJ_GeomW4P1_F_sparc64_hpc_ace_double
+  * Electrostatics interaction: CubicSplineTable
+  * VdW interaction:            LennardJones
+  * Geometry:                   Water4-Particle
+  * Calculate force/pot:        Force
+  */
+ void
+ nb_kernel_ElecCSTab_VdwLJ_GeomW4P1_F_sparc64_hpc_ace_double
+                     (t_nblist * gmx_restrict                nlist,
+                      rvec * gmx_restrict                    xx,
+                      rvec * gmx_restrict                    ff,
+                      t_forcerec * gmx_restrict              fr,
+                      t_mdatoms * gmx_restrict               mdatoms,
+                      nb_kernel_data_t * gmx_restrict        kernel_data,
+                      t_nrnb * gmx_restrict                  nrnb)
+ {
+     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+      * just 0 for non-waters.
+      * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+      * jnr indices corresponding to data put in the two positions in the SIMD register.
+      */
+     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+     int              jnrA,jnrB;
+     int              j_coord_offsetA,j_coord_offsetB;
+     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+     real             rcutoff_scalar;
+     real             *shiftvec,*fshift,*x,*f;
+     _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+     int              vdwioffset0;
+     _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+     int              vdwioffset1;
+     _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+     int              vdwioffset2;
+     _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+     int              vdwioffset3;
+     _fjsp_v2r8       ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+     int              vdwjidx0A,vdwjidx0B;
+     _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+     _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+     _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+     _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+     _fjsp_v2r8       dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30;
+     _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+     real             *charge;
+     int              nvdwtype;
+     _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+     int              *vdwtype;
+     real             *vdwparam;
+     _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+     _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+     _fjsp_v2r8       rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF,twovfeps;
+     real             *vftab;
+     _fjsp_v2r8       itab_tmp;
+     _fjsp_v2r8       dummy_mask,cutoff_mask;
+     _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+     _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+     union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+     x                = xx[0];
+     f                = ff[0];
+     nri              = nlist->nri;
+     iinr             = nlist->iinr;
+     jindex           = nlist->jindex;
+     jjnr             = nlist->jjnr;
+     shiftidx         = nlist->shift;
+     gid              = nlist->gid;
+     shiftvec         = fr->shift_vec[0];
+     fshift           = fr->fshift[0];
+     facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+     charge           = mdatoms->chargeA;
+     nvdwtype         = fr->ntype;
+     vdwparam         = fr->nbfp;
+     vdwtype          = mdatoms->typeA;
+     vftab            = kernel_data->table_elec->data;
+     vftabscale       = gmx_fjsp_set1_v2r8(kernel_data->table_elec->scale);
+     /* Setup water-specific parameters */
+     inr              = nlist->iinr[0];
+     iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+     iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+     iq3              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+3]));
+     vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+     /* Avoid stupid compiler warnings */
+     jnrA = jnrB = 0;
+     j_coord_offsetA = 0;
+     j_coord_offsetB = 0;
+     outeriter        = 0;
+     inneriter        = 0;
+     /* Start outer loop over neighborlists */
+     for(iidx=0; iidx<nri; iidx++)
+     {
+         /* Load shift vector for this list */
+         i_shift_offset   = DIM*shiftidx[iidx];
+         /* Load limits for loop over neighbors */
+         j_index_start    = jindex[iidx];
+         j_index_end      = jindex[iidx+1];
+         /* Get outer coordinate index */
+         inr              = iinr[iidx];
+         i_coord_offset   = DIM*inr;
+         /* Load i particle coords and add shift vector */
+         gmx_fjsp_load_shift_and_4rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                  &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+         fix0             = _fjsp_setzero_v2r8();
+         fiy0             = _fjsp_setzero_v2r8();
+         fiz0             = _fjsp_setzero_v2r8();
+         fix1             = _fjsp_setzero_v2r8();
+         fiy1             = _fjsp_setzero_v2r8();
+         fiz1             = _fjsp_setzero_v2r8();
+         fix2             = _fjsp_setzero_v2r8();
+         fiy2             = _fjsp_setzero_v2r8();
+         fiz2             = _fjsp_setzero_v2r8();
+         fix3             = _fjsp_setzero_v2r8();
+         fiy3             = _fjsp_setzero_v2r8();
+         fiz3             = _fjsp_setzero_v2r8();
+         /* Start inner kernel loop */
+         for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+         {
+             /* Get j neighbor index, and coordinate index */
+             jnrA             = jjnr[jidx];
+             jnrB             = jjnr[jidx+1];
+             j_coord_offsetA  = DIM*jnrA;
+             j_coord_offsetB  = DIM*jnrB;
+             /* load j atom coordinates */
+             gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                               &jx0,&jy0,&jz0);
+             /* Calculate displacement vector */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             dx10             = _fjsp_sub_v2r8(ix1,jx0);
+             dy10             = _fjsp_sub_v2r8(iy1,jy0);
+             dz10             = _fjsp_sub_v2r8(iz1,jz0);
+             dx20             = _fjsp_sub_v2r8(ix2,jx0);
+             dy20             = _fjsp_sub_v2r8(iy2,jy0);
+             dz20             = _fjsp_sub_v2r8(iz2,jz0);
+             dx30             = _fjsp_sub_v2r8(ix3,jx0);
+             dy30             = _fjsp_sub_v2r8(iy3,jy0);
+             dz30             = _fjsp_sub_v2r8(iz3,jz0);
+             /* Calculate squared distance and things based on it */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+             rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+             rsq30            = gmx_fjsp_calc_rsq_v2r8(dx30,dy30,dz30);
+             rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+             rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+             rinv30           = gmx_fjsp_invsqrt_v2r8(rsq30);
+             rinvsq00         = gmx_fjsp_inv_v2r8(rsq00);
+             /* Load parameters for j particles */
+             jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+             vdwjidx0A        = 2*vdwtype[jnrA+0];
+             vdwjidx0B        = 2*vdwtype[jnrB+0];
+             fjx0             = _fjsp_setzero_v2r8();
+             fjy0             = _fjsp_setzero_v2r8();
+             fjz0             = _fjsp_setzero_v2r8();
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* Compute parameters for interactions between i and j atoms */
+             gmx_fjsp_load_2pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,
+                                          vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
+             /* LENNARD-JONES DISPERSION/REPULSION */
+             rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+             fvdw             = _fjsp_mul_v2r8(_fjsp_msub_v2r8(c12_00,rinvsix,c6_00),_fjsp_mul_v2r8(rinvsix,rinvsq00));
+             fscal            = fvdw;
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             
+             fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+             /* Compute parameters for interactions between i and j atoms */
+             qq10             = _fjsp_mul_v2r8(iq1,jq0);
+             /* Calculate table index by multiplying r with table scale and truncate to integer */
+             rt               = _fjsp_mul_v2r8(r10,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 4;
+             vfconv.i[1]     *= 4;
+             /* CUBIC SPLINE TABLE ELECTROSTATICS */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+             H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+             FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+             felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,FF),_fjsp_mul_v2r8(vftabscale,rinv10)));
+             fscal            = felec;
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+             
+             fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+             /* Compute parameters for interactions between i and j atoms */
+             qq20             = _fjsp_mul_v2r8(iq2,jq0);
+             /* Calculate table index by multiplying r with table scale and truncate to integer */
+             rt               = _fjsp_mul_v2r8(r20,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 4;
+             vfconv.i[1]     *= 4;
+             /* CUBIC SPLINE TABLE ELECTROSTATICS */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+             H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+             FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+             felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,FF),_fjsp_mul_v2r8(vftabscale,rinv20)));
+             fscal            = felec;
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+             
+             fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r30              = _fjsp_mul_v2r8(rsq30,rinv30);
+             /* Compute parameters for interactions between i and j atoms */
+             qq30             = _fjsp_mul_v2r8(iq3,jq0);
+             /* Calculate table index by multiplying r with table scale and truncate to integer */
+             rt               = _fjsp_mul_v2r8(r30,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 4;
+             vfconv.i[1]     *= 4;
+             /* CUBIC SPLINE TABLE ELECTROSTATICS */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+             H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+             FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+             felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq30,FF),_fjsp_mul_v2r8(vftabscale,rinv30)));
+             fscal            = felec;
+             /* Update vectorial force */
+             fix3             = _fjsp_madd_v2r8(dx30,fscal,fix3);
+             fiy3             = _fjsp_madd_v2r8(dy30,fscal,fiy3);
+             fiz3             = _fjsp_madd_v2r8(dz30,fscal,fiz3);
+             
+             fjx0             = _fjsp_madd_v2r8(dx30,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy30,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz30,fscal,fjz0);
+             gmx_fjsp_decrement_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0);
+             /* Inner loop uses 159 flops */
+         }
+         if(jidx<j_index_end)
+         {
+             jnrA             = jjnr[jidx];
+             j_coord_offsetA  = DIM*jnrA;
+             /* load j atom coordinates */
+             gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                               &jx0,&jy0,&jz0);
+             /* Calculate displacement vector */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             dx10             = _fjsp_sub_v2r8(ix1,jx0);
+             dy10             = _fjsp_sub_v2r8(iy1,jy0);
+             dz10             = _fjsp_sub_v2r8(iz1,jz0);
+             dx20             = _fjsp_sub_v2r8(ix2,jx0);
+             dy20             = _fjsp_sub_v2r8(iy2,jy0);
+             dz20             = _fjsp_sub_v2r8(iz2,jz0);
+             dx30             = _fjsp_sub_v2r8(ix3,jx0);
+             dy30             = _fjsp_sub_v2r8(iy3,jy0);
+             dz30             = _fjsp_sub_v2r8(iz3,jz0);
+             /* Calculate squared distance and things based on it */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+             rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+             rsq30            = gmx_fjsp_calc_rsq_v2r8(dx30,dy30,dz30);
+             rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+             rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+             rinv30           = gmx_fjsp_invsqrt_v2r8(rsq30);
+             rinvsq00         = gmx_fjsp_inv_v2r8(rsq00);
+             /* Load parameters for j particles */
+             jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
+             vdwjidx0A        = 2*vdwtype[jnrA+0];
+             fjx0             = _fjsp_setzero_v2r8();
+             fjy0             = _fjsp_setzero_v2r8();
+             fjz0             = _fjsp_setzero_v2r8();
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* Compute parameters for interactions between i and j atoms */
+             gmx_fjsp_load_1pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
+             /* LENNARD-JONES DISPERSION/REPULSION */
+             rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+             fvdw             = _fjsp_mul_v2r8(_fjsp_msub_v2r8(c12_00,rinvsix,c6_00),_fjsp_mul_v2r8(rinvsix,rinvsq00));
+             fscal            = fvdw;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             
+             fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+             /* Compute parameters for interactions between i and j atoms */
+             qq10             = _fjsp_mul_v2r8(iq1,jq0);
+             /* Calculate table index by multiplying r with table scale and truncate to integer */
+             rt               = _fjsp_mul_v2r8(r10,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 4;
+             vfconv.i[1]     *= 4;
+             /* CUBIC SPLINE TABLE ELECTROSTATICS */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+             H                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+             FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+             felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,FF),_fjsp_mul_v2r8(vftabscale,rinv10)));
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+             
+             fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+             /* Compute parameters for interactions between i and j atoms */
+             qq20             = _fjsp_mul_v2r8(iq2,jq0);
+             /* Calculate table index by multiplying r with table scale and truncate to integer */
+             rt               = _fjsp_mul_v2r8(r20,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 4;
+             vfconv.i[1]     *= 4;
+             /* CUBIC SPLINE TABLE ELECTROSTATICS */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+             H                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+             FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+             felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,FF),_fjsp_mul_v2r8(vftabscale,rinv20)));
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+             
+             fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r30              = _fjsp_mul_v2r8(rsq30,rinv30);
+             /* Compute parameters for interactions between i and j atoms */
+             qq30             = _fjsp_mul_v2r8(iq3,jq0);
+             /* Calculate table index by multiplying r with table scale and truncate to integer */
+             rt               = _fjsp_mul_v2r8(r30,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 4;
+             vfconv.i[1]     *= 4;
+             /* CUBIC SPLINE TABLE ELECTROSTATICS */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+             H                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+             FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+             felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq30,FF),_fjsp_mul_v2r8(vftabscale,rinv30)));
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix3             = _fjsp_madd_v2r8(dx30,fscal,fix3);
+             fiy3             = _fjsp_madd_v2r8(dy30,fscal,fiy3);
+             fiz3             = _fjsp_madd_v2r8(dz30,fscal,fiz3);
+             
+             fjx0             = _fjsp_madd_v2r8(dx30,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy30,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz30,fscal,fjz0);
+             gmx_fjsp_decrement_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0);
+             /* Inner loop uses 159 flops */
+         }
+         /* End of innermost loop */
+         gmx_fjsp_update_iforce_4atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                               f+i_coord_offset,fshift+i_shift_offset);
+         /* Increment number of inner iterations */
+         inneriter                  += j_index_end - j_index_start;
+         /* Outer loop uses 24 flops */
+     }
+     /* Increment number of outer iterations */
+     outeriter        += nri;
+     /* Update outer/inner flops */
+     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_F,outeriter*24 + inneriter*159);
+ }
index 0000000000000000000000000000000000000000,4c49563acb199c0aa07b7de22ca6a63b1e339a0f..4c49563acb199c0aa07b7de22ca6a63b1e339a0f
mode 000000,100644..100644
--- /dev/null
@@@ -1,0 -1,2351 +1,2351 @@@
+ /*
+  * This file is part of the GROMACS molecular simulation package.
+  *
+  * Copyright (c) 2012, by the GROMACS development team, led by
+  * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+  * others, as listed in the AUTHORS file in the top-level source
+  * directory and at http://www.gromacs.org.
+  *
+  * GROMACS is free software; you can redistribute it and/or
+  * modify it under the terms of the GNU Lesser General Public License
+  * as published by the Free Software Foundation; either version 2.1
+  * of the License, or (at your option) any later version.
+  *
+  * GROMACS is distributed in the hope that it will be useful,
+  * but WITHOUT ANY WARRANTY; without even the implied warranty of
+  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  * Lesser General Public License for more details.
+  *
+  * You should have received a copy of the GNU Lesser General Public
+  * License along with GROMACS; if not, see
+  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+  *
+  * If you want to redistribute modifications to GROMACS, please
+  * consider that scientific software is very special. Version
+  * control is crucial - bugs must be traceable. We will be happy to
+  * consider code for inclusion in the official distribution, but
+  * derived work must not be called official GROMACS. Details are found
+  * in the README & COPYING files - if they are missing, get the
+  * official version at http://www.gromacs.org.
+  *
+  * To help us fund GROMACS development, we humbly ask that you cite
+  * the research papers on the package. Check out http://www.gromacs.org.
+  */
+ /*
+  * Note: this file was generated by the GROMACS sparc64_hpc_ace_double kernel generator.
+  */
+ #ifdef HAVE_CONFIG_H
+ #include <config.h>
+ #endif
+ #include <math.h>
+ #include "../nb_kernel.h"
+ #include "types/simple.h"
+ #include "vec.h"
+ #include "nrnb.h"
+ #include "kernelutil_sparc64_hpc_ace_double.h"
+ /*
+  * Gromacs nonbonded kernel:   nb_kernel_ElecCSTab_VdwLJ_GeomW4W4_VF_sparc64_hpc_ace_double
+  * Electrostatics interaction: CubicSplineTable
+  * VdW interaction:            LennardJones
+  * Geometry:                   Water4-Water4
+  * Calculate force/pot:        PotentialAndForce
+  */
+ void
+ nb_kernel_ElecCSTab_VdwLJ_GeomW4W4_VF_sparc64_hpc_ace_double
+                     (t_nblist * gmx_restrict                nlist,
+                      rvec * gmx_restrict                    xx,
+                      rvec * gmx_restrict                    ff,
+                      t_forcerec * gmx_restrict              fr,
+                      t_mdatoms * gmx_restrict               mdatoms,
+                      nb_kernel_data_t * gmx_restrict        kernel_data,
+                      t_nrnb * gmx_restrict                  nrnb)
+ {
+     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+      * just 0 for non-waters.
+      * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+      * jnr indices corresponding to data put in the two positions in the SIMD register.
+      */
+     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+     int              jnrA,jnrB;
+     int              j_coord_offsetA,j_coord_offsetB;
+     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+     real             rcutoff_scalar;
+     real             *shiftvec,*fshift,*x,*f;
+     _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+     int              vdwioffset0;
+     _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+     int              vdwioffset1;
+     _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+     int              vdwioffset2;
+     _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+     int              vdwioffset3;
+     _fjsp_v2r8       ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+     int              vdwjidx0A,vdwjidx0B;
+     _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+     int              vdwjidx1A,vdwjidx1B;
+     _fjsp_v2r8       jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+     int              vdwjidx2A,vdwjidx2B;
+     _fjsp_v2r8       jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+     int              vdwjidx3A,vdwjidx3B;
+     _fjsp_v2r8       jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
+     _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+     _fjsp_v2r8       dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+     _fjsp_v2r8       dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+     _fjsp_v2r8       dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
+     _fjsp_v2r8       dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+     _fjsp_v2r8       dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+     _fjsp_v2r8       dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
+     _fjsp_v2r8       dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
+     _fjsp_v2r8       dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
+     _fjsp_v2r8       dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
+     _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+     real             *charge;
+     int              nvdwtype;
+     _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+     int              *vdwtype;
+     real             *vdwparam;
+     _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+     _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+     _fjsp_v2r8       rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF,twovfeps;
+     real             *vftab;
+     _fjsp_v2r8       itab_tmp;
+     _fjsp_v2r8       dummy_mask,cutoff_mask;
+     _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+     _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+     union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+     x                = xx[0];
+     f                = ff[0];
+     nri              = nlist->nri;
+     iinr             = nlist->iinr;
+     jindex           = nlist->jindex;
+     jjnr             = nlist->jjnr;
+     shiftidx         = nlist->shift;
+     gid              = nlist->gid;
+     shiftvec         = fr->shift_vec[0];
+     fshift           = fr->fshift[0];
+     facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+     charge           = mdatoms->chargeA;
+     nvdwtype         = fr->ntype;
+     vdwparam         = fr->nbfp;
+     vdwtype          = mdatoms->typeA;
+     vftab            = kernel_data->table_elec->data;
+     vftabscale       = gmx_fjsp_set1_v2r8(kernel_data->table_elec->scale);
+     /* Setup water-specific parameters */
+     inr              = nlist->iinr[0];
+     iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+     iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+     iq3              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+3]));
+     vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+     jq1              = gmx_fjsp_set1_v2r8(charge[inr+1]);
+     jq2              = gmx_fjsp_set1_v2r8(charge[inr+2]);
+     jq3              = gmx_fjsp_set1_v2r8(charge[inr+3]);
+     vdwjidx0A        = 2*vdwtype[inr+0];
+     c6_00            = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset0+vdwjidx0A]);
+     c12_00           = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset0+vdwjidx0A+1]);
+     qq11             = _fjsp_mul_v2r8(iq1,jq1);
+     qq12             = _fjsp_mul_v2r8(iq1,jq2);
+     qq13             = _fjsp_mul_v2r8(iq1,jq3);
+     qq21             = _fjsp_mul_v2r8(iq2,jq1);
+     qq22             = _fjsp_mul_v2r8(iq2,jq2);
+     qq23             = _fjsp_mul_v2r8(iq2,jq3);
+     qq31             = _fjsp_mul_v2r8(iq3,jq1);
+     qq32             = _fjsp_mul_v2r8(iq3,jq2);
+     qq33             = _fjsp_mul_v2r8(iq3,jq3);
+     /* Avoid stupid compiler warnings */
+     jnrA = jnrB = 0;
+     j_coord_offsetA = 0;
+     j_coord_offsetB = 0;
+     outeriter        = 0;
+     inneriter        = 0;
+     /* Start outer loop over neighborlists */
+     for(iidx=0; iidx<nri; iidx++)
+     {
+         /* Load shift vector for this list */
+         i_shift_offset   = DIM*shiftidx[iidx];
+         /* Load limits for loop over neighbors */
+         j_index_start    = jindex[iidx];
+         j_index_end      = jindex[iidx+1];
+         /* Get outer coordinate index */
+         inr              = iinr[iidx];
+         i_coord_offset   = DIM*inr;
+         /* Load i particle coords and add shift vector */
+         gmx_fjsp_load_shift_and_4rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                  &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+         fix0             = _fjsp_setzero_v2r8();
+         fiy0             = _fjsp_setzero_v2r8();
+         fiz0             = _fjsp_setzero_v2r8();
+         fix1             = _fjsp_setzero_v2r8();
+         fiy1             = _fjsp_setzero_v2r8();
+         fiz1             = _fjsp_setzero_v2r8();
+         fix2             = _fjsp_setzero_v2r8();
+         fiy2             = _fjsp_setzero_v2r8();
+         fiz2             = _fjsp_setzero_v2r8();
+         fix3             = _fjsp_setzero_v2r8();
+         fiy3             = _fjsp_setzero_v2r8();
+         fiz3             = _fjsp_setzero_v2r8();
+         /* Reset potential sums */
+         velecsum         = _fjsp_setzero_v2r8();
+         vvdwsum          = _fjsp_setzero_v2r8();
+         /* Start inner kernel loop */
+         for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+         {
+             /* Get j neighbor index, and coordinate index */
+             jnrA             = jjnr[jidx];
+             jnrB             = jjnr[jidx+1];
+             j_coord_offsetA  = DIM*jnrA;
+             j_coord_offsetB  = DIM*jnrB;
+             /* load j atom coordinates */
+             gmx_fjsp_load_4rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
+                                               &jy2,&jz2,&jx3,&jy3,&jz3);
+             /* Calculate displacement vector */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             dx11             = _fjsp_sub_v2r8(ix1,jx1);
+             dy11             = _fjsp_sub_v2r8(iy1,jy1);
+             dz11             = _fjsp_sub_v2r8(iz1,jz1);
+             dx12             = _fjsp_sub_v2r8(ix1,jx2);
+             dy12             = _fjsp_sub_v2r8(iy1,jy2);
+             dz12             = _fjsp_sub_v2r8(iz1,jz2);
+             dx13             = _fjsp_sub_v2r8(ix1,jx3);
+             dy13             = _fjsp_sub_v2r8(iy1,jy3);
+             dz13             = _fjsp_sub_v2r8(iz1,jz3);
+             dx21             = _fjsp_sub_v2r8(ix2,jx1);
+             dy21             = _fjsp_sub_v2r8(iy2,jy1);
+             dz21             = _fjsp_sub_v2r8(iz2,jz1);
+             dx22             = _fjsp_sub_v2r8(ix2,jx2);
+             dy22             = _fjsp_sub_v2r8(iy2,jy2);
+             dz22             = _fjsp_sub_v2r8(iz2,jz2);
+             dx23             = _fjsp_sub_v2r8(ix2,jx3);
+             dy23             = _fjsp_sub_v2r8(iy2,jy3);
+             dz23             = _fjsp_sub_v2r8(iz2,jz3);
+             dx31             = _fjsp_sub_v2r8(ix3,jx1);
+             dy31             = _fjsp_sub_v2r8(iy3,jy1);
+             dz31             = _fjsp_sub_v2r8(iz3,jz1);
+             dx32             = _fjsp_sub_v2r8(ix3,jx2);
+             dy32             = _fjsp_sub_v2r8(iy3,jy2);
+             dz32             = _fjsp_sub_v2r8(iz3,jz2);
+             dx33             = _fjsp_sub_v2r8(ix3,jx3);
+             dy33             = _fjsp_sub_v2r8(iy3,jy3);
+             dz33             = _fjsp_sub_v2r8(iz3,jz3);
+             /* Calculate squared distance and things based on it */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+             rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+             rsq13            = gmx_fjsp_calc_rsq_v2r8(dx13,dy13,dz13);
+             rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+             rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+             rsq23            = gmx_fjsp_calc_rsq_v2r8(dx23,dy23,dz23);
+             rsq31            = gmx_fjsp_calc_rsq_v2r8(dx31,dy31,dz31);
+             rsq32            = gmx_fjsp_calc_rsq_v2r8(dx32,dy32,dz32);
+             rsq33            = gmx_fjsp_calc_rsq_v2r8(dx33,dy33,dz33);
+             rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+             rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+             rinv13           = gmx_fjsp_invsqrt_v2r8(rsq13);
+             rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+             rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+             rinv23           = gmx_fjsp_invsqrt_v2r8(rsq23);
+             rinv31           = gmx_fjsp_invsqrt_v2r8(rsq31);
+             rinv32           = gmx_fjsp_invsqrt_v2r8(rsq32);
+             rinv33           = gmx_fjsp_invsqrt_v2r8(rsq33);
+             rinvsq00         = gmx_fjsp_inv_v2r8(rsq00);
+             fjx0             = _fjsp_setzero_v2r8();
+             fjy0             = _fjsp_setzero_v2r8();
+             fjz0             = _fjsp_setzero_v2r8();
+             fjx1             = _fjsp_setzero_v2r8();
+             fjy1             = _fjsp_setzero_v2r8();
+             fjz1             = _fjsp_setzero_v2r8();
+             fjx2             = _fjsp_setzero_v2r8();
+             fjy2             = _fjsp_setzero_v2r8();
+             fjz2             = _fjsp_setzero_v2r8();
+             fjx3             = _fjsp_setzero_v2r8();
+             fjy3             = _fjsp_setzero_v2r8();
+             fjz3             = _fjsp_setzero_v2r8();
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* LENNARD-JONES DISPERSION/REPULSION */
+             rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+             vvdw6            = _fjsp_mul_v2r8(c6_00,rinvsix);
+             vvdw12           = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
+             vvdw             = _fjsp_msub_v2r8( vvdw12,one_twelfth, _fjsp_mul_v2r8(vvdw6,one_sixth) );
+             fvdw             = _fjsp_mul_v2r8(_fjsp_sub_v2r8(vvdw12,vvdw6),rinvsq00);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+             fscal            = fvdw;
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             
+             fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r11              = _fjsp_mul_v2r8(rsq11,rinv11);
+             /* Calculate table index by multiplying r with table scale and truncate to integer */
+             rt               = _fjsp_mul_v2r8(r11,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 4;
+             vfconv.i[1]     *= 4;
+             /* CUBIC SPLINE TABLE ELECTROSTATICS */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+             H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+             VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+             velec            = _fjsp_mul_v2r8(qq11,VV);
+             FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+             felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq11,FF),_fjsp_mul_v2r8(vftabscale,rinv11)));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+             
+             fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r12              = _fjsp_mul_v2r8(rsq12,rinv12);
+             /* Calculate table index by multiplying r with table scale and truncate to integer */
+             rt               = _fjsp_mul_v2r8(r12,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 4;
+             vfconv.i[1]     *= 4;
+             /* CUBIC SPLINE TABLE ELECTROSTATICS */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+             H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+             VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+             velec            = _fjsp_mul_v2r8(qq12,VV);
+             FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+             felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq12,FF),_fjsp_mul_v2r8(vftabscale,rinv12)));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+             
+             fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r13              = _fjsp_mul_v2r8(rsq13,rinv13);
+             /* Calculate table index by multiplying r with table scale and truncate to integer */
+             rt               = _fjsp_mul_v2r8(r13,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 4;
+             vfconv.i[1]     *= 4;
+             /* CUBIC SPLINE TABLE ELECTROSTATICS */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+             H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+             VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+             velec            = _fjsp_mul_v2r8(qq13,VV);
+             FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+             felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq13,FF),_fjsp_mul_v2r8(vftabscale,rinv13)));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx13,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy13,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz13,fscal,fiz1);
+             
+             fjx3             = _fjsp_madd_v2r8(dx13,fscal,fjx3);
+             fjy3             = _fjsp_madd_v2r8(dy13,fscal,fjy3);
+             fjz3             = _fjsp_madd_v2r8(dz13,fscal,fjz3);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r21              = _fjsp_mul_v2r8(rsq21,rinv21);
+             /* Calculate table index by multiplying r with table scale and truncate to integer */
+             rt               = _fjsp_mul_v2r8(r21,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 4;
+             vfconv.i[1]     *= 4;
+             /* CUBIC SPLINE TABLE ELECTROSTATICS */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+             H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+             VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+             velec            = _fjsp_mul_v2r8(qq21,VV);
+             FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+             felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq21,FF),_fjsp_mul_v2r8(vftabscale,rinv21)));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+             
+             fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r22              = _fjsp_mul_v2r8(rsq22,rinv22);
+             /* Calculate table index by multiplying r with table scale and truncate to integer */
+             rt               = _fjsp_mul_v2r8(r22,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 4;
+             vfconv.i[1]     *= 4;
+             /* CUBIC SPLINE TABLE ELECTROSTATICS */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+             H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+             VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+             velec            = _fjsp_mul_v2r8(qq22,VV);
+             FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+             felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq22,FF),_fjsp_mul_v2r8(vftabscale,rinv22)));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+             
+             fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r23              = _fjsp_mul_v2r8(rsq23,rinv23);
+             /* Calculate table index by multiplying r with table scale and truncate to integer */
+             rt               = _fjsp_mul_v2r8(r23,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 4;
+             vfconv.i[1]     *= 4;
+             /* CUBIC SPLINE TABLE ELECTROSTATICS */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+             H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+             VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+             velec            = _fjsp_mul_v2r8(qq23,VV);
+             FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+             felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq23,FF),_fjsp_mul_v2r8(vftabscale,rinv23)));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx23,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy23,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz23,fscal,fiz2);
+             
+             fjx3             = _fjsp_madd_v2r8(dx23,fscal,fjx3);
+             fjy3             = _fjsp_madd_v2r8(dy23,fscal,fjy3);
+             fjz3             = _fjsp_madd_v2r8(dz23,fscal,fjz3);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r31              = _fjsp_mul_v2r8(rsq31,rinv31);
+             /* Calculate table index by multiplying r with table scale and truncate to integer */
+             rt               = _fjsp_mul_v2r8(r31,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 4;
+             vfconv.i[1]     *= 4;
+             /* CUBIC SPLINE TABLE ELECTROSTATICS */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+             H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+             VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+             velec            = _fjsp_mul_v2r8(qq31,VV);
+             FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+             felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq31,FF),_fjsp_mul_v2r8(vftabscale,rinv31)));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             /* Update vectorial force */
+             fix3             = _fjsp_madd_v2r8(dx31,fscal,fix3);
+             fiy3             = _fjsp_madd_v2r8(dy31,fscal,fiy3);
+             fiz3             = _fjsp_madd_v2r8(dz31,fscal,fiz3);
+             
+             fjx1             = _fjsp_madd_v2r8(dx31,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy31,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz31,fscal,fjz1);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r32              = _fjsp_mul_v2r8(rsq32,rinv32);
+             /* Calculate table index by multiplying r with table scale and truncate to integer */
+             rt               = _fjsp_mul_v2r8(r32,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 4;
+             vfconv.i[1]     *= 4;
+             /* CUBIC SPLINE TABLE ELECTROSTATICS */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+             H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+             VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+             velec            = _fjsp_mul_v2r8(qq32,VV);
+             FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+             felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq32,FF),_fjsp_mul_v2r8(vftabscale,rinv32)));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             /* Update vectorial force */
+             fix3             = _fjsp_madd_v2r8(dx32,fscal,fix3);
+             fiy3             = _fjsp_madd_v2r8(dy32,fscal,fiy3);
+             fiz3             = _fjsp_madd_v2r8(dz32,fscal,fiz3);
+             
+             fjx2             = _fjsp_madd_v2r8(dx32,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy32,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz32,fscal,fjz2);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r33              = _fjsp_mul_v2r8(rsq33,rinv33);
+             /* Calculate table index by multiplying r with table scale and truncate to integer */
+             rt               = _fjsp_mul_v2r8(r33,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 4;
+             vfconv.i[1]     *= 4;
+             /* CUBIC SPLINE TABLE ELECTROSTATICS */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+             H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+             VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+             velec            = _fjsp_mul_v2r8(qq33,VV);
+             FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+             felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq33,FF),_fjsp_mul_v2r8(vftabscale,rinv33)));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             /* Update vectorial force */
+             fix3             = _fjsp_madd_v2r8(dx33,fscal,fix3);
+             fiy3             = _fjsp_madd_v2r8(dy33,fscal,fiy3);
+             fiz3             = _fjsp_madd_v2r8(dz33,fscal,fiz3);
+             
+             fjx3             = _fjsp_madd_v2r8(dx33,fscal,fjx3);
+             fjy3             = _fjsp_madd_v2r8(dy33,fscal,fjy3);
+             fjz3             = _fjsp_madd_v2r8(dz33,fscal,fjz3);
+             gmx_fjsp_decrement_4rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+             /* Inner loop uses 452 flops */
+         }
+         if(jidx<j_index_end)
+         {
+             jnrA             = jjnr[jidx];
+             j_coord_offsetA  = DIM*jnrA;
+             /* load j atom coordinates */
+             gmx_fjsp_load_4rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
+                                               &jy2,&jz2,&jx3,&jy3,&jz3);
+             /* Calculate displacement vector */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             dx11             = _fjsp_sub_v2r8(ix1,jx1);
+             dy11             = _fjsp_sub_v2r8(iy1,jy1);
+             dz11             = _fjsp_sub_v2r8(iz1,jz1);
+             dx12             = _fjsp_sub_v2r8(ix1,jx2);
+             dy12             = _fjsp_sub_v2r8(iy1,jy2);
+             dz12             = _fjsp_sub_v2r8(iz1,jz2);
+             dx13             = _fjsp_sub_v2r8(ix1,jx3);
+             dy13             = _fjsp_sub_v2r8(iy1,jy3);
+             dz13             = _fjsp_sub_v2r8(iz1,jz3);
+             dx21             = _fjsp_sub_v2r8(ix2,jx1);
+             dy21             = _fjsp_sub_v2r8(iy2,jy1);
+             dz21             = _fjsp_sub_v2r8(iz2,jz1);
+             dx22             = _fjsp_sub_v2r8(ix2,jx2);
+             dy22             = _fjsp_sub_v2r8(iy2,jy2);
+             dz22             = _fjsp_sub_v2r8(iz2,jz2);
+             dx23             = _fjsp_sub_v2r8(ix2,jx3);
+             dy23             = _fjsp_sub_v2r8(iy2,jy3);
+             dz23             = _fjsp_sub_v2r8(iz2,jz3);
+             dx31             = _fjsp_sub_v2r8(ix3,jx1);
+             dy31             = _fjsp_sub_v2r8(iy3,jy1);
+             dz31             = _fjsp_sub_v2r8(iz3,jz1);
+             dx32             = _fjsp_sub_v2r8(ix3,jx2);
+             dy32             = _fjsp_sub_v2r8(iy3,jy2);
+             dz32             = _fjsp_sub_v2r8(iz3,jz2);
+             dx33             = _fjsp_sub_v2r8(ix3,jx3);
+             dy33             = _fjsp_sub_v2r8(iy3,jy3);
+             dz33             = _fjsp_sub_v2r8(iz3,jz3);
+             /* Calculate squared distance and things based on it */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+             rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+             rsq13            = gmx_fjsp_calc_rsq_v2r8(dx13,dy13,dz13);
+             rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+             rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+             rsq23            = gmx_fjsp_calc_rsq_v2r8(dx23,dy23,dz23);
+             rsq31            = gmx_fjsp_calc_rsq_v2r8(dx31,dy31,dz31);
+             rsq32            = gmx_fjsp_calc_rsq_v2r8(dx32,dy32,dz32);
+             rsq33            = gmx_fjsp_calc_rsq_v2r8(dx33,dy33,dz33);
+             rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+             rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+             rinv13           = gmx_fjsp_invsqrt_v2r8(rsq13);
+             rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+             rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+             rinv23           = gmx_fjsp_invsqrt_v2r8(rsq23);
+             rinv31           = gmx_fjsp_invsqrt_v2r8(rsq31);
+             rinv32           = gmx_fjsp_invsqrt_v2r8(rsq32);
+             rinv33           = gmx_fjsp_invsqrt_v2r8(rsq33);
+             rinvsq00         = gmx_fjsp_inv_v2r8(rsq00);
+             fjx0             = _fjsp_setzero_v2r8();
+             fjy0             = _fjsp_setzero_v2r8();
+             fjz0             = _fjsp_setzero_v2r8();
+             fjx1             = _fjsp_setzero_v2r8();
+             fjy1             = _fjsp_setzero_v2r8();
+             fjz1             = _fjsp_setzero_v2r8();
+             fjx2             = _fjsp_setzero_v2r8();
+             fjy2             = _fjsp_setzero_v2r8();
+             fjz2             = _fjsp_setzero_v2r8();
+             fjx3             = _fjsp_setzero_v2r8();
+             fjy3             = _fjsp_setzero_v2r8();
+             fjz3             = _fjsp_setzero_v2r8();
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* LENNARD-JONES DISPERSION/REPULSION */
+             rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+             vvdw6            = _fjsp_mul_v2r8(c6_00,rinvsix);
+             vvdw12           = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
+             vvdw             = _fjsp_msub_v2r8( vvdw12,one_twelfth, _fjsp_mul_v2r8(vvdw6,one_sixth) );
+             fvdw             = _fjsp_mul_v2r8(_fjsp_sub_v2r8(vvdw12,vvdw6),rinvsq00);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             vvdw             = _fjsp_unpacklo_v2r8(vvdw,_fjsp_setzero_v2r8());
+             vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+             fscal            = fvdw;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             
+             fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r11              = _fjsp_mul_v2r8(rsq11,rinv11);
+             /* Calculate table index by multiplying r with table scale and truncate to integer */
+             rt               = _fjsp_mul_v2r8(r11,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 4;
+             vfconv.i[1]     *= 4;
+             /* CUBIC SPLINE TABLE ELECTROSTATICS */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+             H                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+             VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+             velec            = _fjsp_mul_v2r8(qq11,VV);
+             FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+             felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq11,FF),_fjsp_mul_v2r8(vftabscale,rinv11)));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+             
+             fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r12              = _fjsp_mul_v2r8(rsq12,rinv12);
+             /* Calculate table index by multiplying r with table scale and truncate to integer */
+             rt               = _fjsp_mul_v2r8(r12,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 4;
+             vfconv.i[1]     *= 4;
+             /* CUBIC SPLINE TABLE ELECTROSTATICS */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+             H                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+             VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+             velec            = _fjsp_mul_v2r8(qq12,VV);
+             FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+             felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq12,FF),_fjsp_mul_v2r8(vftabscale,rinv12)));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+             
+             fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r13              = _fjsp_mul_v2r8(rsq13,rinv13);
+             /* Calculate table index by multiplying r with table scale and truncate to integer */
+             rt               = _fjsp_mul_v2r8(r13,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 4;
+             vfconv.i[1]     *= 4;
+             /* CUBIC SPLINE TABLE ELECTROSTATICS */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+             H                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+             VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+             velec            = _fjsp_mul_v2r8(qq13,VV);
+             FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+             felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq13,FF),_fjsp_mul_v2r8(vftabscale,rinv13)));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx13,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy13,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz13,fscal,fiz1);
+             
+             fjx3             = _fjsp_madd_v2r8(dx13,fscal,fjx3);
+             fjy3             = _fjsp_madd_v2r8(dy13,fscal,fjy3);
+             fjz3             = _fjsp_madd_v2r8(dz13,fscal,fjz3);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r21              = _fjsp_mul_v2r8(rsq21,rinv21);
+             /* Calculate table index by multiplying r with table scale and truncate to integer */
+             rt               = _fjsp_mul_v2r8(r21,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 4;
+             vfconv.i[1]     *= 4;
+             /* CUBIC SPLINE TABLE ELECTROSTATICS */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+             H                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+             VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+             velec            = _fjsp_mul_v2r8(qq21,VV);
+             FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+             felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq21,FF),_fjsp_mul_v2r8(vftabscale,rinv21)));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+             
+             fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r22              = _fjsp_mul_v2r8(rsq22,rinv22);
+             /* Calculate table index by multiplying r with table scale and truncate to integer */
+             rt               = _fjsp_mul_v2r8(r22,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 4;
+             vfconv.i[1]     *= 4;
+             /* CUBIC SPLINE TABLE ELECTROSTATICS */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+             H                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+             VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+             velec            = _fjsp_mul_v2r8(qq22,VV);
+             FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+             felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq22,FF),_fjsp_mul_v2r8(vftabscale,rinv22)));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+             
+             fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r23              = _fjsp_mul_v2r8(rsq23,rinv23);
+             /* Calculate table index by multiplying r with table scale and truncate to integer */
+             rt               = _fjsp_mul_v2r8(r23,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 4;
+             vfconv.i[1]     *= 4;
+             /* CUBIC SPLINE TABLE ELECTROSTATICS */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+             H                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+             VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+             velec            = _fjsp_mul_v2r8(qq23,VV);
+             FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+             felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq23,FF),_fjsp_mul_v2r8(vftabscale,rinv23)));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx23,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy23,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz23,fscal,fiz2);
+             
+             fjx3             = _fjsp_madd_v2r8(dx23,fscal,fjx3);
+             fjy3             = _fjsp_madd_v2r8(dy23,fscal,fjy3);
+             fjz3             = _fjsp_madd_v2r8(dz23,fscal,fjz3);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r31              = _fjsp_mul_v2r8(rsq31,rinv31);
+             /* Calculate table index by multiplying r with table scale and truncate to integer */
+             rt               = _fjsp_mul_v2r8(r31,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 4;
+             vfconv.i[1]     *= 4;
+             /* CUBIC SPLINE TABLE ELECTROSTATICS */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+             H                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+             VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+             velec            = _fjsp_mul_v2r8(qq31,VV);
+             FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+             felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq31,FF),_fjsp_mul_v2r8(vftabscale,rinv31)));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix3             = _fjsp_madd_v2r8(dx31,fscal,fix3);
+             fiy3             = _fjsp_madd_v2r8(dy31,fscal,fiy3);
+             fiz3             = _fjsp_madd_v2r8(dz31,fscal,fiz3);
+             
+             fjx1             = _fjsp_madd_v2r8(dx31,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy31,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz31,fscal,fjz1);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r32              = _fjsp_mul_v2r8(rsq32,rinv32);
+             /* Calculate table index by multiplying r with table scale and truncate to integer */
+             rt               = _fjsp_mul_v2r8(r32,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 4;
+             vfconv.i[1]     *= 4;
+             /* CUBIC SPLINE TABLE ELECTROSTATICS */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+             H                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+             VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+             velec            = _fjsp_mul_v2r8(qq32,VV);
+             FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+             felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq32,FF),_fjsp_mul_v2r8(vftabscale,rinv32)));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix3             = _fjsp_madd_v2r8(dx32,fscal,fix3);
+             fiy3             = _fjsp_madd_v2r8(dy32,fscal,fiy3);
+             fiz3             = _fjsp_madd_v2r8(dz32,fscal,fiz3);
+             
+             fjx2             = _fjsp_madd_v2r8(dx32,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy32,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz32,fscal,fjz2);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r33              = _fjsp_mul_v2r8(rsq33,rinv33);
+             /* Calculate table index by multiplying r with table scale and truncate to integer */
+             rt               = _fjsp_mul_v2r8(r33,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 4;
+             vfconv.i[1]     *= 4;
+             /* CUBIC SPLINE TABLE ELECTROSTATICS */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+             H                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+             VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+             velec            = _fjsp_mul_v2r8(qq33,VV);
+             FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+             felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq33,FF),_fjsp_mul_v2r8(vftabscale,rinv33)));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix3             = _fjsp_madd_v2r8(dx33,fscal,fix3);
+             fiy3             = _fjsp_madd_v2r8(dy33,fscal,fiy3);
+             fiz3             = _fjsp_madd_v2r8(dz33,fscal,fiz3);
+             
+             fjx3             = _fjsp_madd_v2r8(dx33,fscal,fjx3);
+             fjy3             = _fjsp_madd_v2r8(dy33,fscal,fjy3);
+             fjz3             = _fjsp_madd_v2r8(dz33,fscal,fjz3);
+             gmx_fjsp_decrement_4rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+             /* Inner loop uses 452 flops */
+         }
+         /* End of innermost loop */
+         gmx_fjsp_update_iforce_4atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                               f+i_coord_offset,fshift+i_shift_offset);
+         ggid                        = gid[iidx];
+         /* Update potential energies */
+         gmx_fjsp_update_1pot_v2r8(velecsum,kernel_data->energygrp_elec+ggid);
+         gmx_fjsp_update_1pot_v2r8(vvdwsum,kernel_data->energygrp_vdw+ggid);
+         /* Increment number of inner iterations */
+         inneriter                  += j_index_end - j_index_start;
+         /* Outer loop uses 26 flops */
+     }
+     /* Increment number of outer iterations */
+     outeriter        += nri;
+     /* Update outer/inner flops */
+     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_VF,outeriter*26 + inneriter*452);
+ }
+ /*
+  * Gromacs nonbonded kernel:   nb_kernel_ElecCSTab_VdwLJ_GeomW4W4_F_sparc64_hpc_ace_double
+  * Electrostatics interaction: CubicSplineTable
+  * VdW interaction:            LennardJones
+  * Geometry:                   Water4-Water4
+  * Calculate force/pot:        Force
+  */
+ void
+ nb_kernel_ElecCSTab_VdwLJ_GeomW4W4_F_sparc64_hpc_ace_double
+                     (t_nblist * gmx_restrict                nlist,
+                      rvec * gmx_restrict                    xx,
+                      rvec * gmx_restrict                    ff,
+                      t_forcerec * gmx_restrict              fr,
+                      t_mdatoms * gmx_restrict               mdatoms,
+                      nb_kernel_data_t * gmx_restrict        kernel_data,
+                      t_nrnb * gmx_restrict                  nrnb)
+ {
+     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+      * just 0 for non-waters.
+      * Suffixes A,B refer to j loop unrolling done with double precision SIMD, i.e. for the two different
+      * jnr indices corresponding to data put in the two positions in the SIMD register.
+      */
+     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+     int              jnrA,jnrB;
+     int              j_coord_offsetA,j_coord_offsetB;
+     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+     real             rcutoff_scalar;
+     real             *shiftvec,*fshift,*x,*f;
+     _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+     int              vdwioffset0;
+     _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+     int              vdwioffset1;
+     _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+     int              vdwioffset2;
+     _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+     int              vdwioffset3;
+     _fjsp_v2r8       ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+     int              vdwjidx0A,vdwjidx0B;
+     _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+     int              vdwjidx1A,vdwjidx1B;
+     _fjsp_v2r8       jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+     int              vdwjidx2A,vdwjidx2B;
+     _fjsp_v2r8       jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+     int              vdwjidx3A,vdwjidx3B;
+     _fjsp_v2r8       jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
+     _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+     _fjsp_v2r8       dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+     _fjsp_v2r8       dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+     _fjsp_v2r8       dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
+     _fjsp_v2r8       dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+     _fjsp_v2r8       dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+     _fjsp_v2r8       dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
+     _fjsp_v2r8       dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
+     _fjsp_v2r8       dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
+     _fjsp_v2r8       dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
+     _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+     real             *charge;
+     int              nvdwtype;
+     _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+     int              *vdwtype;
+     real             *vdwparam;
+     _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+     _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+     _fjsp_v2r8       rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF,twovfeps;
+     real             *vftab;
+     _fjsp_v2r8       itab_tmp;
+     _fjsp_v2r8       dummy_mask,cutoff_mask;
+     _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+     _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+     union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+     x                = xx[0];
+     f                = ff[0];
+     nri              = nlist->nri;
+     iinr             = nlist->iinr;
+     jindex           = nlist->jindex;
+     jjnr             = nlist->jjnr;
+     shiftidx         = nlist->shift;
+     gid              = nlist->gid;
+     shiftvec         = fr->shift_vec[0];
+     fshift           = fr->fshift[0];
+     facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+     charge           = mdatoms->chargeA;
+     nvdwtype         = fr->ntype;
+     vdwparam         = fr->nbfp;
+     vdwtype          = mdatoms->typeA;
+     vftab            = kernel_data->table_elec->data;
+     vftabscale       = gmx_fjsp_set1_v2r8(kernel_data->table_elec->scale);
+     /* Setup water-specific parameters */
+     inr              = nlist->iinr[0];
+     iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+     iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+     iq3              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+3]));
+     vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+     jq1              = gmx_fjsp_set1_v2r8(charge[inr+1]);
+     jq2              = gmx_fjsp_set1_v2r8(charge[inr+2]);
+     jq3              = gmx_fjsp_set1_v2r8(charge[inr+3]);
+     vdwjidx0A        = 2*vdwtype[inr+0];
+     c6_00            = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset0+vdwjidx0A]);
+     c12_00           = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset0+vdwjidx0A+1]);
+     qq11             = _fjsp_mul_v2r8(iq1,jq1);
+     qq12             = _fjsp_mul_v2r8(iq1,jq2);
+     qq13             = _fjsp_mul_v2r8(iq1,jq3);
+     qq21             = _fjsp_mul_v2r8(iq2,jq1);
+     qq22             = _fjsp_mul_v2r8(iq2,jq2);
+     qq23             = _fjsp_mul_v2r8(iq2,jq3);
+     qq31             = _fjsp_mul_v2r8(iq3,jq1);
+     qq32             = _fjsp_mul_v2r8(iq3,jq2);
+     qq33             = _fjsp_mul_v2r8(iq3,jq3);
+     /* Avoid stupid compiler warnings */
+     jnrA = jnrB = 0;
+     j_coord_offsetA = 0;
+     j_coord_offsetB = 0;
+     outeriter        = 0;
+     inneriter        = 0;
+     /* Start outer loop over neighborlists */
+     for(iidx=0; iidx<nri; iidx++)
+     {
+         /* Load shift vector for this list */
+         i_shift_offset   = DIM*shiftidx[iidx];
+         /* Load limits for loop over neighbors */
+         j_index_start    = jindex[iidx];
+         j_index_end      = jindex[iidx+1];
+         /* Get outer coordinate index */
+         inr              = iinr[iidx];
+         i_coord_offset   = DIM*inr;
+         /* Load i particle coords and add shift vector */
+         gmx_fjsp_load_shift_and_4rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                  &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+         fix0             = _fjsp_setzero_v2r8();
+         fiy0             = _fjsp_setzero_v2r8();
+         fiz0             = _fjsp_setzero_v2r8();
+         fix1             = _fjsp_setzero_v2r8();
+         fiy1             = _fjsp_setzero_v2r8();
+         fiz1             = _fjsp_setzero_v2r8();
+         fix2             = _fjsp_setzero_v2r8();
+         fiy2             = _fjsp_setzero_v2r8();
+         fiz2             = _fjsp_setzero_v2r8();
+         fix3             = _fjsp_setzero_v2r8();
+         fiy3             = _fjsp_setzero_v2r8();
+         fiz3             = _fjsp_setzero_v2r8();
+         /* Start inner kernel loop */
+         for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+         {
+             /* Get j neighbor index, and coordinate index */
+             jnrA             = jjnr[jidx];
+             jnrB             = jjnr[jidx+1];
+             j_coord_offsetA  = DIM*jnrA;
+             j_coord_offsetB  = DIM*jnrB;
+             /* load j atom coordinates */
+             gmx_fjsp_load_4rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
+                                               &jy2,&jz2,&jx3,&jy3,&jz3);
+             /* Calculate displacement vector */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             dx11             = _fjsp_sub_v2r8(ix1,jx1);
+             dy11             = _fjsp_sub_v2r8(iy1,jy1);
+             dz11             = _fjsp_sub_v2r8(iz1,jz1);
+             dx12             = _fjsp_sub_v2r8(ix1,jx2);
+             dy12             = _fjsp_sub_v2r8(iy1,jy2);
+             dz12             = _fjsp_sub_v2r8(iz1,jz2);
+             dx13             = _fjsp_sub_v2r8(ix1,jx3);
+             dy13             = _fjsp_sub_v2r8(iy1,jy3);
+             dz13             = _fjsp_sub_v2r8(iz1,jz3);
+             dx21             = _fjsp_sub_v2r8(ix2,jx1);
+             dy21             = _fjsp_sub_v2r8(iy2,jy1);
+             dz21             = _fjsp_sub_v2r8(iz2,jz1);
+             dx22             = _fjsp_sub_v2r8(ix2,jx2);
+             dy22             = _fjsp_sub_v2r8(iy2,jy2);
+             dz22             = _fjsp_sub_v2r8(iz2,jz2);
+             dx23             = _fjsp_sub_v2r8(ix2,jx3);
+             dy23             = _fjsp_sub_v2r8(iy2,jy3);
+             dz23             = _fjsp_sub_v2r8(iz2,jz3);
+             dx31             = _fjsp_sub_v2r8(ix3,jx1);
+             dy31             = _fjsp_sub_v2r8(iy3,jy1);
+             dz31             = _fjsp_sub_v2r8(iz3,jz1);
+             dx32             = _fjsp_sub_v2r8(ix3,jx2);
+             dy32             = _fjsp_sub_v2r8(iy3,jy2);
+             dz32             = _fjsp_sub_v2r8(iz3,jz2);
+             dx33             = _fjsp_sub_v2r8(ix3,jx3);
+             dy33             = _fjsp_sub_v2r8(iy3,jy3);
+             dz33             = _fjsp_sub_v2r8(iz3,jz3);
+             /* Calculate squared distance and things based on it */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+             rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+             rsq13            = gmx_fjsp_calc_rsq_v2r8(dx13,dy13,dz13);
+             rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+             rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+             rsq23            = gmx_fjsp_calc_rsq_v2r8(dx23,dy23,dz23);
+             rsq31            = gmx_fjsp_calc_rsq_v2r8(dx31,dy31,dz31);
+             rsq32            = gmx_fjsp_calc_rsq_v2r8(dx32,dy32,dz32);
+             rsq33            = gmx_fjsp_calc_rsq_v2r8(dx33,dy33,dz33);
+             rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+             rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+             rinv13           = gmx_fjsp_invsqrt_v2r8(rsq13);
+             rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+             rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+             rinv23           = gmx_fjsp_invsqrt_v2r8(rsq23);
+             rinv31           = gmx_fjsp_invsqrt_v2r8(rsq31);
+             rinv32           = gmx_fjsp_invsqrt_v2r8(rsq32);
+             rinv33           = gmx_fjsp_invsqrt_v2r8(rsq33);
+             rinvsq00         = gmx_fjsp_inv_v2r8(rsq00);
+             fjx0             = _fjsp_setzero_v2r8();
+             fjy0             = _fjsp_setzero_v2r8();
+             fjz0             = _fjsp_setzero_v2r8();
+             fjx1             = _fjsp_setzero_v2r8();
+             fjy1             = _fjsp_setzero_v2r8();
+             fjz1             = _fjsp_setzero_v2r8();
+             fjx2             = _fjsp_setzero_v2r8();
+             fjy2             = _fjsp_setzero_v2r8();
+             fjz2             = _fjsp_setzero_v2r8();
+             fjx3             = _fjsp_setzero_v2r8();
+             fjy3             = _fjsp_setzero_v2r8();
+             fjz3             = _fjsp_setzero_v2r8();
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* LENNARD-JONES DISPERSION/REPULSION */
+             rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+             fvdw             = _fjsp_mul_v2r8(_fjsp_msub_v2r8(c12_00,rinvsix,c6_00),_fjsp_mul_v2r8(rinvsix,rinvsq00));
+             fscal            = fvdw;
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             
+             fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r11              = _fjsp_mul_v2r8(rsq11,rinv11);
+             /* Calculate table index by multiplying r with table scale and truncate to integer */
+             rt               = _fjsp_mul_v2r8(r11,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 4;
+             vfconv.i[1]     *= 4;
+             /* CUBIC SPLINE TABLE ELECTROSTATICS */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+             H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+             FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+             felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq11,FF),_fjsp_mul_v2r8(vftabscale,rinv11)));
+             fscal            = felec;
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+             
+             fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r12              = _fjsp_mul_v2r8(rsq12,rinv12);
+             /* Calculate table index by multiplying r with table scale and truncate to integer */
+             rt               = _fjsp_mul_v2r8(r12,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 4;
+             vfconv.i[1]     *= 4;
+             /* CUBIC SPLINE TABLE ELECTROSTATICS */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+             H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+             FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+             felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq12,FF),_fjsp_mul_v2r8(vftabscale,rinv12)));
+             fscal            = felec;
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+             
+             fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r13              = _fjsp_mul_v2r8(rsq13,rinv13);
+             /* Calculate table index by multiplying r with table scale and truncate to integer */
+             rt               = _fjsp_mul_v2r8(r13,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 4;
+             vfconv.i[1]     *= 4;
+             /* CUBIC SPLINE TABLE ELECTROSTATICS */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+             H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+             FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+             felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq13,FF),_fjsp_mul_v2r8(vftabscale,rinv13)));
+             fscal            = felec;
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx13,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy13,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz13,fscal,fiz1);
+             
+             fjx3             = _fjsp_madd_v2r8(dx13,fscal,fjx3);
+             fjy3             = _fjsp_madd_v2r8(dy13,fscal,fjy3);
+             fjz3             = _fjsp_madd_v2r8(dz13,fscal,fjz3);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r21              = _fjsp_mul_v2r8(rsq21,rinv21);
+             /* Calculate table index by multiplying r with table scale and truncate to integer */
+             rt               = _fjsp_mul_v2r8(r21,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 4;
+             vfconv.i[1]     *= 4;
+             /* CUBIC SPLINE TABLE ELECTROSTATICS */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+             H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+             FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+             felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq21,FF),_fjsp_mul_v2r8(vftabscale,rinv21)));
+             fscal            = felec;
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+             
+             fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r22              = _fjsp_mul_v2r8(rsq22,rinv22);
+             /* Calculate table index by multiplying r with table scale and truncate to integer */
+             rt               = _fjsp_mul_v2r8(r22,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 4;
+             vfconv.i[1]     *= 4;
+             /* CUBIC SPLINE TABLE ELECTROSTATICS */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+             H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+             FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+             felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq22,FF),_fjsp_mul_v2r8(vftabscale,rinv22)));
+             fscal            = felec;
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+             
+             fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r23              = _fjsp_mul_v2r8(rsq23,rinv23);
+             /* Calculate table index by multiplying r with table scale and truncate to integer */
+             rt               = _fjsp_mul_v2r8(r23,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 4;
+             vfconv.i[1]     *= 4;
+             /* CUBIC SPLINE TABLE ELECTROSTATICS */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+             H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+             FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+             felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq23,FF),_fjsp_mul_v2r8(vftabscale,rinv23)));
+             fscal            = felec;
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx23,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy23,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz23,fscal,fiz2);
+             
+             fjx3             = _fjsp_madd_v2r8(dx23,fscal,fjx3);
+             fjy3             = _fjsp_madd_v2r8(dy23,fscal,fjy3);
+             fjz3             = _fjsp_madd_v2r8(dz23,fscal,fjz3);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r31              = _fjsp_mul_v2r8(rsq31,rinv31);
+             /* Calculate table index by multiplying r with table scale and truncate to integer */
+             rt               = _fjsp_mul_v2r8(r31,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 4;
+             vfconv.i[1]     *= 4;
+             /* CUBIC SPLINE TABLE ELECTROSTATICS */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+             H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+             FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+             felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq31,FF),_fjsp_mul_v2r8(vftabscale,rinv31)));
+             fscal            = felec;
+             /* Update vectorial force */
+             fix3             = _fjsp_madd_v2r8(dx31,fscal,fix3);
+             fiy3             = _fjsp_madd_v2r8(dy31,fscal,fiy3);
+             fiz3             = _fjsp_madd_v2r8(dz31,fscal,fiz3);
+             
+             fjx1             = _fjsp_madd_v2r8(dx31,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy31,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz31,fscal,fjz1);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r32              = _fjsp_mul_v2r8(rsq32,rinv32);
+             /* Calculate table index by multiplying r with table scale and truncate to integer */
+             rt               = _fjsp_mul_v2r8(r32,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 4;
+             vfconv.i[1]     *= 4;
+             /* CUBIC SPLINE TABLE ELECTROSTATICS */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+             H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+             FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+             felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq32,FF),_fjsp_mul_v2r8(vftabscale,rinv32)));
+             fscal            = felec;
+             /* Update vectorial force */
+             fix3             = _fjsp_madd_v2r8(dx32,fscal,fix3);
+             fiy3             = _fjsp_madd_v2r8(dy32,fscal,fiy3);
+             fiz3             = _fjsp_madd_v2r8(dz32,fscal,fiz3);
+             
+             fjx2             = _fjsp_madd_v2r8(dx32,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy32,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz32,fscal,fjz2);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r33              = _fjsp_mul_v2r8(rsq33,rinv33);
+             /* Calculate table index by multiplying r with table scale and truncate to integer */
+             rt               = _fjsp_mul_v2r8(r33,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 4;
+             vfconv.i[1]     *= 4;
+             /* CUBIC SPLINE TABLE ELECTROSTATICS */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+             H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+             FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+             felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq33,FF),_fjsp_mul_v2r8(vftabscale,rinv33)));
+             fscal            = felec;
+             /* Update vectorial force */
+             fix3             = _fjsp_madd_v2r8(dx33,fscal,fix3);
+             fiy3             = _fjsp_madd_v2r8(dy33,fscal,fiy3);
+             fiz3             = _fjsp_madd_v2r8(dz33,fscal,fiz3);
+             
+             fjx3             = _fjsp_madd_v2r8(dx33,fscal,fjx3);
+             fjy3             = _fjsp_madd_v2r8(dy33,fscal,fjy3);
+             fjz3             = _fjsp_madd_v2r8(dz33,fscal,fjz3);
+             gmx_fjsp_decrement_4rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+             /* Inner loop uses 411 flops */
+         }
+         if(jidx<j_index_end)
+         {
+             jnrA             = jjnr[jidx];
+             j_coord_offsetA  = DIM*jnrA;
+             /* load j atom coordinates */
+             gmx_fjsp_load_4rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
+                                               &jy2,&jz2,&jx3,&jy3,&jz3);
+             /* Calculate displacement vector */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             dx11             = _fjsp_sub_v2r8(ix1,jx1);
+             dy11             = _fjsp_sub_v2r8(iy1,jy1);
+             dz11             = _fjsp_sub_v2r8(iz1,jz1);
+             dx12             = _fjsp_sub_v2r8(ix1,jx2);
+             dy12             = _fjsp_sub_v2r8(iy1,jy2);
+             dz12             = _fjsp_sub_v2r8(iz1,jz2);
+             dx13             = _fjsp_sub_v2r8(ix1,jx3);
+             dy13             = _fjsp_sub_v2r8(iy1,jy3);
+             dz13             = _fjsp_sub_v2r8(iz1,jz3);
+             dx21             = _fjsp_sub_v2r8(ix2,jx1);
+             dy21             = _fjsp_sub_v2r8(iy2,jy1);
+             dz21             = _fjsp_sub_v2r8(iz2,jz1);
+             dx22             = _fjsp_sub_v2r8(ix2,jx2);
+             dy22             = _fjsp_sub_v2r8(iy2,jy2);
+             dz22             = _fjsp_sub_v2r8(iz2,jz2);
+             dx23             = _fjsp_sub_v2r8(ix2,jx3);
+             dy23             = _fjsp_sub_v2r8(iy2,jy3);
+             dz23             = _fjsp_sub_v2r8(iz2,jz3);
+             dx31             = _fjsp_sub_v2r8(ix3,jx1);
+             dy31             = _fjsp_sub_v2r8(iy3,jy1);
+             dz31             = _fjsp_sub_v2r8(iz3,jz1);
+             dx32             = _fjsp_sub_v2r8(ix3,jx2);
+             dy32             = _fjsp_sub_v2r8(iy3,jy2);
+             dz32             = _fjsp_sub_v2r8(iz3,jz2);
+             dx33             = _fjsp_sub_v2r8(ix3,jx3);
+             dy33             = _fjsp_sub_v2r8(iy3,jy3);
+             dz33             = _fjsp_sub_v2r8(iz3,jz3);
+             /* Calculate squared distance and things based on it */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+             rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+             rsq13            = gmx_fjsp_calc_rsq_v2r8(dx13,dy13,dz13);
+             rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+             rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+             rsq23            = gmx_fjsp_calc_rsq_v2r8(dx23,dy23,dz23);
+             rsq31            = gmx_fjsp_calc_rsq_v2r8(dx31,dy31,dz31);
+             rsq32            = gmx_fjsp_calc_rsq_v2r8(dx32,dy32,dz32);
+             rsq33            = gmx_fjsp_calc_rsq_v2r8(dx33,dy33,dz33);
+             rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+             rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+             rinv13           = gmx_fjsp_invsqrt_v2r8(rsq13);
+             rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+             rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+             rinv23           = gmx_fjsp_invsqrt_v2r8(rsq23);
+             rinv31           = gmx_fjsp_invsqrt_v2r8(rsq31);
+             rinv32           = gmx_fjsp_invsqrt_v2r8(rsq32);
+             rinv33           = gmx_fjsp_invsqrt_v2r8(rsq33);
+             rinvsq00         = gmx_fjsp_inv_v2r8(rsq00);
+             fjx0             = _fjsp_setzero_v2r8();
+             fjy0             = _fjsp_setzero_v2r8();
+             fjz0             = _fjsp_setzero_v2r8();
+             fjx1             = _fjsp_setzero_v2r8();
+             fjy1             = _fjsp_setzero_v2r8();
+             fjz1             = _fjsp_setzero_v2r8();
+             fjx2             = _fjsp_setzero_v2r8();
+             fjy2             = _fjsp_setzero_v2r8();
+             fjz2             = _fjsp_setzero_v2r8();
+             fjx3             = _fjsp_setzero_v2r8();
+             fjy3             = _fjsp_setzero_v2r8();
+             fjz3             = _fjsp_setzero_v2r8();
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* LENNARD-JONES DISPERSION/REPULSION */
+             rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+             fvdw             = _fjsp_mul_v2r8(_fjsp_msub_v2r8(c12_00,rinvsix,c6_00),_fjsp_mul_v2r8(rinvsix,rinvsq00));
+             fscal            = fvdw;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             
+             fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r11              = _fjsp_mul_v2r8(rsq11,rinv11);
+             /* Calculate table index by multiplying r with table scale and truncate to integer */
+             rt               = _fjsp_mul_v2r8(r11,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 4;
+             vfconv.i[1]     *= 4;
+             /* CUBIC SPLINE TABLE ELECTROSTATICS */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+             H                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+             FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+             felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq11,FF),_fjsp_mul_v2r8(vftabscale,rinv11)));
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+             
+             fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r12              = _fjsp_mul_v2r8(rsq12,rinv12);
+             /* Calculate table index by multiplying r with table scale and truncate to integer */
+             rt               = _fjsp_mul_v2r8(r12,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 4;
+             vfconv.i[1]     *= 4;
+             /* CUBIC SPLINE TABLE ELECTROSTATICS */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+             H                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+             FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+             felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq12,FF),_fjsp_mul_v2r8(vftabscale,rinv12)));
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+             
+             fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r13              = _fjsp_mul_v2r8(rsq13,rinv13);
+             /* Calculate table index by multiplying r with table scale and truncate to integer */
+             rt               = _fjsp_mul_v2r8(r13,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 4;
+             vfconv.i[1]     *= 4;
+             /* CUBIC SPLINE TABLE ELECTROSTATICS */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+             H                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+             FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+             felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq13,FF),_fjsp_mul_v2r8(vftabscale,rinv13)));
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx13,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy13,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz13,fscal,fiz1);
+             
+             fjx3             = _fjsp_madd_v2r8(dx13,fscal,fjx3);
+             fjy3             = _fjsp_madd_v2r8(dy13,fscal,fjy3);
+             fjz3             = _fjsp_madd_v2r8(dz13,fscal,fjz3);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r21              = _fjsp_mul_v2r8(rsq21,rinv21);
+             /* Calculate table index by multiplying r with table scale and truncate to integer */
+             rt               = _fjsp_mul_v2r8(r21,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 4;
+             vfconv.i[1]     *= 4;
+             /* CUBIC SPLINE TABLE ELECTROSTATICS */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+             H                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+             FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+             felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq21,FF),_fjsp_mul_v2r8(vftabscale,rinv21)));
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+             
+             fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r22              = _fjsp_mul_v2r8(rsq22,rinv22);
+             /* Calculate table index by multiplying r with table scale and truncate to integer */
+             rt               = _fjsp_mul_v2r8(r22,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 4;
+             vfconv.i[1]     *= 4;
+             /* CUBIC SPLINE TABLE ELECTROSTATICS */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+             H                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+             FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+             felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq22,FF),_fjsp_mul_v2r8(vftabscale,rinv22)));
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+             
+             fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r23              = _fjsp_mul_v2r8(rsq23,rinv23);
+             /* Calculate table index by multiplying r with table scale and truncate to integer */
+             rt               = _fjsp_mul_v2r8(r23,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 4;
+             vfconv.i[1]     *= 4;
+             /* CUBIC SPLINE TABLE ELECTROSTATICS */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+             H                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+             FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+             felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq23,FF),_fjsp_mul_v2r8(vftabscale,rinv23)));
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx23,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy23,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz23,fscal,fiz2);
+             
+             fjx3             = _fjsp_madd_v2r8(dx23,fscal,fjx3);
+             fjy3             = _fjsp_madd_v2r8(dy23,fscal,fjy3);
+             fjz3             = _fjsp_madd_v2r8(dz23,fscal,fjz3);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r31              = _fjsp_mul_v2r8(rsq31,rinv31);
+             /* Calculate table index by multiplying r with table scale and truncate to integer */
+             rt               = _fjsp_mul_v2r8(r31,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 4;
+             vfconv.i[1]     *= 4;
+             /* CUBIC SPLINE TABLE ELECTROSTATICS */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+             H                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+             FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+             felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq31,FF),_fjsp_mul_v2r8(vftabscale,rinv31)));
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix3             = _fjsp_madd_v2r8(dx31,fscal,fix3);
+             fiy3             = _fjsp_madd_v2r8(dy31,fscal,fiy3);
+             fiz3             = _fjsp_madd_v2r8(dz31,fscal,fiz3);
+             
+             fjx1             = _fjsp_madd_v2r8(dx31,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy31,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz31,fscal,fjz1);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r32              = _fjsp_mul_v2r8(rsq32,rinv32);
+             /* Calculate table index by multiplying r with table scale and truncate to integer */
+             rt               = _fjsp_mul_v2r8(r32,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 4;
+             vfconv.i[1]     *= 4;
+             /* CUBIC SPLINE TABLE ELECTROSTATICS */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+             H                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+             FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+             felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq32,FF),_fjsp_mul_v2r8(vftabscale,rinv32)));
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix3             = _fjsp_madd_v2r8(dx32,fscal,fix3);
+             fiy3             = _fjsp_madd_v2r8(dy32,fscal,fiy3);
+             fiz3             = _fjsp_madd_v2r8(dz32,fscal,fiz3);
+             
+             fjx2             = _fjsp_madd_v2r8(dx32,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy32,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz32,fscal,fjz2);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r33              = _fjsp_mul_v2r8(rsq33,rinv33);
+             /* Calculate table index by multiplying r with table scale and truncate to integer */
+             rt               = _fjsp_mul_v2r8(r33,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 4;
+             vfconv.i[1]     *= 4;
+             /* CUBIC SPLINE TABLE ELECTROSTATICS */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+             H                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+             FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+             felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq33,FF),_fjsp_mul_v2r8(vftabscale,rinv33)));
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix3             = _fjsp_madd_v2r8(dx33,fscal,fix3);
+             fiy3             = _fjsp_madd_v2r8(dy33,fscal,fiy3);
+             fiz3             = _fjsp_madd_v2r8(dz33,fscal,fiz3);
+             
+             fjx3             = _fjsp_madd_v2r8(dx33,fscal,fjx3);
+             fjy3             = _fjsp_madd_v2r8(dy33,fscal,fjy3);
+             fjz3             = _fjsp_madd_v2r8(dz33,fscal,fjz3);
+             gmx_fjsp_decrement_4rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+             /* Inner loop uses 411 flops */
+         }
+         /* End of innermost loop */
+         gmx_fjsp_update_iforce_4atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                               f+i_coord_offset,fshift+i_shift_offset);
+         /* Increment number of inner iterations */
+         inneriter                  += j_index_end - j_index_start;
+         /* Outer loop uses 24 flops */
+     }
+     /* Increment number of outer iterations */
+     outeriter        += nri;
+     /* Update outer/inner flops */
+     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_F,outeriter*24 + inneriter*411);
+ }
index 0000000000000000000000000000000000000000,d12f4a4acdb120e8e07c2ac4f5d7ecab5d440c1e..d12f4a4acdb120e8e07c2ac4f5d7ecab5d440c1e
mode 000000,100644..100644
--- /dev/null
@@@ -1,0 -1,564 +1,564 @@@
+ /*
+  * This file is part of the GROMACS molecular simulation package.
+  *
+  * Copyright (c) 2012, by the GROMACS development team, led by
+  * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+  * others, as listed in the AUTHORS file in the top-level source
+  * directory and at http://www.gromacs.org.
+  *
+  * GROMACS is free software; you can redistribute it and/or
+  * modify it under the terms of the GNU Lesser General Public License
+  * as published by the Free Software Foundation; either version 2.1
+  * of the License, or (at your option) any later version.
+  *
+  * GROMACS is distributed in the hope that it will be useful,
+  * but WITHOUT ANY WARRANTY; without even the implied warranty of
+  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  * Lesser General Public License for more details.
+  *
+  * You should have received a copy of the GNU Lesser General Public
+  * License along with GROMACS; if not, see
+  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+  *
+  * If you want to redistribute modifications to GROMACS, please
+  * consider that scientific software is very special. Version
+  * control is crucial - bugs must be traceable. We will be happy to
+  * consider code for inclusion in the official distribution, but
+  * derived work must not be called official GROMACS. Details are found
+  * in the README & COPYING files - if they are missing, get the
+  * official version at http://www.gromacs.org.
+  *
+  * To help us fund GROMACS development, we humbly ask that you cite
+  * the research papers on the package. Check out http://www.gromacs.org.
+  */
+ /*
+  * Note: this file was generated by the GROMACS sparc64_hpc_ace_double kernel generator.
+  */
+ #ifdef HAVE_CONFIG_H
+ #include <config.h>
+ #endif
+ #include <math.h>
+ #include "../nb_kernel.h"
+ #include "types/simple.h"
+ #include "vec.h"
+ #include "nrnb.h"
+ #include "kernelutil_sparc64_hpc_ace_double.h"
+ /*
+  * Gromacs nonbonded kernel:   nb_kernel_ElecCSTab_VdwNone_GeomP1P1_VF_sparc64_hpc_ace_double
+  * Electrostatics interaction: CubicSplineTable
+  * VdW interaction:            None
+  * Geometry:                   Particle-Particle
+  * Calculate force/pot:        PotentialAndForce
+  *
+  * Walks the neighbour list: for each i entry, its j neighbours are handled
+  * two at a time (jnrA/jnrB, one per lane of a _fjsp_v2r8), and an odd
+  * leftover j is handled in a separate single-j pass after the paired loop.
+  * For each i-j pair the tabulated electrostatic potential and scalar force
+  * are evaluated; the results are handed to the gmx_fjsp_* update helpers
+  * together with pointers into the force, shift-force and energy-group arrays.
+  *
+  * Parameters (as used by this function):
+  *   nlist       - neighbour list; nri, iinr, jindex, jjnr, shift and gid are read.
+  *   xx          - coordinates, accessed as a flat real array starting at xx[0].
+  *   ff          - forces, accessed as a flat real array starting at ff[0].
+  *   fr          - supplies shift_vec, fshift and epsfac.
+  *   mdatoms     - supplies chargeA.
+  *   kernel_data - supplies table_elec (data, scale) and energygrp_elec.
+  *   nrnb        - flop accounting, updated through inc_nrnb() at the end.
+  */
+ void
+ nb_kernel_ElecCSTab_VdwNone_GeomP1P1_VF_sparc64_hpc_ace_double
+                     (t_nblist * gmx_restrict                nlist,
+                      rvec * gmx_restrict                    xx,
+                      rvec * gmx_restrict                    ff,
+                      t_forcerec * gmx_restrict              fr,
+                      t_mdatoms * gmx_restrict               mdatoms,
+                      nb_kernel_data_t * gmx_restrict        kernel_data,
+                      t_nrnb * gmx_restrict                  nrnb)
+ {
+     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+      * just 0 for non-waters.
+      * Suffixes A,B refer to j loop unrolling done with double precision SIMD, i.e. the two different
+      * jnr indices corresponding to data put in the two positions in the SIMD register.
+      *
+      * Several locals declared below are never referenced in this kernel (for example
+      * rcutoff_scalar, tx/ty/tz, rcutoff/rcutoff2, jidxall, vdwioffset0, vdwjidx0A/B,
+      * fjx0/fjy0/fjz0, isai0/isaj0, rinvsq00, c6_00/c12_00, crf/krf/krf2, Heps,
+      * dummy_mask/cutoff_mask, gbconv/ewconv); one and two are initialised but not read.
+      */
+     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+     int              jnrA,jnrB;
+     int              j_coord_offsetA,j_coord_offsetB;
+     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+     real             rcutoff_scalar;
+     real             *shiftvec,*fshift,*x,*f;
+     _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+     int              vdwioffset0;
+     _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+     int              vdwjidx0A,vdwjidx0B;
+     _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+     _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+     _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+     real             *charge;
+     _fjsp_v2r8       rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF,twovfeps;
+     real             *vftab;
+     _fjsp_v2r8       itab_tmp;
+     _fjsp_v2r8       dummy_mask,cutoff_mask;
+     _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+     _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+     union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv; /* vfconv: read the two table indices back as integers */
+     x                = xx[0];
+     f                = ff[0];
+     nri              = nlist->nri;
+     iinr             = nlist->iinr;
+     jindex           = nlist->jindex;
+     jjnr             = nlist->jjnr;
+     shiftidx         = nlist->shift;
+     gid              = nlist->gid;
+     shiftvec         = fr->shift_vec[0];
+     fshift           = fr->fshift[0];
+     facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+     charge           = mdatoms->chargeA;
+     vftab            = kernel_data->table_elec->data;
+     vftabscale       = gmx_fjsp_set1_v2r8(kernel_data->table_elec->scale);
+     /* Pre-set the j indices/offsets to zero to avoid compiler warnings;
+      * outeriter and inneriter are the running counters used for the flop accounting.
+      */
+     jnrA = jnrB = 0;
+     j_coord_offsetA = 0;
+     j_coord_offsetB = 0;
+     outeriter        = 0;
+     inneriter        = 0;
+     /* Start outer loop over neighborlists */
+     for(iidx=0; iidx<nri; iidx++)
+     {
+         /* Load shift vector for this list */
+         i_shift_offset   = DIM*shiftidx[iidx];
+         /* Load limits for loop over neighbors: the j entries of this i are jjnr[j_index_start..j_index_end-1] */
+         j_index_start    = jindex[iidx];
+         j_index_end      = jindex[iidx+1];
+         /* Get outer coordinate index */
+         inr              = iinr[iidx];
+         i_coord_offset   = DIM*inr;
+         /* Load i particle coords and add shift vector */
+         gmx_fjsp_load_shift_and_1rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
+         fix0             = _fjsp_setzero_v2r8();
+         fiy0             = _fjsp_setzero_v2r8();
+         fiz0             = _fjsp_setzero_v2r8();
+         /* Load parameters for i particles: the i charge is pre-multiplied by facel (fr->epsfac) */
+         iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_load1_v2r8(charge+inr+0));
+         /* Reset potential sums */
+         velecsum         = _fjsp_setzero_v2r8();
+         /* Start inner kernel loop: two j entries per iteration; stops while at least a full pair remains */
+         for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+         {
+             /* Get j neighbor index, and coordinate index */
+             jnrA             = jjnr[jidx];
+             jnrB             = jjnr[jidx+1];
+             j_coord_offsetA  = DIM*jnrA;
+             j_coord_offsetB  = DIM*jnrB;
+             /* load j atom coordinates */
+             gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                               &jx0,&jy0,&jz0);
+             /* Calculate displacement vector */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             /* Calculate squared distance and things based on it */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+             /* Load parameters for j particles */
+             jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+             /* Compute parameters for interactions between i and j atoms */
+             qq00             = _fjsp_mul_v2r8(iq0,jq0);
+             /* Calculate table index by multiplying r with table scale and truncate to integer.
+              * vfeps is rt minus that integer converted back to double; twovfeps is 2*vfeps.
+              */
+             rt               = _fjsp_mul_v2r8(r00,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 4; /* 4 reals per table point: the loads below read offsets 0..3 from this index */
+             vfconv.i[1]     *= 4;
+             /* CUBIC SPLINE TABLE ELECTROSTATICS
+              * The four table values of each lane are loaded as two pairs and passed through
+              * GMX_FJSP_TRANSPOSE2_V2R8 (presumably so that Y,F,G,H each hold one value per lane).
+              * Assuming _fjsp_madd_v2r8(a,b,c) evaluates a*b+c (TODO confirm against the intrinsics docs):
+              *   Fp    = F + vfeps*(G + vfeps*H)
+              *   VV    = Y + vfeps*Fp
+              *   FF    = Fp + vfeps*(G + 2*vfeps*H)
+              *   felec = -(qq00*FF)*(vftabscale*rinv00)
+              */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+             H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+             VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+             velec            = _fjsp_mul_v2r8(qq00,VV);
+             FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+             felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,FF),_fjsp_mul_v2r8(vftabscale,rinv00)));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             /* Update vectorial force: accumulate d*fscal on the i side; the helper below gets the
+              * same fscal and displacement for the two j atoms.
+              */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             
+             gmx_fjsp_decrement_fma_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fscal,dx00,dy00,dz00);
+             /* Inner loop uses 46 flops */
+         }
+         if(jidx<j_index_end) /* true only when one unpaired j entry is left over */
+         {
+             jnrA             = jjnr[jidx];
+             j_coord_offsetA  = DIM*jnrA;
+             /* load j atom coordinates */
+             gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                               &jx0,&jy0,&jz0);
+             /* Calculate displacement vector */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             /* Calculate squared distance and things based on it */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+             /* Load parameters for j particles: only one charge is loaded, on top of a zeroed register */
+             jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+             /* Compute parameters for interactions between i and j atoms */
+             qq00             = _fjsp_mul_v2r8(iq0,jq0);
+             /* Calculate table index by multiplying r with table scale and truncate to integer */
+             rt               = _fjsp_mul_v2r8(r00,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 4;
+             vfconv.i[1]     *= 4; /* not used for any table load in this branch */
+             /* CUBIC SPLINE TABLE ELECTROSTATICS
+              * Same evaluation as in the paired loop, but only the table row at vfconv.i[0] is
+              * loaded; zeros stand in for the second row before each transpose.
+              */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+             H                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+             VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+             velec            = _fjsp_mul_v2r8(qq00,VV);
+             FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+             felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,FF),_fjsp_mul_v2r8(vftabscale,rinv00)));
+             /* Update potential sum for this i atom from the interaction with this j atom.
+              * The unpacklo with a zero register presumably keeps the first element and clears the
+              * other, so the lane without a real j atom adds nothing - TODO confirm.
+              */
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8()); /* same masking as for velec above */
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             
+             gmx_fjsp_decrement_fma_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fscal,dx00,dy00,dz00);
+             /* Inner loop uses 46 flops */
+         }
+         /* End of innermost loop: hand the accumulated i force to the helper, together with the
+          * i atom's slot in f and this list's slot in fshift.
+          */
+         gmx_fjsp_update_iforce_1atom_swizzle_v2r8(fix0,fiy0,fiz0,
+                                               f+i_coord_offset,fshift+i_shift_offset);
+         ggid                        = gid[iidx];
+         /* Update potential energies: velecsum goes to the energygrp_elec entry selected by gid[iidx] */
+         gmx_fjsp_update_1pot_v2r8(velecsum,kernel_data->energygrp_elec+ggid);
+         /* Increment number of inner iterations (counts j entries, not paired loop trips) */
+         inneriter                  += j_index_end - j_index_start;
+         /* Outer loop uses 8 flops */
+     }
+     /* Increment number of outer iterations */
+     outeriter        += nri;
+     /* Update outer/inner flops, using the per-iteration counts quoted in the loop comments */
+     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VF,outeriter*8 + inneriter*46);
+ }
+ /*
+  * Gromacs nonbonded kernel:   nb_kernel_ElecCSTab_VdwNone_GeomP1P1_F_sparc64_hpc_ace_double
+  * Electrostatics interaction: CubicSplineTable
+  * VdW interaction:            None
+  * Geometry:                   Particle-Particle
+  * Calculate force/pot:        Force
+  *
+  * Force-only kernel: no potential is evaluated and kernel_data->energygrp_elec
+  * is not touched. For each i entry of the neighbour list, its j neighbours
+  * are handled two at a time (jnrA/jnrB, one per lane of a _fjsp_v2r8), and
+  * an odd leftover j is handled in a separate single-j pass after the paired
+  * loop. The scalar force from the table is handed to the gmx_fjsp_* update
+  * helpers together with pointers into the force and shift-force arrays.
+  *
+  * Parameters (as used by this function):
+  *   nlist       - neighbour list; nri, iinr, jindex, jjnr, shift and gid are read.
+  *   xx          - coordinates, accessed as a flat real array starting at xx[0].
+  *   ff          - forces, accessed as a flat real array starting at ff[0].
+  *   fr          - supplies shift_vec, fshift and epsfac.
+  *   mdatoms     - supplies chargeA.
+  *   kernel_data - supplies table_elec (data, scale).
+  *   nrnb        - flop accounting, updated through inc_nrnb() at the end.
+  */
+ void
+ nb_kernel_ElecCSTab_VdwNone_GeomP1P1_F_sparc64_hpc_ace_double
+                     (t_nblist * gmx_restrict                nlist,
+                      rvec * gmx_restrict                    xx,
+                      rvec * gmx_restrict                    ff,
+                      t_forcerec * gmx_restrict              fr,
+                      t_mdatoms * gmx_restrict               mdatoms,
+                      nb_kernel_data_t * gmx_restrict        kernel_data,
+                      t_nrnb * gmx_restrict                  nrnb)
+ {
+     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+      * just 0 for non-waters.
+      * Suffixes A,B refer to j loop unrolling done with double precision SIMD, i.e. the two different
+      * jnr indices corresponding to data put in the two positions in the SIMD register.
+      *
+      * Several locals declared below are never referenced in this kernel (for example
+      * ggid, velec, velecsum, VV, rcutoff_scalar, tx/ty/tz, rcutoff/rcutoff2, jidxall,
+      * vdwioffset0, vdwjidx0A/B, fjx0/fjy0/fjz0, isai0/isaj0, rinvsq00, c6_00/c12_00,
+      * crf/krf/krf2, Heps, dummy_mask/cutoff_mask, gbconv/ewconv); gid is assigned but
+      * never indexed, and one and two are initialised but not read.
+      */
+     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+     int              jnrA,jnrB;
+     int              j_coord_offsetA,j_coord_offsetB;
+     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+     real             rcutoff_scalar;
+     real             *shiftvec,*fshift,*x,*f;
+     _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+     int              vdwioffset0;
+     _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+     int              vdwjidx0A,vdwjidx0B;
+     _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+     _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+     _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+     real             *charge;
+     _fjsp_v2r8       rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF,twovfeps;
+     real             *vftab;
+     _fjsp_v2r8       itab_tmp;
+     _fjsp_v2r8       dummy_mask,cutoff_mask;
+     _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+     _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+     union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv; /* vfconv: read the two table indices back as integers */
+     x                = xx[0];
+     f                = ff[0];
+     nri              = nlist->nri;
+     iinr             = nlist->iinr;
+     jindex           = nlist->jindex;
+     jjnr             = nlist->jjnr;
+     shiftidx         = nlist->shift;
+     gid              = nlist->gid;
+     shiftvec         = fr->shift_vec[0];
+     fshift           = fr->fshift[0];
+     facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+     charge           = mdatoms->chargeA;
+     vftab            = kernel_data->table_elec->data;
+     vftabscale       = gmx_fjsp_set1_v2r8(kernel_data->table_elec->scale);
+     /* Pre-set the j indices/offsets to zero to avoid compiler warnings;
+      * outeriter and inneriter are the running counters used for the flop accounting.
+      */
+     jnrA = jnrB = 0;
+     j_coord_offsetA = 0;
+     j_coord_offsetB = 0;
+     outeriter        = 0;
+     inneriter        = 0;
+     /* Start outer loop over neighborlists */
+     for(iidx=0; iidx<nri; iidx++)
+     {
+         /* Load shift vector for this list */
+         i_shift_offset   = DIM*shiftidx[iidx];
+         /* Load limits for loop over neighbors: the j entries of this i are jjnr[j_index_start..j_index_end-1] */
+         j_index_start    = jindex[iidx];
+         j_index_end      = jindex[iidx+1];
+         /* Get outer coordinate index */
+         inr              = iinr[iidx];
+         i_coord_offset   = DIM*inr;
+         /* Load i particle coords and add shift vector */
+         gmx_fjsp_load_shift_and_1rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
+         fix0             = _fjsp_setzero_v2r8();
+         fiy0             = _fjsp_setzero_v2r8();
+         fiz0             = _fjsp_setzero_v2r8();
+         /* Load parameters for i particles: the i charge is pre-multiplied by facel (fr->epsfac) */
+         iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_load1_v2r8(charge+inr+0));
+         /* Start inner kernel loop: two j entries per iteration; stops while at least a full pair remains */
+         for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+         {
+             /* Get j neighbor index, and coordinate index */
+             jnrA             = jjnr[jidx];
+             jnrB             = jjnr[jidx+1];
+             j_coord_offsetA  = DIM*jnrA;
+             j_coord_offsetB  = DIM*jnrB;
+             /* load j atom coordinates */
+             gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                               &jx0,&jy0,&jz0);
+             /* Calculate displacement vector */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             /* Calculate squared distance and things based on it */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+             /* Load parameters for j particles */
+             jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+             /* Compute parameters for interactions between i and j atoms */
+             qq00             = _fjsp_mul_v2r8(iq0,jq0);
+             /* Calculate table index by multiplying r with table scale and truncate to integer.
+              * vfeps is rt minus that integer converted back to double; twovfeps is 2*vfeps.
+              */
+             rt               = _fjsp_mul_v2r8(r00,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 4; /* 4 reals per table point: the loads below read offsets 0..3 from this index */
+             vfconv.i[1]     *= 4;
+             /* CUBIC SPLINE TABLE ELECTROSTATICS
+              * The four table values of each lane are loaded as two pairs and passed through
+              * GMX_FJSP_TRANSPOSE2_V2R8 (presumably so that Y,F,G,H each hold one value per lane).
+              * Y is an input to the first transpose but is not referenced afterwards in this kernel.
+              * Assuming _fjsp_madd_v2r8(a,b,c) evaluates a*b+c (TODO confirm against the intrinsics docs):
+              *   Fp    = F + vfeps*(G + vfeps*H)
+              *   FF    = Fp + vfeps*(G + 2*vfeps*H)
+              *   felec = -(qq00*FF)*(vftabscale*rinv00)
+              */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+             H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+             FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+             felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,FF),_fjsp_mul_v2r8(vftabscale,rinv00)));
+             fscal            = felec;
+             /* Update vectorial force: accumulate d*fscal on the i side; the helper below gets the
+              * same fscal and displacement for the two j atoms.
+              */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             
+             gmx_fjsp_decrement_fma_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fscal,dx00,dy00,dz00);
+             /* Inner loop uses 42 flops */
+         }
+         if(jidx<j_index_end) /* true only when one unpaired j entry is left over */
+         {
+             jnrA             = jjnr[jidx];
+             j_coord_offsetA  = DIM*jnrA;
+             /* load j atom coordinates */
+             gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                               &jx0,&jy0,&jz0);
+             /* Calculate displacement vector */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             /* Calculate squared distance and things based on it */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+             /* Load parameters for j particles: only one charge is loaded, on top of a zeroed register */
+             jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+             /* Compute parameters for interactions between i and j atoms */
+             qq00             = _fjsp_mul_v2r8(iq0,jq0);
+             /* Calculate table index by multiplying r with table scale and truncate to integer */
+             rt               = _fjsp_mul_v2r8(r00,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 4;
+             vfconv.i[1]     *= 4; /* not used for any table load in this branch */
+             /* CUBIC SPLINE TABLE ELECTROSTATICS
+              * Same evaluation as in the paired loop, but only the table row at vfconv.i[0] is
+              * loaded; zeros stand in for the second row before each transpose.
+              */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+             H                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+             FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+             felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,FF),_fjsp_mul_v2r8(vftabscale,rinv00)));
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8()); /* presumably keeps the first element and clears the other, so the lane without a real j atom adds no force - TODO confirm */
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             
+             gmx_fjsp_decrement_fma_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fscal,dx00,dy00,dz00);
+             /* Inner loop uses 42 flops */
+         }
+         /* End of innermost loop: hand the accumulated i force to the helper, together with the
+          * i atom's slot in f and this list's slot in fshift.
+          */
+         gmx_fjsp_update_iforce_1atom_swizzle_v2r8(fix0,fiy0,fiz0,
+                                               f+i_coord_offset,fshift+i_shift_offset);
+         /* Increment number of inner iterations (counts j entries, not paired loop trips) */
+         inneriter                  += j_index_end - j_index_start;
+         /* Outer loop uses 7 flops */
+     }
+     /* Increment number of outer iterations */
+     outeriter        += nri;
+     /* Update outer/inner flops, using the per-iteration counts quoted in the loop comments */
+     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_F,outeriter*7 + inneriter*42);
+ }
index 0000000000000000000000000000000000000000,6f9389556170b9db66a123e2111ac4ce8f796155..6f9389556170b9db66a123e2111ac4ce8f796155
mode 000000,100644..100644
--- /dev/null
@@@ -1,0 -1,1026 +1,1026 @@@
+ /*
+  * This file is part of the GROMACS molecular simulation package.
+  *
+  * Copyright (c) 2012, by the GROMACS development team, led by
+  * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+  * others, as listed in the AUTHORS file in the top-level source
+  * directory and at http://www.gromacs.org.
+  *
+  * GROMACS is free software; you can redistribute it and/or
+  * modify it under the terms of the GNU Lesser General Public License
+  * as published by the Free Software Foundation; either version 2.1
+  * of the License, or (at your option) any later version.
+  *
+  * GROMACS is distributed in the hope that it will be useful,
+  * but WITHOUT ANY WARRANTY; without even the implied warranty of
+  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  * Lesser General Public License for more details.
+  *
+  * You should have received a copy of the GNU Lesser General Public
+  * License along with GROMACS; if not, see
+  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+  *
+  * If you want to redistribute modifications to GROMACS, please
+  * consider that scientific software is very special. Version
+  * control is crucial - bugs must be traceable. We will be happy to
+  * consider code for inclusion in the official distribution, but
+  * derived work must not be called official GROMACS. Details are found
+  * in the README & COPYING files - if they are missing, get the
+  * official version at http://www.gromacs.org.
+  *
+  * To help us fund GROMACS development, we humbly ask that you cite
+  * the research papers on the package. Check out http://www.gromacs.org.
+  */
+ /*
+  * Note: this file was generated by the GROMACS sparc64_hpc_ace_double kernel generator.
+  */
+ #ifdef HAVE_CONFIG_H
+ #include <config.h>
+ #endif
+ #include <math.h>
+ #include "../nb_kernel.h"
+ #include "types/simple.h"
+ #include "vec.h"
+ #include "nrnb.h"
+ #include "kernelutil_sparc64_hpc_ace_double.h"
+ /*
+  * Gromacs nonbonded kernel:   nb_kernel_ElecCSTab_VdwNone_GeomW3P1_VF_sparc64_hpc_ace_double
+  * Electrostatics interaction: CubicSplineTable
+  * VdW interaction:            None
+  * Geometry:                   Water3-Particle
+  * Calculate force/pot:        PotentialAndForce
+  */
+ void
+ nb_kernel_ElecCSTab_VdwNone_GeomW3P1_VF_sparc64_hpc_ace_double
+                     (t_nblist * gmx_restrict                nlist,
+                      rvec * gmx_restrict                    xx,
+                      rvec * gmx_restrict                    ff,
+                      t_forcerec * gmx_restrict              fr,
+                      t_mdatoms * gmx_restrict               mdatoms,
+                      nb_kernel_data_t * gmx_restrict        kernel_data,
+                      t_nrnb * gmx_restrict                  nrnb)
+ {
+     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+      * just 0 for non-waters.
+      * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+      * jnr indices corresponding to data put in the four positions in the SIMD register.
+      */
+     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+     int              jnrA,jnrB;
+     int              j_coord_offsetA,j_coord_offsetB;
+     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+     real             rcutoff_scalar;
+     real             *shiftvec,*fshift,*x,*f;
+     _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+     int              vdwioffset0;
+     _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+     int              vdwioffset1;
+     _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+     int              vdwioffset2;
+     _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+     int              vdwjidx0A,vdwjidx0B;
+     _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+     _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+     _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+     _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+     _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+     real             *charge;
+     _fjsp_v2r8       rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF,twovfeps;
+     real             *vftab;
+     _fjsp_v2r8       itab_tmp;
+     _fjsp_v2r8       dummy_mask,cutoff_mask;
+     _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+     _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+     union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+     x                = xx[0];
+     f                = ff[0];
+     nri              = nlist->nri;
+     iinr             = nlist->iinr;
+     jindex           = nlist->jindex;
+     jjnr             = nlist->jjnr;
+     shiftidx         = nlist->shift;
+     gid              = nlist->gid;
+     shiftvec         = fr->shift_vec[0];
+     fshift           = fr->fshift[0];
+     facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+     charge           = mdatoms->chargeA;
+     vftab            = kernel_data->table_elec->data;
+     vftabscale       = gmx_fjsp_set1_v2r8(kernel_data->table_elec->scale);
+     /* Setup water-specific parameters */
+     inr              = nlist->iinr[0];
+     iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+0]));
+     iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+     iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+     /* Avoid stupid compiler warnings */
+     jnrA = jnrB = 0;
+     j_coord_offsetA = 0;
+     j_coord_offsetB = 0;
+     outeriter        = 0;
+     inneriter        = 0;
+     /* Start outer loop over neighborlists */
+     for(iidx=0; iidx<nri; iidx++)
+     {
+         /* Load shift vector for this list */
+         i_shift_offset   = DIM*shiftidx[iidx];
+         /* Load limits for loop over neighbors */
+         j_index_start    = jindex[iidx];
+         j_index_end      = jindex[iidx+1];
+         /* Get outer coordinate index */
+         inr              = iinr[iidx];
+         i_coord_offset   = DIM*inr;
+         /* Load i particle coords and add shift vector */
+         gmx_fjsp_load_shift_and_3rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                  &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+         fix0             = _fjsp_setzero_v2r8();
+         fiy0             = _fjsp_setzero_v2r8();
+         fiz0             = _fjsp_setzero_v2r8();
+         fix1             = _fjsp_setzero_v2r8();
+         fiy1             = _fjsp_setzero_v2r8();
+         fiz1             = _fjsp_setzero_v2r8();
+         fix2             = _fjsp_setzero_v2r8();
+         fiy2             = _fjsp_setzero_v2r8();
+         fiz2             = _fjsp_setzero_v2r8();
+         /* Reset potential sums */
+         velecsum         = _fjsp_setzero_v2r8();
+         /* Start inner kernel loop */
+         for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+         {
+             /* Get j neighbor index, and coordinate index */
+             jnrA             = jjnr[jidx];
+             jnrB             = jjnr[jidx+1];
+             j_coord_offsetA  = DIM*jnrA;
+             j_coord_offsetB  = DIM*jnrB;
+             /* load j atom coordinates */
+             gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                               &jx0,&jy0,&jz0);
+             /* Calculate displacement vector */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             dx10             = _fjsp_sub_v2r8(ix1,jx0);
+             dy10             = _fjsp_sub_v2r8(iy1,jy0);
+             dz10             = _fjsp_sub_v2r8(iz1,jz0);
+             dx20             = _fjsp_sub_v2r8(ix2,jx0);
+             dy20             = _fjsp_sub_v2r8(iy2,jy0);
+             dz20             = _fjsp_sub_v2r8(iz2,jz0);
+             /* Calculate squared distance and things based on it */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+             rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+             rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+             rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+             rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+             /* Load parameters for j particles */
+             jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+             fjx0             = _fjsp_setzero_v2r8();
+             fjy0             = _fjsp_setzero_v2r8();
+             fjz0             = _fjsp_setzero_v2r8();
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+             /* Compute parameters for interactions between i and j atoms */
+             qq00             = _fjsp_mul_v2r8(iq0,jq0);
+             /* Calculate table index by multiplying r with table scale and truncate to integer */
+             rt               = _fjsp_mul_v2r8(r00,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 4;
+             vfconv.i[1]     *= 4;
+             /* CUBIC SPLINE TABLE ELECTROSTATICS */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+             H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+             VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+             velec            = _fjsp_mul_v2r8(qq00,VV);
+             FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+             felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,FF),_fjsp_mul_v2r8(vftabscale,rinv00)));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             
+             fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+             /* Compute parameters for interactions between i and j atoms */
+             qq10             = _fjsp_mul_v2r8(iq1,jq0);
+             /* Calculate table index by multiplying r with table scale and truncate to integer */
+             rt               = _fjsp_mul_v2r8(r10,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 4;
+             vfconv.i[1]     *= 4;
+             /* CUBIC SPLINE TABLE ELECTROSTATICS */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+             H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+             VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+             velec            = _fjsp_mul_v2r8(qq10,VV);
+             FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+             felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,FF),_fjsp_mul_v2r8(vftabscale,rinv10)));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+             
+             fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+             /* Compute parameters for interactions between i and j atoms */
+             qq20             = _fjsp_mul_v2r8(iq2,jq0);
+             /* Calculate table index by multiplying r with table scale and truncate to integer */
+             rt               = _fjsp_mul_v2r8(r20,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 4;
+             vfconv.i[1]     *= 4;
+             /* CUBIC SPLINE TABLE ELECTROSTATICS */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+             H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+             VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+             velec            = _fjsp_mul_v2r8(qq20,VV);
+             FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+             felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,FF),_fjsp_mul_v2r8(vftabscale,rinv20)));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+             
+             fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+             gmx_fjsp_decrement_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0);
+             /* Inner loop uses 141 flops */
+         }
+         if(jidx<j_index_end)
+         {
+             jnrA             = jjnr[jidx];
+             j_coord_offsetA  = DIM*jnrA;
+             /* load j atom coordinates */
+             gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                               &jx0,&jy0,&jz0);
+             /* Calculate displacement vector */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             dx10             = _fjsp_sub_v2r8(ix1,jx0);
+             dy10             = _fjsp_sub_v2r8(iy1,jy0);
+             dz10             = _fjsp_sub_v2r8(iz1,jz0);
+             dx20             = _fjsp_sub_v2r8(ix2,jx0);
+             dy20             = _fjsp_sub_v2r8(iy2,jy0);
+             dz20             = _fjsp_sub_v2r8(iz2,jz0);
+             /* Calculate squared distance and things based on it */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+             rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+             rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+             rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+             rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+             /* Load parameters for j particles */
+             jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
+             fjx0             = _fjsp_setzero_v2r8();
+             fjy0             = _fjsp_setzero_v2r8();
+             fjz0             = _fjsp_setzero_v2r8();
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+             /* Compute parameters for interactions between i and j atoms */
+             qq00             = _fjsp_mul_v2r8(iq0,jq0);
+             /* Calculate table index by multiplying r with table scale and truncate to integer */
+             rt               = _fjsp_mul_v2r8(r00,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 4;
+             vfconv.i[1]     *= 4;
+             /* CUBIC SPLINE TABLE ELECTROSTATICS */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+             H                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+             VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+             velec            = _fjsp_mul_v2r8(qq00,VV);
+             FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+             felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,FF),_fjsp_mul_v2r8(vftabscale,rinv00)));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             
+             fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+             /* Compute parameters for interactions between i and j atoms */
+             qq10             = _fjsp_mul_v2r8(iq1,jq0);
+             /* Calculate table index by multiplying r with table scale and truncate to integer */
+             rt               = _fjsp_mul_v2r8(r10,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 4;
+             vfconv.i[1]     *= 4;
+             /* CUBIC SPLINE TABLE ELECTROSTATICS */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+             H                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+             VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+             velec            = _fjsp_mul_v2r8(qq10,VV);
+             FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+             felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,FF),_fjsp_mul_v2r8(vftabscale,rinv10)));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+             
+             fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+             /* Compute parameters for interactions between i and j atoms */
+             qq20             = _fjsp_mul_v2r8(iq2,jq0);
+             /* Calculate table index by multiplying r with table scale and truncate to integer */
+             rt               = _fjsp_mul_v2r8(r20,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 4;
+             vfconv.i[1]     *= 4;
+             /* CUBIC SPLINE TABLE ELECTROSTATICS */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+             H                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+             VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+             velec            = _fjsp_mul_v2r8(qq20,VV);
+             FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+             felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,FF),_fjsp_mul_v2r8(vftabscale,rinv20)));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+             
+             fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+             gmx_fjsp_decrement_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0);
+             /* Inner loop uses 141 flops */
+         }
+         /* End of innermost loop */
+         gmx_fjsp_update_iforce_3atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                               f+i_coord_offset,fshift+i_shift_offset);
+         ggid                        = gid[iidx];
+         /* Update potential energies */
+         gmx_fjsp_update_1pot_v2r8(velecsum,kernel_data->energygrp_elec+ggid);
+         /* Increment number of inner iterations */
+         inneriter                  += j_index_end - j_index_start;
+         /* Outer loop uses 19 flops */
+     }
+     /* Increment number of outer iterations */
+     outeriter        += nri;
+     /* Update outer/inner flops */
+     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3_VF,outeriter*19 + inneriter*141);
+ }
+ /*
+  * Gromacs nonbonded kernel:   nb_kernel_ElecCSTab_VdwNone_GeomW3P1_F_sparc64_hpc_ace_double
+  * Electrostatics interaction: CubicSplineTable
+  * VdW interaction:            None
+  * Geometry:                   Water3-Particle
+  * Calculate force/pot:        Force
+  */
+ void
+ nb_kernel_ElecCSTab_VdwNone_GeomW3P1_F_sparc64_hpc_ace_double
+                     (t_nblist * gmx_restrict                nlist,
+                      rvec * gmx_restrict                    xx,
+                      rvec * gmx_restrict                    ff,
+                      t_forcerec * gmx_restrict              fr,
+                      t_mdatoms * gmx_restrict               mdatoms,
+                      nb_kernel_data_t * gmx_restrict        kernel_data,
+                      t_nrnb * gmx_restrict                  nrnb)
+ {
+     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+      * just 0 for non-waters.
+      * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+      * jnr indices corresponding to data put in the two positions in the SIMD register.
+      */
+     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+     int              jnrA,jnrB;
+     int              j_coord_offsetA,j_coord_offsetB;
+     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+     real             rcutoff_scalar;
+     real             *shiftvec,*fshift,*x,*f;
+     _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+     int              vdwioffset0;
+     _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+     int              vdwioffset1;
+     _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+     int              vdwioffset2;
+     _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+     int              vdwjidx0A,vdwjidx0B;
+     _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+     _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+     _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+     _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+     _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+     real             *charge;
+     _fjsp_v2r8       rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF,twovfeps;
+     real             *vftab;
+     _fjsp_v2r8       itab_tmp;
+     _fjsp_v2r8       dummy_mask,cutoff_mask;
+     _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+     _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+     union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+     x                = xx[0];
+     f                = ff[0];
+     nri              = nlist->nri;
+     iinr             = nlist->iinr;
+     jindex           = nlist->jindex;
+     jjnr             = nlist->jjnr;
+     shiftidx         = nlist->shift;
+     gid              = nlist->gid;
+     shiftvec         = fr->shift_vec[0];
+     fshift           = fr->fshift[0];
+     facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+     charge           = mdatoms->chargeA;
+     vftab            = kernel_data->table_elec->data;
+     vftabscale       = gmx_fjsp_set1_v2r8(kernel_data->table_elec->scale);
+     /* Setup water-specific parameters */
+     inr              = nlist->iinr[0];
+     iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+0]));
+     iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+     iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+     /* Avoid stupid compiler warnings */
+     jnrA = jnrB = 0;
+     j_coord_offsetA = 0;
+     j_coord_offsetB = 0;
+     outeriter        = 0;
+     inneriter        = 0;
+     /* Start outer loop over neighborlists */
+     for(iidx=0; iidx<nri; iidx++)
+     {
+         /* Load shift vector for this list */
+         i_shift_offset   = DIM*shiftidx[iidx];
+         /* Load limits for loop over neighbors */
+         j_index_start    = jindex[iidx];
+         j_index_end      = jindex[iidx+1];
+         /* Get outer coordinate index */
+         inr              = iinr[iidx];
+         i_coord_offset   = DIM*inr;
+         /* Load i particle coords and add shift vector */
+         gmx_fjsp_load_shift_and_3rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                  &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+         fix0             = _fjsp_setzero_v2r8();
+         fiy0             = _fjsp_setzero_v2r8();
+         fiz0             = _fjsp_setzero_v2r8();
+         fix1             = _fjsp_setzero_v2r8();
+         fiy1             = _fjsp_setzero_v2r8();
+         fiz1             = _fjsp_setzero_v2r8();
+         fix2             = _fjsp_setzero_v2r8();
+         fiy2             = _fjsp_setzero_v2r8();
+         fiz2             = _fjsp_setzero_v2r8();
+         /* Start inner kernel loop */
+         for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+         {
+             /* Get j neighbor index, and coordinate index */
+             jnrA             = jjnr[jidx];
+             jnrB             = jjnr[jidx+1];
+             j_coord_offsetA  = DIM*jnrA;
+             j_coord_offsetB  = DIM*jnrB;
+             /* load j atom coordinates */
+             gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                               &jx0,&jy0,&jz0);
+             /* Calculate displacement vector */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             dx10             = _fjsp_sub_v2r8(ix1,jx0);
+             dy10             = _fjsp_sub_v2r8(iy1,jy0);
+             dz10             = _fjsp_sub_v2r8(iz1,jz0);
+             dx20             = _fjsp_sub_v2r8(ix2,jx0);
+             dy20             = _fjsp_sub_v2r8(iy2,jy0);
+             dz20             = _fjsp_sub_v2r8(iz2,jz0);
+             /* Calculate squared distance and things based on it */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+             rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+             rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+             rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+             rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+             /* Load parameters for j particles */
+             jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+             fjx0             = _fjsp_setzero_v2r8();
+             fjy0             = _fjsp_setzero_v2r8();
+             fjz0             = _fjsp_setzero_v2r8();
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+             /* Compute parameters for interactions between i and j atoms */
+             qq00             = _fjsp_mul_v2r8(iq0,jq0);
+             /* Calculate table index by multiplying r with table scale and truncate to integer */
+             rt               = _fjsp_mul_v2r8(r00,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 4;
+             vfconv.i[1]     *= 4;
+             /* CUBIC SPLINE TABLE ELECTROSTATICS */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+             H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+             FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+             felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,FF),_fjsp_mul_v2r8(vftabscale,rinv00)));
+             fscal            = felec;
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             
+             fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+             /* Compute parameters for interactions between i and j atoms */
+             qq10             = _fjsp_mul_v2r8(iq1,jq0);
+             /* Calculate table index by multiplying r with table scale and truncate to integer */
+             rt               = _fjsp_mul_v2r8(r10,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 4;
+             vfconv.i[1]     *= 4;
+             /* CUBIC SPLINE TABLE ELECTROSTATICS */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+             H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+             FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+             felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,FF),_fjsp_mul_v2r8(vftabscale,rinv10)));
+             fscal            = felec;
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+             
+             fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+             /* Compute parameters for interactions between i and j atoms */
+             qq20             = _fjsp_mul_v2r8(iq2,jq0);
+             /* Calculate table index by multiplying r with table scale and truncate to integer */
+             rt               = _fjsp_mul_v2r8(r20,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 4;
+             vfconv.i[1]     *= 4;
+             /* CUBIC SPLINE TABLE ELECTROSTATICS */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+             H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+             FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+             felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,FF),_fjsp_mul_v2r8(vftabscale,rinv20)));
+             fscal            = felec;
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+             
+             fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+             gmx_fjsp_decrement_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0);
+             /* Inner loop uses 129 flops */
+         }
+         if(jidx<j_index_end)
+         {
+             jnrA             = jjnr[jidx];
+             j_coord_offsetA  = DIM*jnrA;
+             /* load j atom coordinates */
+             gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                               &jx0,&jy0,&jz0);
+             /* Calculate displacement vector */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             dx10             = _fjsp_sub_v2r8(ix1,jx0);
+             dy10             = _fjsp_sub_v2r8(iy1,jy0);
+             dz10             = _fjsp_sub_v2r8(iz1,jz0);
+             dx20             = _fjsp_sub_v2r8(ix2,jx0);
+             dy20             = _fjsp_sub_v2r8(iy2,jy0);
+             dz20             = _fjsp_sub_v2r8(iz2,jz0);
+             /* Calculate squared distance and things based on it */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+             rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+             rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+             rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+             rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+             /* Load parameters for j particles */
+             jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
+             fjx0             = _fjsp_setzero_v2r8();
+             fjy0             = _fjsp_setzero_v2r8();
+             fjz0             = _fjsp_setzero_v2r8();
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+             /* Compute parameters for interactions between i and j atoms */
+             qq00             = _fjsp_mul_v2r8(iq0,jq0);
+             /* Calculate table index by multiplying r with table scale and truncate to integer */
+             rt               = _fjsp_mul_v2r8(r00,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 4;
+             vfconv.i[1]     *= 4;
+             /* CUBIC SPLINE TABLE ELECTROSTATICS */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+             H                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+             FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+             felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,FF),_fjsp_mul_v2r8(vftabscale,rinv00)));
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             
+             fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+             /* Compute parameters for interactions between i and j atoms */
+             qq10             = _fjsp_mul_v2r8(iq1,jq0);
+             /* Calculate table index by multiplying r with table scale and truncate to integer */
+             rt               = _fjsp_mul_v2r8(r10,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 4;
+             vfconv.i[1]     *= 4;
+             /* CUBIC SPLINE TABLE ELECTROSTATICS */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+             H                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+             FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+             felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,FF),_fjsp_mul_v2r8(vftabscale,rinv10)));
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+             
+             fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+             /* Compute parameters for interactions between i and j atoms */
+             qq20             = _fjsp_mul_v2r8(iq2,jq0);
+             /* Calculate table index by multiplying r with table scale and truncate to integer */
+             rt               = _fjsp_mul_v2r8(r20,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 4;
+             vfconv.i[1]     *= 4;
+             /* CUBIC SPLINE TABLE ELECTROSTATICS */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+             H                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+             FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+             felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,FF),_fjsp_mul_v2r8(vftabscale,rinv20)));
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+             
+             fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+             gmx_fjsp_decrement_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0);
+             /* Inner loop uses 129 flops */
+         }
+         /* End of innermost loop */
+         gmx_fjsp_update_iforce_3atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                               f+i_coord_offset,fshift+i_shift_offset);
+         /* Increment number of inner iterations */
+         inneriter                  += j_index_end - j_index_start;
+         /* Outer loop uses 18 flops */
+     }
+     /* Increment number of outer iterations */
+     outeriter        += nri;
+     /* Update outer/inner flops */
+     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3_F,outeriter*18 + inneriter*129);
+ }
index 0000000000000000000000000000000000000000,d5e0083aad213b8ba3f14a83358393c6696433b1..d5e0083aad213b8ba3f14a83358393c6696433b1
mode 000000,100644..100644
--- /dev/null
@@@ -1,0 -1,2170 +1,2170 @@@
+ /*
+  * This file is part of the GROMACS molecular simulation package.
+  *
+  * Copyright (c) 2012, by the GROMACS development team, led by
+  * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+  * others, as listed in the AUTHORS file in the top-level source
+  * directory and at http://www.gromacs.org.
+  *
+  * GROMACS is free software; you can redistribute it and/or
+  * modify it under the terms of the GNU Lesser General Public License
+  * as published by the Free Software Foundation; either version 2.1
+  * of the License, or (at your option) any later version.
+  *
+  * GROMACS is distributed in the hope that it will be useful,
+  * but WITHOUT ANY WARRANTY; without even the implied warranty of
+  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  * Lesser General Public License for more details.
+  *
+  * You should have received a copy of the GNU Lesser General Public
+  * License along with GROMACS; if not, see
+  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+  *
+  * If you want to redistribute modifications to GROMACS, please
+  * consider that scientific software is very special. Version
+  * control is crucial - bugs must be traceable. We will be happy to
+  * consider code for inclusion in the official distribution, but
+  * derived work must not be called official GROMACS. Details are found
+  * in the README & COPYING files - if they are missing, get the
+  * official version at http://www.gromacs.org.
+  *
+  * To help us fund GROMACS development, we humbly ask that you cite
+  * the research papers on the package. Check out http://www.gromacs.org.
+  */
+ /*
+  * Note: this file was generated by the GROMACS sparc64_hpc_ace_double kernel generator.
+  */
+ #ifdef HAVE_CONFIG_H
+ #include <config.h>
+ #endif
+ #include <math.h>
+ #include "../nb_kernel.h"
+ #include "types/simple.h"
+ #include "vec.h"
+ #include "nrnb.h"
+ #include "kernelutil_sparc64_hpc_ace_double.h"
+ /*
+  * Gromacs nonbonded kernel:   nb_kernel_ElecCSTab_VdwNone_GeomW3W3_VF_sparc64_hpc_ace_double
+  * Electrostatics interaction: CubicSplineTable
+  * VdW interaction:            None
+  * Geometry:                   Water3-Water3
+  * Calculate force/pot:        PotentialAndForce
+  */
+ void
+ nb_kernel_ElecCSTab_VdwNone_GeomW3W3_VF_sparc64_hpc_ace_double
+                     (t_nblist * gmx_restrict                nlist,
+                      rvec * gmx_restrict                    xx,
+                      rvec * gmx_restrict                    ff,
+                      t_forcerec * gmx_restrict              fr,
+                      t_mdatoms * gmx_restrict               mdatoms,
+                      nb_kernel_data_t * gmx_restrict        kernel_data,
+                      t_nrnb * gmx_restrict                  nrnb)
+ {
+     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+      * just 0 for non-waters.
+      * Suffixes A,B refer to j loop unrolling done with double precision SIMD, i.e. the two different
+      * jnr indices corresponding to data put in the two positions in the SIMD register.
+      */
+     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+     int              jnrA,jnrB;
+     int              j_coord_offsetA,j_coord_offsetB;
+     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+     real             rcutoff_scalar;
+     real             *shiftvec,*fshift,*x,*f;
+     _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+     int              vdwioffset0;
+     _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+     int              vdwioffset1;
+     _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+     int              vdwioffset2;
+     _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+     int              vdwjidx0A,vdwjidx0B;
+     _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+     int              vdwjidx1A,vdwjidx1B;
+     _fjsp_v2r8       jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+     int              vdwjidx2A,vdwjidx2B;
+     _fjsp_v2r8       jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+     _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+     _fjsp_v2r8       dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
+     _fjsp_v2r8       dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
+     _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+     _fjsp_v2r8       dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+     _fjsp_v2r8       dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+     _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+     _fjsp_v2r8       dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+     _fjsp_v2r8       dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+     _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+     real             *charge;
+     _fjsp_v2r8       rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF,twovfeps;
+     real             *vftab;
+     _fjsp_v2r8       itab_tmp;
+     _fjsp_v2r8       dummy_mask,cutoff_mask;
+     _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+     _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+     union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+     x                = xx[0];
+     f                = ff[0];
+     nri              = nlist->nri;
+     iinr             = nlist->iinr;
+     jindex           = nlist->jindex;
+     jjnr             = nlist->jjnr;
+     shiftidx         = nlist->shift;
+     gid              = nlist->gid;
+     shiftvec         = fr->shift_vec[0];
+     fshift           = fr->fshift[0];
+     facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+     charge           = mdatoms->chargeA;
+     vftab            = kernel_data->table_elec->data;
+     vftabscale       = gmx_fjsp_set1_v2r8(kernel_data->table_elec->scale);
+     /* Setup water-specific parameters */
+     inr              = nlist->iinr[0];
+     iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+0]));
+     iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+     iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+     jq0              = gmx_fjsp_set1_v2r8(charge[inr+0]);
+     jq1              = gmx_fjsp_set1_v2r8(charge[inr+1]);
+     jq2              = gmx_fjsp_set1_v2r8(charge[inr+2]);
+     qq00             = _fjsp_mul_v2r8(iq0,jq0);
+     qq01             = _fjsp_mul_v2r8(iq0,jq1);
+     qq02             = _fjsp_mul_v2r8(iq0,jq2);
+     qq10             = _fjsp_mul_v2r8(iq1,jq0);
+     qq11             = _fjsp_mul_v2r8(iq1,jq1);
+     qq12             = _fjsp_mul_v2r8(iq1,jq2);
+     qq20             = _fjsp_mul_v2r8(iq2,jq0);
+     qq21             = _fjsp_mul_v2r8(iq2,jq1);
+     qq22             = _fjsp_mul_v2r8(iq2,jq2);
+     /* Avoid stupid compiler warnings */
+     jnrA = jnrB = 0;
+     j_coord_offsetA = 0;
+     j_coord_offsetB = 0;
+     outeriter        = 0;
+     inneriter        = 0;
+     /* Start outer loop over neighborlists */
+     for(iidx=0; iidx<nri; iidx++)
+     {
+         /* Load shift vector for this list */
+         i_shift_offset   = DIM*shiftidx[iidx];
+         /* Load limits for loop over neighbors */
+         j_index_start    = jindex[iidx];
+         j_index_end      = jindex[iidx+1];
+         /* Get outer coordinate index */
+         inr              = iinr[iidx];
+         i_coord_offset   = DIM*inr;
+         /* Load i particle coords and add shift vector */
+         gmx_fjsp_load_shift_and_3rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                  &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+         fix0             = _fjsp_setzero_v2r8();
+         fiy0             = _fjsp_setzero_v2r8();
+         fiz0             = _fjsp_setzero_v2r8();
+         fix1             = _fjsp_setzero_v2r8();
+         fiy1             = _fjsp_setzero_v2r8();
+         fiz1             = _fjsp_setzero_v2r8();
+         fix2             = _fjsp_setzero_v2r8();
+         fiy2             = _fjsp_setzero_v2r8();
+         fiz2             = _fjsp_setzero_v2r8();
+         /* Reset potential sums */
+         velecsum         = _fjsp_setzero_v2r8();
+         /* Start inner kernel loop */
+         for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+         {
+             /* Get j neighbor index, and coordinate index */
+             jnrA             = jjnr[jidx];
+             jnrB             = jjnr[jidx+1];
+             j_coord_offsetA  = DIM*jnrA;
+             j_coord_offsetB  = DIM*jnrB;
+             /* load j atom coordinates */
+             gmx_fjsp_load_3rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+             /* Calculate displacement vector */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             dx01             = _fjsp_sub_v2r8(ix0,jx1);
+             dy01             = _fjsp_sub_v2r8(iy0,jy1);
+             dz01             = _fjsp_sub_v2r8(iz0,jz1);
+             dx02             = _fjsp_sub_v2r8(ix0,jx2);
+             dy02             = _fjsp_sub_v2r8(iy0,jy2);
+             dz02             = _fjsp_sub_v2r8(iz0,jz2);
+             dx10             = _fjsp_sub_v2r8(ix1,jx0);
+             dy10             = _fjsp_sub_v2r8(iy1,jy0);
+             dz10             = _fjsp_sub_v2r8(iz1,jz0);
+             dx11             = _fjsp_sub_v2r8(ix1,jx1);
+             dy11             = _fjsp_sub_v2r8(iy1,jy1);
+             dz11             = _fjsp_sub_v2r8(iz1,jz1);
+             dx12             = _fjsp_sub_v2r8(ix1,jx2);
+             dy12             = _fjsp_sub_v2r8(iy1,jy2);
+             dz12             = _fjsp_sub_v2r8(iz1,jz2);
+             dx20             = _fjsp_sub_v2r8(ix2,jx0);
+             dy20             = _fjsp_sub_v2r8(iy2,jy0);
+             dz20             = _fjsp_sub_v2r8(iz2,jz0);
+             dx21             = _fjsp_sub_v2r8(ix2,jx1);
+             dy21             = _fjsp_sub_v2r8(iy2,jy1);
+             dz21             = _fjsp_sub_v2r8(iz2,jz1);
+             dx22             = _fjsp_sub_v2r8(ix2,jx2);
+             dy22             = _fjsp_sub_v2r8(iy2,jy2);
+             dz22             = _fjsp_sub_v2r8(iz2,jz2);
+             /* Calculate squared distance and things based on it */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rsq01            = gmx_fjsp_calc_rsq_v2r8(dx01,dy01,dz01);
+             rsq02            = gmx_fjsp_calc_rsq_v2r8(dx02,dy02,dz02);
+             rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+             rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+             rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+             rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+             rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+             rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+             rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+             rinv01           = gmx_fjsp_invsqrt_v2r8(rsq01);
+             rinv02           = gmx_fjsp_invsqrt_v2r8(rsq02);
+             rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+             rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+             rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+             rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+             rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+             rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+             fjx0             = _fjsp_setzero_v2r8();
+             fjy0             = _fjsp_setzero_v2r8();
+             fjz0             = _fjsp_setzero_v2r8();
+             fjx1             = _fjsp_setzero_v2r8();
+             fjy1             = _fjsp_setzero_v2r8();
+             fjz1             = _fjsp_setzero_v2r8();
+             fjx2             = _fjsp_setzero_v2r8();
+             fjy2             = _fjsp_setzero_v2r8();
+             fjz2             = _fjsp_setzero_v2r8();
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+             /* Calculate table index by multiplying r with table scale and truncate to integer */
+             rt               = _fjsp_mul_v2r8(r00,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 4;
+             vfconv.i[1]     *= 4;
+             /* CUBIC SPLINE TABLE ELECTROSTATICS */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+             H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+             VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+             velec            = _fjsp_mul_v2r8(qq00,VV);
+             FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+             felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,FF),_fjsp_mul_v2r8(vftabscale,rinv00)));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             
+             fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r01              = _fjsp_mul_v2r8(rsq01,rinv01);
+             /* Calculate table index by multiplying r with table scale and truncate to integer */
+             rt               = _fjsp_mul_v2r8(r01,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 4;
+             vfconv.i[1]     *= 4;
+             /* CUBIC SPLINE TABLE ELECTROSTATICS */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+             H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+             VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+             velec            = _fjsp_mul_v2r8(qq01,VV);
+             FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+             felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq01,FF),_fjsp_mul_v2r8(vftabscale,rinv01)));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx01,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy01,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz01,fscal,fiz0);
+             
+             fjx1             = _fjsp_madd_v2r8(dx01,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy01,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz01,fscal,fjz1);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r02              = _fjsp_mul_v2r8(rsq02,rinv02);
+             /* Calculate table index by multiplying r with table scale and truncate to integer */
+             rt               = _fjsp_mul_v2r8(r02,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 4;
+             vfconv.i[1]     *= 4;
+             /* CUBIC SPLINE TABLE ELECTROSTATICS */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+             H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+             VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+             velec            = _fjsp_mul_v2r8(qq02,VV);
+             FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+             felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq02,FF),_fjsp_mul_v2r8(vftabscale,rinv02)));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx02,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy02,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz02,fscal,fiz0);
+             
+             fjx2             = _fjsp_madd_v2r8(dx02,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy02,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz02,fscal,fjz2);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+             /* Calculate table index by multiplying r with table scale and truncate to integer */
+             rt               = _fjsp_mul_v2r8(r10,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 4;
+             vfconv.i[1]     *= 4;
+             /* CUBIC SPLINE TABLE ELECTROSTATICS */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+             H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+             VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+             velec            = _fjsp_mul_v2r8(qq10,VV);
+             FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+             felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,FF),_fjsp_mul_v2r8(vftabscale,rinv10)));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+             
+             fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r11              = _fjsp_mul_v2r8(rsq11,rinv11);
+             /* Calculate table index by multiplying r with table scale and truncate to integer */
+             rt               = _fjsp_mul_v2r8(r11,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 4;
+             vfconv.i[1]     *= 4;
+             /* CUBIC SPLINE TABLE ELECTROSTATICS */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+             H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+             VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+             velec            = _fjsp_mul_v2r8(qq11,VV);
+             FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+             felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq11,FF),_fjsp_mul_v2r8(vftabscale,rinv11)));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+             
+             fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r12              = _fjsp_mul_v2r8(rsq12,rinv12);
+             /* Calculate table index by multiplying r with table scale and truncate to integer */
+             rt               = _fjsp_mul_v2r8(r12,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 4;
+             vfconv.i[1]     *= 4;
+             /* CUBIC SPLINE TABLE ELECTROSTATICS */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+             H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+             VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+             velec            = _fjsp_mul_v2r8(qq12,VV);
+             FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+             felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq12,FF),_fjsp_mul_v2r8(vftabscale,rinv12)));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+             
+             fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+             /* Calculate table index by multiplying r with table scale and truncate to integer */
+             rt               = _fjsp_mul_v2r8(r20,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 4;
+             vfconv.i[1]     *= 4;
+             /* CUBIC SPLINE TABLE ELECTROSTATICS */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+             H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+             VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+             velec            = _fjsp_mul_v2r8(qq20,VV);
+             FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+             felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,FF),_fjsp_mul_v2r8(vftabscale,rinv20)));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+             
+             fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r21              = _fjsp_mul_v2r8(rsq21,rinv21);
+             /* Calculate table index by multiplying r with table scale and truncate to integer */
+             rt               = _fjsp_mul_v2r8(r21,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 4;
+             vfconv.i[1]     *= 4;
+             /* CUBIC SPLINE TABLE ELECTROSTATICS */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+             H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+             VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+             velec            = _fjsp_mul_v2r8(qq21,VV);
+             FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+             felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq21,FF),_fjsp_mul_v2r8(vftabscale,rinv21)));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+             
+             fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r22              = _fjsp_mul_v2r8(rsq22,rinv22);
+             /* Calculate table index by multiplying r with table scale and truncate to integer */
+             rt               = _fjsp_mul_v2r8(r22,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 4;
+             vfconv.i[1]     *= 4;
+             /* CUBIC SPLINE TABLE ELECTROSTATICS */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+             H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+             VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+             velec            = _fjsp_mul_v2r8(qq22,VV);
+             FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+             felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq22,FF),_fjsp_mul_v2r8(vftabscale,rinv22)));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+             
+             fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+             gmx_fjsp_decrement_3rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+             /* Inner loop uses 414 flops */
+         }
+         if(jidx<j_index_end)
+         {
+             jnrA             = jjnr[jidx];
+             j_coord_offsetA  = DIM*jnrA;
+             /* load j atom coordinates */
+             gmx_fjsp_load_3rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+             /* Calculate displacement vector */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             dx01             = _fjsp_sub_v2r8(ix0,jx1);
+             dy01             = _fjsp_sub_v2r8(iy0,jy1);
+             dz01             = _fjsp_sub_v2r8(iz0,jz1);
+             dx02             = _fjsp_sub_v2r8(ix0,jx2);
+             dy02             = _fjsp_sub_v2r8(iy0,jy2);
+             dz02             = _fjsp_sub_v2r8(iz0,jz2);
+             dx10             = _fjsp_sub_v2r8(ix1,jx0);
+             dy10             = _fjsp_sub_v2r8(iy1,jy0);
+             dz10             = _fjsp_sub_v2r8(iz1,jz0);
+             dx11             = _fjsp_sub_v2r8(ix1,jx1);
+             dy11             = _fjsp_sub_v2r8(iy1,jy1);
+             dz11             = _fjsp_sub_v2r8(iz1,jz1);
+             dx12             = _fjsp_sub_v2r8(ix1,jx2);
+             dy12             = _fjsp_sub_v2r8(iy1,jy2);
+             dz12             = _fjsp_sub_v2r8(iz1,jz2);
+             dx20             = _fjsp_sub_v2r8(ix2,jx0);
+             dy20             = _fjsp_sub_v2r8(iy2,jy0);
+             dz20             = _fjsp_sub_v2r8(iz2,jz0);
+             dx21             = _fjsp_sub_v2r8(ix2,jx1);
+             dy21             = _fjsp_sub_v2r8(iy2,jy1);
+             dz21             = _fjsp_sub_v2r8(iz2,jz1);
+             dx22             = _fjsp_sub_v2r8(ix2,jx2);
+             dy22             = _fjsp_sub_v2r8(iy2,jy2);
+             dz22             = _fjsp_sub_v2r8(iz2,jz2);
+             /* Calculate squared distance and things based on it */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rsq01            = gmx_fjsp_calc_rsq_v2r8(dx01,dy01,dz01);
+             rsq02            = gmx_fjsp_calc_rsq_v2r8(dx02,dy02,dz02);
+             rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+             rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+             rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+             rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+             rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+             rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+             rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+             rinv01           = gmx_fjsp_invsqrt_v2r8(rsq01);
+             rinv02           = gmx_fjsp_invsqrt_v2r8(rsq02);
+             rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+             rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+             rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+             rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+             rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+             rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+             fjx0             = _fjsp_setzero_v2r8();
+             fjy0             = _fjsp_setzero_v2r8();
+             fjz0             = _fjsp_setzero_v2r8();
+             fjx1             = _fjsp_setzero_v2r8();
+             fjy1             = _fjsp_setzero_v2r8();
+             fjz1             = _fjsp_setzero_v2r8();
+             fjx2             = _fjsp_setzero_v2r8();
+             fjy2             = _fjsp_setzero_v2r8();
+             fjz2             = _fjsp_setzero_v2r8();
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+             /* Calculate table index by multiplying r with table scale and truncate to integer */
+             rt               = _fjsp_mul_v2r8(r00,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 4;
+             vfconv.i[1]     *= 4;
+             /* CUBIC SPLINE TABLE ELECTROSTATICS */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+             H                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+             VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+             velec            = _fjsp_mul_v2r8(qq00,VV);
+             FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+             felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,FF),_fjsp_mul_v2r8(vftabscale,rinv00)));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             
+             fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r01              = _fjsp_mul_v2r8(rsq01,rinv01);
+             /* Calculate table index by multiplying r with table scale and truncate to integer */
+             rt               = _fjsp_mul_v2r8(r01,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 4;
+             vfconv.i[1]     *= 4;
+             /* CUBIC SPLINE TABLE ELECTROSTATICS */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+             H                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+             VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+             velec            = _fjsp_mul_v2r8(qq01,VV);
+             FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+             felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq01,FF),_fjsp_mul_v2r8(vftabscale,rinv01)));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx01,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy01,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz01,fscal,fiz0);
+             
+             fjx1             = _fjsp_madd_v2r8(dx01,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy01,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz01,fscal,fjz1);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r02              = _fjsp_mul_v2r8(rsq02,rinv02);
+             /* Calculate table index by multiplying r with table scale and truncate to integer */
+             rt               = _fjsp_mul_v2r8(r02,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 4;
+             vfconv.i[1]     *= 4;
+             /* CUBIC SPLINE TABLE ELECTROSTATICS */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+             H                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+             VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+             velec            = _fjsp_mul_v2r8(qq02,VV);
+             FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+             felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq02,FF),_fjsp_mul_v2r8(vftabscale,rinv02)));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx02,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy02,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz02,fscal,fiz0);
+             
+             fjx2             = _fjsp_madd_v2r8(dx02,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy02,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz02,fscal,fjz2);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+             /* Calculate table index by multiplying r with table scale and truncate to integer */
+             rt               = _fjsp_mul_v2r8(r10,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 4;
+             vfconv.i[1]     *= 4;
+             /* CUBIC SPLINE TABLE ELECTROSTATICS */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+             H                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+             VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+             velec            = _fjsp_mul_v2r8(qq10,VV);
+             FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+             felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,FF),_fjsp_mul_v2r8(vftabscale,rinv10)));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+             
+             fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r11              = _fjsp_mul_v2r8(rsq11,rinv11);
+             /* Calculate table index by multiplying r with table scale and truncate to integer */
+             rt               = _fjsp_mul_v2r8(r11,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 4;
+             vfconv.i[1]     *= 4;
+             /* CUBIC SPLINE TABLE ELECTROSTATICS */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+             H                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+             VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+             velec            = _fjsp_mul_v2r8(qq11,VV);
+             FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+             felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq11,FF),_fjsp_mul_v2r8(vftabscale,rinv11)));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+             
+             fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r12              = _fjsp_mul_v2r8(rsq12,rinv12);
+             /* Calculate table index by multiplying r with table scale and truncate to integer */
+             rt               = _fjsp_mul_v2r8(r12,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 4;
+             vfconv.i[1]     *= 4;
+             /* CUBIC SPLINE TABLE ELECTROSTATICS */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+             H                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+             VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+             velec            = _fjsp_mul_v2r8(qq12,VV);
+             FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+             felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq12,FF),_fjsp_mul_v2r8(vftabscale,rinv12)));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+             
+             fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+             /* Calculate table index by multiplying r with table scale and truncate to integer */
+             rt               = _fjsp_mul_v2r8(r20,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 4;
+             vfconv.i[1]     *= 4;
+             /* CUBIC SPLINE TABLE ELECTROSTATICS */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+             H                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+             VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+             velec            = _fjsp_mul_v2r8(qq20,VV);
+             FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+             felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,FF),_fjsp_mul_v2r8(vftabscale,rinv20)));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+             
+             fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r21              = _fjsp_mul_v2r8(rsq21,rinv21);
+             /* Calculate table index by multiplying r with table scale and truncate to integer */
+             rt               = _fjsp_mul_v2r8(r21,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 4;
+             vfconv.i[1]     *= 4;
+             /* CUBIC SPLINE TABLE ELECTROSTATICS */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+             H                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+             VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+             velec            = _fjsp_mul_v2r8(qq21,VV);
+             FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+             felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq21,FF),_fjsp_mul_v2r8(vftabscale,rinv21)));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+             
+             fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r22              = _fjsp_mul_v2r8(rsq22,rinv22);
+             /* Calculate table index by multiplying r with table scale and truncate to integer */
+             rt               = _fjsp_mul_v2r8(r22,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 4;
+             vfconv.i[1]     *= 4;
+             /* CUBIC SPLINE TABLE ELECTROSTATICS */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+             H                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+             VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+             velec            = _fjsp_mul_v2r8(qq22,VV);
+             FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+             felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq22,FF),_fjsp_mul_v2r8(vftabscale,rinv22)));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+             
+             fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+             gmx_fjsp_decrement_3rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+             /* Inner loop uses 414 flops */
+         }
+         /* End of innermost loop */
+         gmx_fjsp_update_iforce_3atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                               f+i_coord_offset,fshift+i_shift_offset);
+         ggid                        = gid[iidx];
+         /* Update potential energies */
+         gmx_fjsp_update_1pot_v2r8(velecsum,kernel_data->energygrp_elec+ggid);
+         /* Increment number of inner iterations */
+         inneriter                  += j_index_end - j_index_start;
+         /* Outer loop uses 19 flops */
+     }
+     /* Increment number of outer iterations */
+     outeriter        += nri;
+     /* Update outer/inner flops */
+     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3W3_VF,outeriter*19 + inneriter*414);
+ }
+ /*
+  * Gromacs nonbonded kernel:   nb_kernel_ElecCSTab_VdwNone_GeomW3W3_F_sparc64_hpc_ace_double
+  * Electrostatics interaction: CubicSplineTable
+  * VdW interaction:            None
+  * Geometry:                   Water3-Water3
+  * Calculate force/pot:        Force
+  */
+ void
+ nb_kernel_ElecCSTab_VdwNone_GeomW3W3_F_sparc64_hpc_ace_double
+                     (t_nblist * gmx_restrict                nlist,
+                      rvec * gmx_restrict                    xx,
+                      rvec * gmx_restrict                    ff,
+                      t_forcerec * gmx_restrict              fr,
+                      t_mdatoms * gmx_restrict               mdatoms,
+                      nb_kernel_data_t * gmx_restrict        kernel_data,
+                      t_nrnb * gmx_restrict                  nrnb)
+ {
+     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+      * just 0 for non-waters.
+      * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+      * jnr indices corresponding to data put in the two positions in the SIMD register.
+      */
+     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+     int              jnrA,jnrB;
+     int              j_coord_offsetA,j_coord_offsetB;
+     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+     real             rcutoff_scalar;
+     real             *shiftvec,*fshift,*x,*f;
+     _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+     int              vdwioffset0;
+     _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+     int              vdwioffset1;
+     _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+     int              vdwioffset2;
+     _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+     int              vdwjidx0A,vdwjidx0B;
+     _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+     int              vdwjidx1A,vdwjidx1B;
+     _fjsp_v2r8       jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+     int              vdwjidx2A,vdwjidx2B;
+     _fjsp_v2r8       jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+     _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+     _fjsp_v2r8       dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
+     _fjsp_v2r8       dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
+     _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+     _fjsp_v2r8       dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+     _fjsp_v2r8       dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+     _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+     _fjsp_v2r8       dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+     _fjsp_v2r8       dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+     _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+     real             *charge;
+     _fjsp_v2r8       rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF,twovfeps;
+     real             *vftab;
+     _fjsp_v2r8       itab_tmp;
+     _fjsp_v2r8       dummy_mask,cutoff_mask;
+     _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+     _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+     union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+     x                = xx[0];
+     f                = ff[0];
+     nri              = nlist->nri;
+     iinr             = nlist->iinr;
+     jindex           = nlist->jindex;
+     jjnr             = nlist->jjnr;
+     shiftidx         = nlist->shift;
+     gid              = nlist->gid;
+     shiftvec         = fr->shift_vec[0];
+     fshift           = fr->fshift[0];
+     facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+     charge           = mdatoms->chargeA;
+     vftab            = kernel_data->table_elec->data;
+     vftabscale       = gmx_fjsp_set1_v2r8(kernel_data->table_elec->scale);
+     /* Setup water-specific parameters */
+     inr              = nlist->iinr[0];
+     iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+0]));
+     iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+     iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+     jq0              = gmx_fjsp_set1_v2r8(charge[inr+0]);
+     jq1              = gmx_fjsp_set1_v2r8(charge[inr+1]);
+     jq2              = gmx_fjsp_set1_v2r8(charge[inr+2]);
+     qq00             = _fjsp_mul_v2r8(iq0,jq0);
+     qq01             = _fjsp_mul_v2r8(iq0,jq1);
+     qq02             = _fjsp_mul_v2r8(iq0,jq2);
+     qq10             = _fjsp_mul_v2r8(iq1,jq0);
+     qq11             = _fjsp_mul_v2r8(iq1,jq1);
+     qq12             = _fjsp_mul_v2r8(iq1,jq2);
+     qq20             = _fjsp_mul_v2r8(iq2,jq0);
+     qq21             = _fjsp_mul_v2r8(iq2,jq1);
+     qq22             = _fjsp_mul_v2r8(iq2,jq2);
+     /* Avoid stupid compiler warnings */
+     jnrA = jnrB = 0;
+     j_coord_offsetA = 0;
+     j_coord_offsetB = 0;
+     outeriter        = 0;
+     inneriter        = 0;
+     /* Start outer loop over neighborlists */
+     for(iidx=0; iidx<nri; iidx++)
+     {
+         /* Load shift vector for this list */
+         i_shift_offset   = DIM*shiftidx[iidx];
+         /* Load limits for loop over neighbors */
+         j_index_start    = jindex[iidx];
+         j_index_end      = jindex[iidx+1];
+         /* Get outer coordinate index */
+         inr              = iinr[iidx];
+         i_coord_offset   = DIM*inr;
+         /* Load i particle coords and add shift vector */
+         gmx_fjsp_load_shift_and_3rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                  &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+         fix0             = _fjsp_setzero_v2r8();
+         fiy0             = _fjsp_setzero_v2r8();
+         fiz0             = _fjsp_setzero_v2r8();
+         fix1             = _fjsp_setzero_v2r8();
+         fiy1             = _fjsp_setzero_v2r8();
+         fiz1             = _fjsp_setzero_v2r8();
+         fix2             = _fjsp_setzero_v2r8();
+         fiy2             = _fjsp_setzero_v2r8();
+         fiz2             = _fjsp_setzero_v2r8();
+         /* Start inner kernel loop */
+         for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+         {
+             /* Get j neighbor index, and coordinate index */
+             jnrA             = jjnr[jidx];
+             jnrB             = jjnr[jidx+1];
+             j_coord_offsetA  = DIM*jnrA;
+             j_coord_offsetB  = DIM*jnrB;
+             /* load j atom coordinates */
+             gmx_fjsp_load_3rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+             /* Calculate displacement vector */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             dx01             = _fjsp_sub_v2r8(ix0,jx1);
+             dy01             = _fjsp_sub_v2r8(iy0,jy1);
+             dz01             = _fjsp_sub_v2r8(iz0,jz1);
+             dx02             = _fjsp_sub_v2r8(ix0,jx2);
+             dy02             = _fjsp_sub_v2r8(iy0,jy2);
+             dz02             = _fjsp_sub_v2r8(iz0,jz2);
+             dx10             = _fjsp_sub_v2r8(ix1,jx0);
+             dy10             = _fjsp_sub_v2r8(iy1,jy0);
+             dz10             = _fjsp_sub_v2r8(iz1,jz0);
+             dx11             = _fjsp_sub_v2r8(ix1,jx1);
+             dy11             = _fjsp_sub_v2r8(iy1,jy1);
+             dz11             = _fjsp_sub_v2r8(iz1,jz1);
+             dx12             = _fjsp_sub_v2r8(ix1,jx2);
+             dy12             = _fjsp_sub_v2r8(iy1,jy2);
+             dz12             = _fjsp_sub_v2r8(iz1,jz2);
+             dx20             = _fjsp_sub_v2r8(ix2,jx0);
+             dy20             = _fjsp_sub_v2r8(iy2,jy0);
+             dz20             = _fjsp_sub_v2r8(iz2,jz0);
+             dx21             = _fjsp_sub_v2r8(ix2,jx1);
+             dy21             = _fjsp_sub_v2r8(iy2,jy1);
+             dz21             = _fjsp_sub_v2r8(iz2,jz1);
+             dx22             = _fjsp_sub_v2r8(ix2,jx2);
+             dy22             = _fjsp_sub_v2r8(iy2,jy2);
+             dz22             = _fjsp_sub_v2r8(iz2,jz2);
+             /* Calculate squared distance and things based on it */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rsq01            = gmx_fjsp_calc_rsq_v2r8(dx01,dy01,dz01);
+             rsq02            = gmx_fjsp_calc_rsq_v2r8(dx02,dy02,dz02);
+             rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+             rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+             rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+             rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+             rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+             rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+             rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+             rinv01           = gmx_fjsp_invsqrt_v2r8(rsq01);
+             rinv02           = gmx_fjsp_invsqrt_v2r8(rsq02);
+             rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+             rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+             rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+             rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+             rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+             rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+             fjx0             = _fjsp_setzero_v2r8();
+             fjy0             = _fjsp_setzero_v2r8();
+             fjz0             = _fjsp_setzero_v2r8();
+             fjx1             = _fjsp_setzero_v2r8();
+             fjy1             = _fjsp_setzero_v2r8();
+             fjz1             = _fjsp_setzero_v2r8();
+             fjx2             = _fjsp_setzero_v2r8();
+             fjy2             = _fjsp_setzero_v2r8();
+             fjz2             = _fjsp_setzero_v2r8();
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+             /* Calculate table index by multiplying r with table scale and truncate to integer */
+             rt               = _fjsp_mul_v2r8(r00,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 4;
+             vfconv.i[1]     *= 4;
+             /* CUBIC SPLINE TABLE ELECTROSTATICS */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+             H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+             FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+             felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,FF),_fjsp_mul_v2r8(vftabscale,rinv00)));
+             fscal            = felec;
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             
+             fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r01              = _fjsp_mul_v2r8(rsq01,rinv01);
+             /* Calculate table index by multiplying r with table scale and truncate to integer */
+             rt               = _fjsp_mul_v2r8(r01,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 4;
+             vfconv.i[1]     *= 4;
+             /* CUBIC SPLINE TABLE ELECTROSTATICS */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+             H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+             FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+             felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq01,FF),_fjsp_mul_v2r8(vftabscale,rinv01)));
+             fscal            = felec;
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx01,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy01,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz01,fscal,fiz0);
+             
+             fjx1             = _fjsp_madd_v2r8(dx01,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy01,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz01,fscal,fjz1);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r02              = _fjsp_mul_v2r8(rsq02,rinv02);
+             /* Calculate table index by multiplying r with table scale and truncate to integer */
+             rt               = _fjsp_mul_v2r8(r02,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 4;
+             vfconv.i[1]     *= 4;
+             /* CUBIC SPLINE TABLE ELECTROSTATICS */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+             H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+             FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+             felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq02,FF),_fjsp_mul_v2r8(vftabscale,rinv02)));
+             fscal            = felec;
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx02,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy02,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz02,fscal,fiz0);
+             
+             fjx2             = _fjsp_madd_v2r8(dx02,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy02,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz02,fscal,fjz2);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+             /* Calculate table index by multiplying r with table scale and truncate to integer */
+             rt               = _fjsp_mul_v2r8(r10,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 4;
+             vfconv.i[1]     *= 4;
+             /* CUBIC SPLINE TABLE ELECTROSTATICS */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+             H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+             FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+             felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,FF),_fjsp_mul_v2r8(vftabscale,rinv10)));
+             fscal            = felec;
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+             
+             fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r11              = _fjsp_mul_v2r8(rsq11,rinv11);
+             /* Calculate table index by multiplying r with table scale and truncate to integer */
+             rt               = _fjsp_mul_v2r8(r11,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 4;
+             vfconv.i[1]     *= 4;
+             /* CUBIC SPLINE TABLE ELECTROSTATICS */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+             H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+             FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+             felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq11,FF),_fjsp_mul_v2r8(vftabscale,rinv11)));
+             fscal            = felec;
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+             
+             fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r12              = _fjsp_mul_v2r8(rsq12,rinv12);
+             /* Calculate table index by multiplying r with table scale and truncate to integer */
+             rt               = _fjsp_mul_v2r8(r12,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 4;
+             vfconv.i[1]     *= 4;
+             /* CUBIC SPLINE TABLE ELECTROSTATICS */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+             H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+             FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+             felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq12,FF),_fjsp_mul_v2r8(vftabscale,rinv12)));
+             fscal            = felec;
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+             
+             fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+             /* Calculate table index by multiplying r with table scale and truncate to integer */
+             rt               = _fjsp_mul_v2r8(r20,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 4;
+             vfconv.i[1]     *= 4;
+             /* CUBIC SPLINE TABLE ELECTROSTATICS */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+             H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+             FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+             felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,FF),_fjsp_mul_v2r8(vftabscale,rinv20)));
+             fscal            = felec;
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+             
+             fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r21              = _fjsp_mul_v2r8(rsq21,rinv21);
+             /* Calculate table index by multiplying r with table scale and truncate to integer */
+             rt               = _fjsp_mul_v2r8(r21,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 4;
+             vfconv.i[1]     *= 4;
+             /* CUBIC SPLINE TABLE ELECTROSTATICS */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+             H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+             FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+             felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq21,FF),_fjsp_mul_v2r8(vftabscale,rinv21)));
+             fscal            = felec;
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+             
+             fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r22              = _fjsp_mul_v2r8(rsq22,rinv22);
+             /* Calculate table index by multiplying r with table scale and truncate to integer */
+             rt               = _fjsp_mul_v2r8(r22,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 4;
+             vfconv.i[1]     *= 4;
+             /* CUBIC SPLINE TABLE ELECTROSTATICS */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+             H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+             FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+             felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq22,FF),_fjsp_mul_v2r8(vftabscale,rinv22)));
+             fscal            = felec;
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+             
+             fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+             gmx_fjsp_decrement_3rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+             /* Inner loop uses 378 flops */
+         }
+         if(jidx<j_index_end)
+         {
+             jnrA             = jjnr[jidx];
+             j_coord_offsetA  = DIM*jnrA;
+             /* load j atom coordinates */
+             gmx_fjsp_load_3rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+             /* Calculate displacement vector */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             dx01             = _fjsp_sub_v2r8(ix0,jx1);
+             dy01             = _fjsp_sub_v2r8(iy0,jy1);
+             dz01             = _fjsp_sub_v2r8(iz0,jz1);
+             dx02             = _fjsp_sub_v2r8(ix0,jx2);
+             dy02             = _fjsp_sub_v2r8(iy0,jy2);
+             dz02             = _fjsp_sub_v2r8(iz0,jz2);
+             dx10             = _fjsp_sub_v2r8(ix1,jx0);
+             dy10             = _fjsp_sub_v2r8(iy1,jy0);
+             dz10             = _fjsp_sub_v2r8(iz1,jz0);
+             dx11             = _fjsp_sub_v2r8(ix1,jx1);
+             dy11             = _fjsp_sub_v2r8(iy1,jy1);
+             dz11             = _fjsp_sub_v2r8(iz1,jz1);
+             dx12             = _fjsp_sub_v2r8(ix1,jx2);
+             dy12             = _fjsp_sub_v2r8(iy1,jy2);
+             dz12             = _fjsp_sub_v2r8(iz1,jz2);
+             dx20             = _fjsp_sub_v2r8(ix2,jx0);
+             dy20             = _fjsp_sub_v2r8(iy2,jy0);
+             dz20             = _fjsp_sub_v2r8(iz2,jz0);
+             dx21             = _fjsp_sub_v2r8(ix2,jx1);
+             dy21             = _fjsp_sub_v2r8(iy2,jy1);
+             dz21             = _fjsp_sub_v2r8(iz2,jz1);
+             dx22             = _fjsp_sub_v2r8(ix2,jx2);
+             dy22             = _fjsp_sub_v2r8(iy2,jy2);
+             dz22             = _fjsp_sub_v2r8(iz2,jz2);
+             /* Calculate squared distance and things based on it */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rsq01            = gmx_fjsp_calc_rsq_v2r8(dx01,dy01,dz01);
+             rsq02            = gmx_fjsp_calc_rsq_v2r8(dx02,dy02,dz02);
+             rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+             rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+             rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+             rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+             rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+             rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+             rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+             rinv01           = gmx_fjsp_invsqrt_v2r8(rsq01);
+             rinv02           = gmx_fjsp_invsqrt_v2r8(rsq02);
+             rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+             rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+             rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+             rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+             rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+             rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+             fjx0             = _fjsp_setzero_v2r8();
+             fjy0             = _fjsp_setzero_v2r8();
+             fjz0             = _fjsp_setzero_v2r8();
+             fjx1             = _fjsp_setzero_v2r8();
+             fjy1             = _fjsp_setzero_v2r8();
+             fjz1             = _fjsp_setzero_v2r8();
+             fjx2             = _fjsp_setzero_v2r8();
+             fjy2             = _fjsp_setzero_v2r8();
+             fjz2             = _fjsp_setzero_v2r8();
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+             /* Calculate table index by multiplying r with table scale and truncate to integer */
+             rt               = _fjsp_mul_v2r8(r00,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 4;
+             vfconv.i[1]     *= 4;
+             /* CUBIC SPLINE TABLE ELECTROSTATICS */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+             H                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+             FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+             felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,FF),_fjsp_mul_v2r8(vftabscale,rinv00)));
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             
+             fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r01              = _fjsp_mul_v2r8(rsq01,rinv01);
+             /* Calculate table index by multiplying r with table scale and truncate to integer */
+             rt               = _fjsp_mul_v2r8(r01,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 4;
+             vfconv.i[1]     *= 4;
+             /* CUBIC SPLINE TABLE ELECTROSTATICS */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+             H                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+             FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+             felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq01,FF),_fjsp_mul_v2r8(vftabscale,rinv01)));
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx01,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy01,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz01,fscal,fiz0);
+             
+             fjx1             = _fjsp_madd_v2r8(dx01,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy01,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz01,fscal,fjz1);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r02              = _fjsp_mul_v2r8(rsq02,rinv02);
+             /* Calculate table index by multiplying r with table scale and truncate to integer */
+             rt               = _fjsp_mul_v2r8(r02,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 4;
+             vfconv.i[1]     *= 4;
+             /* CUBIC SPLINE TABLE ELECTROSTATICS */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+             H                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+             FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+             felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq02,FF),_fjsp_mul_v2r8(vftabscale,rinv02)));
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx02,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy02,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz02,fscal,fiz0);
+             
+             fjx2             = _fjsp_madd_v2r8(dx02,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy02,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz02,fscal,fjz2);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+             /* Calculate table index by multiplying r with table scale and truncate to integer */
+             rt               = _fjsp_mul_v2r8(r10,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 4;
+             vfconv.i[1]     *= 4;
+             /* CUBIC SPLINE TABLE ELECTROSTATICS */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+             H                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+             FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+             felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,FF),_fjsp_mul_v2r8(vftabscale,rinv10)));
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+             
+             fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r11              = _fjsp_mul_v2r8(rsq11,rinv11);
+             /* Calculate table index by multiplying r with table scale and truncate to integer */
+             rt               = _fjsp_mul_v2r8(r11,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 4;
+             vfconv.i[1]     *= 4;
+             /* CUBIC SPLINE TABLE ELECTROSTATICS */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+             H                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+             FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+             felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq11,FF),_fjsp_mul_v2r8(vftabscale,rinv11)));
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+             
+             fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r12              = _fjsp_mul_v2r8(rsq12,rinv12);
+             /* Calculate table index by multiplying r with table scale and truncate to integer */
+             rt               = _fjsp_mul_v2r8(r12,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 4;
+             vfconv.i[1]     *= 4;
+             /* CUBIC SPLINE TABLE ELECTROSTATICS */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+             H                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+             FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+             felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq12,FF),_fjsp_mul_v2r8(vftabscale,rinv12)));
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+             
+             fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+             /* Calculate table index by multiplying r with table scale and truncate to integer */
+             rt               = _fjsp_mul_v2r8(r20,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 4;
+             vfconv.i[1]     *= 4;
+             /* CUBIC SPLINE TABLE ELECTROSTATICS */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+             H                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+             FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+             felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,FF),_fjsp_mul_v2r8(vftabscale,rinv20)));
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+             
+             fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r21              = _fjsp_mul_v2r8(rsq21,rinv21);
+             /* Calculate table index by multiplying r with table scale and truncate to integer */
+             rt               = _fjsp_mul_v2r8(r21,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 4;
+             vfconv.i[1]     *= 4;
+             /* CUBIC SPLINE TABLE ELECTROSTATICS */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+             H                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+             FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+             felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq21,FF),_fjsp_mul_v2r8(vftabscale,rinv21)));
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+             
+             fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r22              = _fjsp_mul_v2r8(rsq22,rinv22);
+             /* Calculate table index by multiplying r with table scale and truncate to integer */
+             rt               = _fjsp_mul_v2r8(r22,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 4;
+             vfconv.i[1]     *= 4;
+             /* CUBIC SPLINE TABLE ELECTROSTATICS */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+             H                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+             FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+             felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq22,FF),_fjsp_mul_v2r8(vftabscale,rinv22)));
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+             
+             fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+             gmx_fjsp_decrement_3rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+             /* Inner loop uses 378 flops */
+         }
+         /* End of innermost loop */
+         gmx_fjsp_update_iforce_3atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                               f+i_coord_offset,fshift+i_shift_offset);
+         /* Increment number of inner iterations */
+         inneriter                  += j_index_end - j_index_start;
+         /* Outer loop uses 18 flops */
+     }
+     /* Increment number of outer iterations */
+     outeriter        += nri;
+     /* Update outer/inner flops */
+     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3W3_F,outeriter*18 + inneriter*378);
+ }
index 0000000000000000000000000000000000000000,92809fa78ebd4345105876d7452ae4fa38d498ab..92809fa78ebd4345105876d7452ae4fa38d498ab
mode 000000,100644..100644
--- /dev/null
@@@ -1,0 -1,1026 +1,1026 @@@
+ /*
+  * This file is part of the GROMACS molecular simulation package.
+  *
+  * Copyright (c) 2012, by the GROMACS development team, led by
+  * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+  * others, as listed in the AUTHORS file in the top-level source
+  * directory and at http://www.gromacs.org.
+  *
+  * GROMACS is free software; you can redistribute it and/or
+  * modify it under the terms of the GNU Lesser General Public License
+  * as published by the Free Software Foundation; either version 2.1
+  * of the License, or (at your option) any later version.
+  *
+  * GROMACS is distributed in the hope that it will be useful,
+  * but WITHOUT ANY WARRANTY; without even the implied warranty of
+  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  * Lesser General Public License for more details.
+  *
+  * You should have received a copy of the GNU Lesser General Public
+  * License along with GROMACS; if not, see
+  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+  *
+  * If you want to redistribute modifications to GROMACS, please
+  * consider that scientific software is very special. Version
+  * control is crucial - bugs must be traceable. We will be happy to
+  * consider code for inclusion in the official distribution, but
+  * derived work must not be called official GROMACS. Details are found
+  * in the README & COPYING files - if they are missing, get the
+  * official version at http://www.gromacs.org.
+  *
+  * To help us fund GROMACS development, we humbly ask that you cite
+  * the research papers on the package. Check out http://www.gromacs.org.
+  */
+ /*
+  * Note: this file was generated by the GROMACS sparc64_hpc_ace_double kernel generator.
+  */
+ #ifdef HAVE_CONFIG_H
+ #include <config.h>
+ #endif
+ #include <math.h>
+ #include "../nb_kernel.h"
+ #include "types/simple.h"
+ #include "vec.h"
+ #include "nrnb.h"
+ #include "kernelutil_sparc64_hpc_ace_double.h"
+ /*
+  * Gromacs nonbonded kernel:   nb_kernel_ElecCSTab_VdwNone_GeomW4P1_VF_sparc64_hpc_ace_double
+  * Electrostatics interaction: CubicSplineTable
+  * VdW interaction:            None
+  * Geometry:                   Water4-Particle
+  * Calculate force/pot:        PotentialAndForce
+  */
+ void
+ nb_kernel_ElecCSTab_VdwNone_GeomW4P1_VF_sparc64_hpc_ace_double
+                     (t_nblist * gmx_restrict                nlist,
+                      rvec * gmx_restrict                    xx,
+                      rvec * gmx_restrict                    ff,
+                      t_forcerec * gmx_restrict              fr,
+                      t_mdatoms * gmx_restrict               mdatoms,
+                      nb_kernel_data_t * gmx_restrict        kernel_data,
+                      t_nrnb * gmx_restrict                  nrnb)
+ {
+     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+      * just 0 for non-waters.
+      * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+      * jnr indices corresponding to data put in the two positions in the SIMD register.
+      */
+     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+     int              jnrA,jnrB;
+     int              j_coord_offsetA,j_coord_offsetB;
+     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+     real             rcutoff_scalar;
+     real             *shiftvec,*fshift,*x,*f;
+     _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+     int              vdwioffset1;
+     _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+     int              vdwioffset2;
+     _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+     int              vdwioffset3;
+     _fjsp_v2r8       ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+     int              vdwjidx0A,vdwjidx0B;
+     _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+     _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+     _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+     _fjsp_v2r8       dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30;
+     _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+     real             *charge;
+     _fjsp_v2r8       rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF,twovfeps;
+     real             *vftab;
+     _fjsp_v2r8       itab_tmp;
+     _fjsp_v2r8       dummy_mask,cutoff_mask;
+     _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+     _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+     union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+     x                = xx[0];
+     f                = ff[0];
+     nri              = nlist->nri;
+     iinr             = nlist->iinr;
+     jindex           = nlist->jindex;
+     jjnr             = nlist->jjnr;
+     shiftidx         = nlist->shift;
+     gid              = nlist->gid;
+     shiftvec         = fr->shift_vec[0];
+     fshift           = fr->fshift[0];
+     facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+     charge           = mdatoms->chargeA;
+     vftab            = kernel_data->table_elec->data;
+     vftabscale       = gmx_fjsp_set1_v2r8(kernel_data->table_elec->scale);
+     /* Setup water-specific parameters */
+     inr              = nlist->iinr[0];
+     iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+     iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+     iq3              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+3]));
+     /* Avoid stupid compiler warnings */
+     jnrA = jnrB = 0;
+     j_coord_offsetA = 0;
+     j_coord_offsetB = 0;
+     outeriter        = 0;
+     inneriter        = 0;
+     /* Start outer loop over neighborlists */
+     for(iidx=0; iidx<nri; iidx++)
+     {
+         /* Load shift vector for this list */
+         i_shift_offset   = DIM*shiftidx[iidx];
+         /* Load limits for loop over neighbors */
+         j_index_start    = jindex[iidx];
+         j_index_end      = jindex[iidx+1];
+         /* Get outer coordinate index */
+         inr              = iinr[iidx];
+         i_coord_offset   = DIM*inr;
+         /* Load i particle coords and add shift vector */
+         gmx_fjsp_load_shift_and_3rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset+DIM,
+                                                  &ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+         fix1             = _fjsp_setzero_v2r8();
+         fiy1             = _fjsp_setzero_v2r8();
+         fiz1             = _fjsp_setzero_v2r8();
+         fix2             = _fjsp_setzero_v2r8();
+         fiy2             = _fjsp_setzero_v2r8();
+         fiz2             = _fjsp_setzero_v2r8();
+         fix3             = _fjsp_setzero_v2r8();
+         fiy3             = _fjsp_setzero_v2r8();
+         fiz3             = _fjsp_setzero_v2r8();
+         /* Reset potential sums */
+         velecsum         = _fjsp_setzero_v2r8();
+         /* Start inner kernel loop */
+         for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+         {
+             /* Get j neighbor index, and coordinate index */
+             jnrA             = jjnr[jidx];
+             jnrB             = jjnr[jidx+1];
+             j_coord_offsetA  = DIM*jnrA;
+             j_coord_offsetB  = DIM*jnrB;
+             /* load j atom coordinates */
+             gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                               &jx0,&jy0,&jz0);
+             /* Calculate displacement vector */
+             dx10             = _fjsp_sub_v2r8(ix1,jx0);
+             dy10             = _fjsp_sub_v2r8(iy1,jy0);
+             dz10             = _fjsp_sub_v2r8(iz1,jz0);
+             dx20             = _fjsp_sub_v2r8(ix2,jx0);
+             dy20             = _fjsp_sub_v2r8(iy2,jy0);
+             dz20             = _fjsp_sub_v2r8(iz2,jz0);
+             dx30             = _fjsp_sub_v2r8(ix3,jx0);
+             dy30             = _fjsp_sub_v2r8(iy3,jy0);
+             dz30             = _fjsp_sub_v2r8(iz3,jz0);
+             /* Calculate squared distance and things based on it */
+             rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+             rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+             rsq30            = gmx_fjsp_calc_rsq_v2r8(dx30,dy30,dz30);
+             rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+             rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+             rinv30           = gmx_fjsp_invsqrt_v2r8(rsq30);
+             /* Load parameters for j particles */
+             jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+             fjx0             = _fjsp_setzero_v2r8();
+             fjy0             = _fjsp_setzero_v2r8();
+             fjz0             = _fjsp_setzero_v2r8();
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+             /* Compute parameters for interactions between i and j atoms */
+             qq10             = _fjsp_mul_v2r8(iq1,jq0);
+             /* Calculate table index by multiplying r with table scale and truncate to integer */
+             rt               = _fjsp_mul_v2r8(r10,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 4;
+             vfconv.i[1]     *= 4;
+             /* CUBIC SPLINE TABLE ELECTROSTATICS */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+             H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+             VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+             velec            = _fjsp_mul_v2r8(qq10,VV);
+             FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+             felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,FF),_fjsp_mul_v2r8(vftabscale,rinv10)));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+             
+             fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+             /* Compute parameters for interactions between i and j atoms */
+             qq20             = _fjsp_mul_v2r8(iq2,jq0);
+             /* Calculate table index by multiplying r with table scale and truncate to integer */
+             rt               = _fjsp_mul_v2r8(r20,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 4;
+             vfconv.i[1]     *= 4;
+             /* CUBIC SPLINE TABLE ELECTROSTATICS */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+             H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+             VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+             velec            = _fjsp_mul_v2r8(qq20,VV);
+             FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+             felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,FF),_fjsp_mul_v2r8(vftabscale,rinv20)));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+             
+             fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r30              = _fjsp_mul_v2r8(rsq30,rinv30);
+             /* Compute parameters for interactions between i and j atoms */
+             qq30             = _fjsp_mul_v2r8(iq3,jq0);
+             /* Calculate table index by multiplying r with table scale and truncate to integer */
+             rt               = _fjsp_mul_v2r8(r30,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 4;
+             vfconv.i[1]     *= 4;
+             /* CUBIC SPLINE TABLE ELECTROSTATICS */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+             H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+             VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+             velec            = _fjsp_mul_v2r8(qq30,VV);
+             FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+             felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq30,FF),_fjsp_mul_v2r8(vftabscale,rinv30)));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             /* Update vectorial force */
+             fix3             = _fjsp_madd_v2r8(dx30,fscal,fix3);
+             fiy3             = _fjsp_madd_v2r8(dy30,fscal,fiy3);
+             fiz3             = _fjsp_madd_v2r8(dz30,fscal,fiz3);
+             
+             fjx0             = _fjsp_madd_v2r8(dx30,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy30,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz30,fscal,fjz0);
+             gmx_fjsp_decrement_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0);
+             /* Inner loop uses 141 flops */
+         }
+         if(jidx<j_index_end)
+         {
+             jnrA             = jjnr[jidx];
+             j_coord_offsetA  = DIM*jnrA;
+             /* load j atom coordinates */
+             gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                               &jx0,&jy0,&jz0);
+             /* Calculate displacement vector */
+             dx10             = _fjsp_sub_v2r8(ix1,jx0);
+             dy10             = _fjsp_sub_v2r8(iy1,jy0);
+             dz10             = _fjsp_sub_v2r8(iz1,jz0);
+             dx20             = _fjsp_sub_v2r8(ix2,jx0);
+             dy20             = _fjsp_sub_v2r8(iy2,jy0);
+             dz20             = _fjsp_sub_v2r8(iz2,jz0);
+             dx30             = _fjsp_sub_v2r8(ix3,jx0);
+             dy30             = _fjsp_sub_v2r8(iy3,jy0);
+             dz30             = _fjsp_sub_v2r8(iz3,jz0);
+             /* Calculate squared distance and things based on it */
+             rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+             rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+             rsq30            = gmx_fjsp_calc_rsq_v2r8(dx30,dy30,dz30);
+             rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+             rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+             rinv30           = gmx_fjsp_invsqrt_v2r8(rsq30);
+             /* Load parameters for j particles */
+             jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
+             fjx0             = _fjsp_setzero_v2r8();
+             fjy0             = _fjsp_setzero_v2r8();
+             fjz0             = _fjsp_setzero_v2r8();
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+             /* Compute parameters for interactions between i and j atoms */
+             qq10             = _fjsp_mul_v2r8(iq1,jq0);
+             /* Calculate table index by multiplying r with table scale and truncate to integer */
+             rt               = _fjsp_mul_v2r8(r10,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 4;
+             vfconv.i[1]     *= 4;
+             /* CUBIC SPLINE TABLE ELECTROSTATICS */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+             H                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+             VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+             velec            = _fjsp_mul_v2r8(qq10,VV);
+             FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+             felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,FF),_fjsp_mul_v2r8(vftabscale,rinv10)));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+             
+             fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+             /* Compute parameters for interactions between i and j atoms */
+             qq20             = _fjsp_mul_v2r8(iq2,jq0);
+             /* Calculate table index by multiplying r with table scale and truncate to integer */
+             rt               = _fjsp_mul_v2r8(r20,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 4;
+             vfconv.i[1]     *= 4;
+             /* CUBIC SPLINE TABLE ELECTROSTATICS */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+             H                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+             VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+             velec            = _fjsp_mul_v2r8(qq20,VV);
+             FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+             felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,FF),_fjsp_mul_v2r8(vftabscale,rinv20)));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+             
+             fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r30              = _fjsp_mul_v2r8(rsq30,rinv30);
+             /* Compute parameters for interactions between i and j atoms */
+             qq30             = _fjsp_mul_v2r8(iq3,jq0);
+             /* Calculate table index by multiplying r with table scale and truncate to integer */
+             rt               = _fjsp_mul_v2r8(r30,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 4;
+             vfconv.i[1]     *= 4;
+             /* CUBIC SPLINE TABLE ELECTROSTATICS */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+             H                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+             VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+             velec            = _fjsp_mul_v2r8(qq30,VV);
+             FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+             felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq30,FF),_fjsp_mul_v2r8(vftabscale,rinv30)));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix3             = _fjsp_madd_v2r8(dx30,fscal,fix3);
+             fiy3             = _fjsp_madd_v2r8(dy30,fscal,fiy3);
+             fiz3             = _fjsp_madd_v2r8(dz30,fscal,fiz3);
+             
+             fjx0             = _fjsp_madd_v2r8(dx30,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy30,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz30,fscal,fjz0);
+             gmx_fjsp_decrement_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0);
+             /* Inner loop uses 141 flops */
+         }
+         /* End of innermost loop */
+         gmx_fjsp_update_iforce_3atom_swizzle_v2r8(fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                               f+i_coord_offset+DIM,fshift+i_shift_offset);
+         ggid                        = gid[iidx];
+         /* Update potential energies */
+         gmx_fjsp_update_1pot_v2r8(velecsum,kernel_data->energygrp_elec+ggid);
+         /* Increment number of inner iterations */
+         inneriter                  += j_index_end - j_index_start;
+         /* Outer loop uses 19 flops */
+     }
+     /* Increment number of outer iterations */
+     outeriter        += nri;
+     /* Update outer/inner flops */
+     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4_VF,outeriter*19 + inneriter*141);
+ }
+ /*
+  * Gromacs nonbonded kernel:   nb_kernel_ElecCSTab_VdwNone_GeomW4P1_F_sparc64_hpc_ace_double
+  * Electrostatics interaction: CubicSplineTable
+  * VdW interaction:            None
+  * Geometry:                   Water4-Particle
+  * Calculate force/pot:        Force
+  */
+ void
+ nb_kernel_ElecCSTab_VdwNone_GeomW4P1_F_sparc64_hpc_ace_double
+                     (t_nblist * gmx_restrict                nlist,
+                      rvec * gmx_restrict                    xx,
+                      rvec * gmx_restrict                    ff,
+                      t_forcerec * gmx_restrict              fr,
+                      t_mdatoms * gmx_restrict               mdatoms,
+                      nb_kernel_data_t * gmx_restrict        kernel_data,
+                      t_nrnb * gmx_restrict                  nrnb)
+ {
+     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+      * just 0 for non-waters.
+      * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+      * jnr indices corresponding to data put in the two positions in the SIMD register.
+      */
+     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+     int              jnrA,jnrB;
+     int              j_coord_offsetA,j_coord_offsetB;
+     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+     real             rcutoff_scalar;
+     real             *shiftvec,*fshift,*x,*f;
+     _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+     int              vdwioffset1;
+     _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+     int              vdwioffset2;
+     _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+     int              vdwioffset3;
+     _fjsp_v2r8       ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+     int              vdwjidx0A,vdwjidx0B;
+     _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+     _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+     _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+     _fjsp_v2r8       dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30;
+     _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+     real             *charge;
+     _fjsp_v2r8       rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF,twovfeps;
+     real             *vftab;
+     _fjsp_v2r8       itab_tmp;
+     _fjsp_v2r8       dummy_mask,cutoff_mask;
+     _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+     _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+     union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+     x                = xx[0];
+     f                = ff[0];
+     nri              = nlist->nri;
+     iinr             = nlist->iinr;
+     jindex           = nlist->jindex;
+     jjnr             = nlist->jjnr;
+     shiftidx         = nlist->shift;
+     gid              = nlist->gid;
+     shiftvec         = fr->shift_vec[0];
+     fshift           = fr->fshift[0];
+     facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+     charge           = mdatoms->chargeA;
+     vftab            = kernel_data->table_elec->data;
+     vftabscale       = gmx_fjsp_set1_v2r8(kernel_data->table_elec->scale);
+     /* Setup water-specific parameters */
+     inr              = nlist->iinr[0];
+     iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+     iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+     iq3              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+3]));
+     /* Avoid stupid compiler warnings */
+     jnrA = jnrB = 0;
+     j_coord_offsetA = 0;
+     j_coord_offsetB = 0;
+     outeriter        = 0;
+     inneriter        = 0;
+     /* Start outer loop over neighborlists */
+     for(iidx=0; iidx<nri; iidx++)
+     {
+         /* Load shift vector for this list */
+         i_shift_offset   = DIM*shiftidx[iidx];
+         /* Load limits for loop over neighbors */
+         j_index_start    = jindex[iidx];
+         j_index_end      = jindex[iidx+1];
+         /* Get outer coordinate index */
+         inr              = iinr[iidx];
+         i_coord_offset   = DIM*inr;
+         /* Load i particle coords and add shift vector */
+         gmx_fjsp_load_shift_and_3rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset+DIM,
+                                                  &ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+         fix1             = _fjsp_setzero_v2r8();
+         fiy1             = _fjsp_setzero_v2r8();
+         fiz1             = _fjsp_setzero_v2r8();
+         fix2             = _fjsp_setzero_v2r8();
+         fiy2             = _fjsp_setzero_v2r8();
+         fiz2             = _fjsp_setzero_v2r8();
+         fix3             = _fjsp_setzero_v2r8();
+         fiy3             = _fjsp_setzero_v2r8();
+         fiz3             = _fjsp_setzero_v2r8();
+         /* Start inner kernel loop */
+         for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+         {
+             /* Get j neighbor index, and coordinate index */
+             jnrA             = jjnr[jidx];
+             jnrB             = jjnr[jidx+1];
+             j_coord_offsetA  = DIM*jnrA;
+             j_coord_offsetB  = DIM*jnrB;
+             /* load j atom coordinates */
+             gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                               &jx0,&jy0,&jz0);
+             /* Calculate displacement vector */
+             dx10             = _fjsp_sub_v2r8(ix1,jx0);
+             dy10             = _fjsp_sub_v2r8(iy1,jy0);
+             dz10             = _fjsp_sub_v2r8(iz1,jz0);
+             dx20             = _fjsp_sub_v2r8(ix2,jx0);
+             dy20             = _fjsp_sub_v2r8(iy2,jy0);
+             dz20             = _fjsp_sub_v2r8(iz2,jz0);
+             dx30             = _fjsp_sub_v2r8(ix3,jx0);
+             dy30             = _fjsp_sub_v2r8(iy3,jy0);
+             dz30             = _fjsp_sub_v2r8(iz3,jz0);
+             /* Calculate squared distance and things based on it */
+             rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+             rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+             rsq30            = gmx_fjsp_calc_rsq_v2r8(dx30,dy30,dz30);
+             rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+             rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+             rinv30           = gmx_fjsp_invsqrt_v2r8(rsq30);
+             /* Load parameters for j particles */
+             jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+             fjx0             = _fjsp_setzero_v2r8();
+             fjy0             = _fjsp_setzero_v2r8();
+             fjz0             = _fjsp_setzero_v2r8();
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+             /* Compute parameters for interactions between i and j atoms */
+             qq10             = _fjsp_mul_v2r8(iq1,jq0);
+             /* Calculate table index by multiplying r with table scale and truncate to integer */
+             rt               = _fjsp_mul_v2r8(r10,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 4;
+             vfconv.i[1]     *= 4;
+             /* CUBIC SPLINE TABLE ELECTROSTATICS */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+             H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+             FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+             felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,FF),_fjsp_mul_v2r8(vftabscale,rinv10)));
+             fscal            = felec;
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+             
+             fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+             /* Compute parameters for interactions between i and j atoms */
+             qq20             = _fjsp_mul_v2r8(iq2,jq0);
+             /* Calculate table index by multiplying r with table scale and truncate to integer */
+             rt               = _fjsp_mul_v2r8(r20,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 4;
+             vfconv.i[1]     *= 4;
+             /* CUBIC SPLINE TABLE ELECTROSTATICS */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+             H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+             FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+             felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,FF),_fjsp_mul_v2r8(vftabscale,rinv20)));
+             fscal            = felec;
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+             
+             fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r30              = _fjsp_mul_v2r8(rsq30,rinv30);
+             /* Compute parameters for interactions between i and j atoms */
+             qq30             = _fjsp_mul_v2r8(iq3,jq0);
+             /* Calculate table index by multiplying r with table scale and truncate to integer */
+             rt               = _fjsp_mul_v2r8(r30,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 4;
+             vfconv.i[1]     *= 4;
+             /* CUBIC SPLINE TABLE ELECTROSTATICS */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+             H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+             FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+             felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq30,FF),_fjsp_mul_v2r8(vftabscale,rinv30)));
+             fscal            = felec;
+             /* Update vectorial force */
+             fix3             = _fjsp_madd_v2r8(dx30,fscal,fix3);
+             fiy3             = _fjsp_madd_v2r8(dy30,fscal,fiy3);
+             fiz3             = _fjsp_madd_v2r8(dz30,fscal,fiz3);
+             
+             fjx0             = _fjsp_madd_v2r8(dx30,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy30,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz30,fscal,fjz0);
+             gmx_fjsp_decrement_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0);
+             /* Inner loop uses 129 flops */
+         }
+         if(jidx<j_index_end)
+         {
+             jnrA             = jjnr[jidx];
+             j_coord_offsetA  = DIM*jnrA;
+             /* load j atom coordinates */
+             gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                               &jx0,&jy0,&jz0);
+             /* Calculate displacement vector */
+             dx10             = _fjsp_sub_v2r8(ix1,jx0);
+             dy10             = _fjsp_sub_v2r8(iy1,jy0);
+             dz10             = _fjsp_sub_v2r8(iz1,jz0);
+             dx20             = _fjsp_sub_v2r8(ix2,jx0);
+             dy20             = _fjsp_sub_v2r8(iy2,jy0);
+             dz20             = _fjsp_sub_v2r8(iz2,jz0);
+             dx30             = _fjsp_sub_v2r8(ix3,jx0);
+             dy30             = _fjsp_sub_v2r8(iy3,jy0);
+             dz30             = _fjsp_sub_v2r8(iz3,jz0);
+             /* Calculate squared distance and things based on it */
+             rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+             rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+             rsq30            = gmx_fjsp_calc_rsq_v2r8(dx30,dy30,dz30);
+             rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+             rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+             rinv30           = gmx_fjsp_invsqrt_v2r8(rsq30);
+             /* Load parameters for j particles */
+             jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
+             fjx0             = _fjsp_setzero_v2r8();
+             fjy0             = _fjsp_setzero_v2r8();
+             fjz0             = _fjsp_setzero_v2r8();
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+             /* Compute parameters for interactions between i and j atoms */
+             qq10             = _fjsp_mul_v2r8(iq1,jq0);
+             /* Calculate table index by multiplying r with table scale and truncate to integer */
+             rt               = _fjsp_mul_v2r8(r10,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 4;
+             vfconv.i[1]     *= 4;
+             /* CUBIC SPLINE TABLE ELECTROSTATICS */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+             H                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+             FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+             felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,FF),_fjsp_mul_v2r8(vftabscale,rinv10)));
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+             
+             fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+             /* Compute parameters for interactions between i and j atoms */
+             qq20             = _fjsp_mul_v2r8(iq2,jq0);
+             /* Calculate table index by multiplying r with table scale and truncate to integer */
+             rt               = _fjsp_mul_v2r8(r20,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 4;
+             vfconv.i[1]     *= 4;
+             /* CUBIC SPLINE TABLE ELECTROSTATICS */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+             H                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+             FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+             felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,FF),_fjsp_mul_v2r8(vftabscale,rinv20)));
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+             
+             fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r30              = _fjsp_mul_v2r8(rsq30,rinv30);
+             /* Compute parameters for interactions between i and j atoms */
+             qq30             = _fjsp_mul_v2r8(iq3,jq0);
+             /* Calculate table index by multiplying r with table scale and truncate to integer */
+             rt               = _fjsp_mul_v2r8(r30,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 4;
+             vfconv.i[1]     *= 4;
+             /* CUBIC SPLINE TABLE ELECTROSTATICS */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+             H                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+             FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+             felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq30,FF),_fjsp_mul_v2r8(vftabscale,rinv30)));
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix3             = _fjsp_madd_v2r8(dx30,fscal,fix3);
+             fiy3             = _fjsp_madd_v2r8(dy30,fscal,fiy3);
+             fiz3             = _fjsp_madd_v2r8(dz30,fscal,fiz3);
+             
+             fjx0             = _fjsp_madd_v2r8(dx30,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy30,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz30,fscal,fjz0);
+             gmx_fjsp_decrement_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0);
+             /* Inner loop uses 129 flops */
+         }
+         /* End of innermost loop */
+         gmx_fjsp_update_iforce_3atom_swizzle_v2r8(fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                               f+i_coord_offset+DIM,fshift+i_shift_offset);
+         /* Increment number of inner iterations */
+         inneriter                  += j_index_end - j_index_start;
+         /* Outer loop uses 18 flops */
+     }
+     /* Increment number of outer iterations */
+     outeriter        += nri;
+     /* Update outer/inner flops */
+     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4_F,outeriter*18 + inneriter*129);
+ }
index 0000000000000000000000000000000000000000,c865db23b78e247080c0382f8231c0da1d2453b6..c865db23b78e247080c0382f8231c0da1d2453b6
mode 000000,100644..100644
--- /dev/null
@@@ -1,0 -1,2170 +1,2170 @@@
+ /*
+  * This file is part of the GROMACS molecular simulation package.
+  *
+  * Copyright (c) 2012, by the GROMACS development team, led by
+  * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+  * others, as listed in the AUTHORS file in the top-level source
+  * directory and at http://www.gromacs.org.
+  *
+  * GROMACS is free software; you can redistribute it and/or
+  * modify it under the terms of the GNU Lesser General Public License
+  * as published by the Free Software Foundation; either version 2.1
+  * of the License, or (at your option) any later version.
+  *
+  * GROMACS is distributed in the hope that it will be useful,
+  * but WITHOUT ANY WARRANTY; without even the implied warranty of
+  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  * Lesser General Public License for more details.
+  *
+  * You should have received a copy of the GNU Lesser General Public
+  * License along with GROMACS; if not, see
+  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+  *
+  * If you want to redistribute modifications to GROMACS, please
+  * consider that scientific software is very special. Version
+  * control is crucial - bugs must be traceable. We will be happy to
+  * consider code for inclusion in the official distribution, but
+  * derived work must not be called official GROMACS. Details are found
+  * in the README & COPYING files - if they are missing, get the
+  * official version at http://www.gromacs.org.
+  *
+  * To help us fund GROMACS development, we humbly ask that you cite
+  * the research papers on the package. Check out http://www.gromacs.org.
+  */
+ /*
+  * Note: this file was generated by the GROMACS sparc64_hpc_ace_double kernel generator.
+  */
+ #ifdef HAVE_CONFIG_H
+ #include <config.h>
+ #endif
+ #include <math.h>
+ #include "../nb_kernel.h"
+ #include "types/simple.h"
+ #include "vec.h"
+ #include "nrnb.h"
+ #include "kernelutil_sparc64_hpc_ace_double.h"
+ /*
+  * Gromacs nonbonded kernel:   nb_kernel_ElecCSTab_VdwNone_GeomW4W4_VF_sparc64_hpc_ace_double
+  * Electrostatics interaction: CubicSplineTable
+  * VdW interaction:            None
+  * Geometry:                   Water4-Water4
+  * Calculate force/pot:        PotentialAndForce
+  */
+ void
+ nb_kernel_ElecCSTab_VdwNone_GeomW4W4_VF_sparc64_hpc_ace_double
+                     (t_nblist * gmx_restrict                nlist,
+                      rvec * gmx_restrict                    xx,
+                      rvec * gmx_restrict                    ff,
+                      t_forcerec * gmx_restrict              fr,
+                      t_mdatoms * gmx_restrict               mdatoms,
+                      nb_kernel_data_t * gmx_restrict        kernel_data,
+                      t_nrnb * gmx_restrict                  nrnb)
+ {
+     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+      * just 0 for non-waters.
+      * Suffixes A,B refer to j loop unrolling done with double precision SIMD, i.e. the two different
+      * jnr indices corresponding to data put in the two positions in the SIMD register.
+      */
+     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+     int              jnrA,jnrB;
+     int              j_coord_offsetA,j_coord_offsetB;
+     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+     real             rcutoff_scalar;
+     real             *shiftvec,*fshift,*x,*f;
+     _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+     int              vdwioffset1;
+     _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+     int              vdwioffset2;
+     _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+     int              vdwioffset3;
+     _fjsp_v2r8       ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+     int              vdwjidx1A,vdwjidx1B;
+     _fjsp_v2r8       jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+     int              vdwjidx2A,vdwjidx2B;
+     _fjsp_v2r8       jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+     int              vdwjidx3A,vdwjidx3B;
+     _fjsp_v2r8       jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
+     _fjsp_v2r8       dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+     _fjsp_v2r8       dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+     _fjsp_v2r8       dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
+     _fjsp_v2r8       dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+     _fjsp_v2r8       dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+     _fjsp_v2r8       dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
+     _fjsp_v2r8       dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
+     _fjsp_v2r8       dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
+     _fjsp_v2r8       dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
+     _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+     real             *charge;
+     _fjsp_v2r8       rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF,twovfeps;
+     real             *vftab;
+     _fjsp_v2r8       itab_tmp;
+     _fjsp_v2r8       dummy_mask,cutoff_mask;
+     _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+     _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+     union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+     x                = xx[0];
+     f                = ff[0];
+     nri              = nlist->nri;
+     iinr             = nlist->iinr;
+     jindex           = nlist->jindex;
+     jjnr             = nlist->jjnr;
+     shiftidx         = nlist->shift;
+     gid              = nlist->gid;
+     shiftvec         = fr->shift_vec[0];
+     fshift           = fr->fshift[0];
+     facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+     charge           = mdatoms->chargeA;
+     vftab            = kernel_data->table_elec->data;
+     vftabscale       = gmx_fjsp_set1_v2r8(kernel_data->table_elec->scale);
+     /* Setup water-specific parameters */
+     inr              = nlist->iinr[0];
+     iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+     iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+     iq3              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+3]));
+     jq1              = gmx_fjsp_set1_v2r8(charge[inr+1]);
+     jq2              = gmx_fjsp_set1_v2r8(charge[inr+2]);
+     jq3              = gmx_fjsp_set1_v2r8(charge[inr+3]);
+     qq11             = _fjsp_mul_v2r8(iq1,jq1);
+     qq12             = _fjsp_mul_v2r8(iq1,jq2);
+     qq13             = _fjsp_mul_v2r8(iq1,jq3);
+     qq21             = _fjsp_mul_v2r8(iq2,jq1);
+     qq22             = _fjsp_mul_v2r8(iq2,jq2);
+     qq23             = _fjsp_mul_v2r8(iq2,jq3);
+     qq31             = _fjsp_mul_v2r8(iq3,jq1);
+     qq32             = _fjsp_mul_v2r8(iq3,jq2);
+     qq33             = _fjsp_mul_v2r8(iq3,jq3);
+     /* Avoid stupid compiler warnings */
+     jnrA = jnrB = 0;
+     j_coord_offsetA = 0;
+     j_coord_offsetB = 0;
+     outeriter        = 0;
+     inneriter        = 0;
+     /* Start outer loop over neighborlists */
+     for(iidx=0; iidx<nri; iidx++)
+     {
+         /* Load shift vector for this list */
+         i_shift_offset   = DIM*shiftidx[iidx];
+         /* Load limits for loop over neighbors */
+         j_index_start    = jindex[iidx];
+         j_index_end      = jindex[iidx+1];
+         /* Get outer coordinate index */
+         inr              = iinr[iidx];
+         i_coord_offset   = DIM*inr;
+         /* Load i particle coords and add shift vector */
+         gmx_fjsp_load_shift_and_3rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset+DIM,
+                                                  &ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+         fix1             = _fjsp_setzero_v2r8();
+         fiy1             = _fjsp_setzero_v2r8();
+         fiz1             = _fjsp_setzero_v2r8();
+         fix2             = _fjsp_setzero_v2r8();
+         fiy2             = _fjsp_setzero_v2r8();
+         fiz2             = _fjsp_setzero_v2r8();
+         fix3             = _fjsp_setzero_v2r8();
+         fiy3             = _fjsp_setzero_v2r8();
+         fiz3             = _fjsp_setzero_v2r8();
+         /* Reset potential sums */
+         velecsum         = _fjsp_setzero_v2r8();
+         /* Start inner kernel loop */
+         for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+         {
+             /* Get j neighbor index, and coordinate index */
+             jnrA             = jjnr[jidx];
+             jnrB             = jjnr[jidx+1];
+             j_coord_offsetA  = DIM*jnrA;
+             j_coord_offsetB  = DIM*jnrB;
+             /* load j atom coordinates */
+             gmx_fjsp_load_3rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA+DIM,x+j_coord_offsetB+DIM,
+                                               &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
+             /* Calculate displacement vector */
+             dx11             = _fjsp_sub_v2r8(ix1,jx1);
+             dy11             = _fjsp_sub_v2r8(iy1,jy1);
+             dz11             = _fjsp_sub_v2r8(iz1,jz1);
+             dx12             = _fjsp_sub_v2r8(ix1,jx2);
+             dy12             = _fjsp_sub_v2r8(iy1,jy2);
+             dz12             = _fjsp_sub_v2r8(iz1,jz2);
+             dx13             = _fjsp_sub_v2r8(ix1,jx3);
+             dy13             = _fjsp_sub_v2r8(iy1,jy3);
+             dz13             = _fjsp_sub_v2r8(iz1,jz3);
+             dx21             = _fjsp_sub_v2r8(ix2,jx1);
+             dy21             = _fjsp_sub_v2r8(iy2,jy1);
+             dz21             = _fjsp_sub_v2r8(iz2,jz1);
+             dx22             = _fjsp_sub_v2r8(ix2,jx2);
+             dy22             = _fjsp_sub_v2r8(iy2,jy2);
+             dz22             = _fjsp_sub_v2r8(iz2,jz2);
+             dx23             = _fjsp_sub_v2r8(ix2,jx3);
+             dy23             = _fjsp_sub_v2r8(iy2,jy3);
+             dz23             = _fjsp_sub_v2r8(iz2,jz3);
+             dx31             = _fjsp_sub_v2r8(ix3,jx1);
+             dy31             = _fjsp_sub_v2r8(iy3,jy1);
+             dz31             = _fjsp_sub_v2r8(iz3,jz1);
+             dx32             = _fjsp_sub_v2r8(ix3,jx2);
+             dy32             = _fjsp_sub_v2r8(iy3,jy2);
+             dz32             = _fjsp_sub_v2r8(iz3,jz2);
+             dx33             = _fjsp_sub_v2r8(ix3,jx3);
+             dy33             = _fjsp_sub_v2r8(iy3,jy3);
+             dz33             = _fjsp_sub_v2r8(iz3,jz3);
+             /* Calculate squared distance and things based on it */
+             rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+             rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+             rsq13            = gmx_fjsp_calc_rsq_v2r8(dx13,dy13,dz13);
+             rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+             rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+             rsq23            = gmx_fjsp_calc_rsq_v2r8(dx23,dy23,dz23);
+             rsq31            = gmx_fjsp_calc_rsq_v2r8(dx31,dy31,dz31);
+             rsq32            = gmx_fjsp_calc_rsq_v2r8(dx32,dy32,dz32);
+             rsq33            = gmx_fjsp_calc_rsq_v2r8(dx33,dy33,dz33);
+             rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+             rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+             rinv13           = gmx_fjsp_invsqrt_v2r8(rsq13);
+             rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+             rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+             rinv23           = gmx_fjsp_invsqrt_v2r8(rsq23);
+             rinv31           = gmx_fjsp_invsqrt_v2r8(rsq31);
+             rinv32           = gmx_fjsp_invsqrt_v2r8(rsq32);
+             rinv33           = gmx_fjsp_invsqrt_v2r8(rsq33);
+             fjx1             = _fjsp_setzero_v2r8();
+             fjy1             = _fjsp_setzero_v2r8();
+             fjz1             = _fjsp_setzero_v2r8();
+             fjx2             = _fjsp_setzero_v2r8();
+             fjy2             = _fjsp_setzero_v2r8();
+             fjz2             = _fjsp_setzero_v2r8();
+             fjx3             = _fjsp_setzero_v2r8();
+             fjy3             = _fjsp_setzero_v2r8();
+             fjz3             = _fjsp_setzero_v2r8();
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r11              = _fjsp_mul_v2r8(rsq11,rinv11);
+             /* Calculate table index by multiplying r with table scale and truncate to integer */
+             rt               = _fjsp_mul_v2r8(r11,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 4;
+             vfconv.i[1]     *= 4;
+             /* CUBIC SPLINE TABLE ELECTROSTATICS */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+             H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+             VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+             velec            = _fjsp_mul_v2r8(qq11,VV);
+             FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+             felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq11,FF),_fjsp_mul_v2r8(vftabscale,rinv11)));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+             
+             fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r12              = _fjsp_mul_v2r8(rsq12,rinv12);
+             /* Calculate table index by multiplying r with table scale and truncate to integer */
+             rt               = _fjsp_mul_v2r8(r12,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 4;
+             vfconv.i[1]     *= 4;
+             /* CUBIC SPLINE TABLE ELECTROSTATICS */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+             H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+             VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+             velec            = _fjsp_mul_v2r8(qq12,VV);
+             FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+             felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq12,FF),_fjsp_mul_v2r8(vftabscale,rinv12)));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+             
+             fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r13              = _fjsp_mul_v2r8(rsq13,rinv13);
+             /* Calculate table index by multiplying r with table scale and truncate to integer */
+             rt               = _fjsp_mul_v2r8(r13,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 4;
+             vfconv.i[1]     *= 4;
+             /* CUBIC SPLINE TABLE ELECTROSTATICS */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+             H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+             VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+             velec            = _fjsp_mul_v2r8(qq13,VV);
+             FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+             felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq13,FF),_fjsp_mul_v2r8(vftabscale,rinv13)));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx13,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy13,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz13,fscal,fiz1);
+             
+             fjx3             = _fjsp_madd_v2r8(dx13,fscal,fjx3);
+             fjy3             = _fjsp_madd_v2r8(dy13,fscal,fjy3);
+             fjz3             = _fjsp_madd_v2r8(dz13,fscal,fjz3);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r21              = _fjsp_mul_v2r8(rsq21,rinv21);
+             /* Calculate table index by multiplying r with table scale and truncate to integer */
+             rt               = _fjsp_mul_v2r8(r21,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 4;
+             vfconv.i[1]     *= 4;
+             /* CUBIC SPLINE TABLE ELECTROSTATICS */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+             H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+             VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+             velec            = _fjsp_mul_v2r8(qq21,VV);
+             FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+             felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq21,FF),_fjsp_mul_v2r8(vftabscale,rinv21)));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+             
+             fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r22              = _fjsp_mul_v2r8(rsq22,rinv22);
+             /* Calculate table index by multiplying r with table scale and truncate to integer */
+             rt               = _fjsp_mul_v2r8(r22,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 4;
+             vfconv.i[1]     *= 4;
+             /* CUBIC SPLINE TABLE ELECTROSTATICS */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+             H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+             VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+             velec            = _fjsp_mul_v2r8(qq22,VV);
+             FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+             felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq22,FF),_fjsp_mul_v2r8(vftabscale,rinv22)));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+             
+             fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r23              = _fjsp_mul_v2r8(rsq23,rinv23);
+             /* Calculate table index by multiplying r with table scale and truncate to integer */
+             rt               = _fjsp_mul_v2r8(r23,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 4;
+             vfconv.i[1]     *= 4;
+             /* CUBIC SPLINE TABLE ELECTROSTATICS */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+             H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+             VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+             velec            = _fjsp_mul_v2r8(qq23,VV);
+             FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+             felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq23,FF),_fjsp_mul_v2r8(vftabscale,rinv23)));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx23,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy23,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz23,fscal,fiz2);
+             
+             fjx3             = _fjsp_madd_v2r8(dx23,fscal,fjx3);
+             fjy3             = _fjsp_madd_v2r8(dy23,fscal,fjy3);
+             fjz3             = _fjsp_madd_v2r8(dz23,fscal,fjz3);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r31              = _fjsp_mul_v2r8(rsq31,rinv31);
+             /* Calculate table index by multiplying r with table scale and truncate to integer */
+             rt               = _fjsp_mul_v2r8(r31,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 4;
+             vfconv.i[1]     *= 4;
+             /* CUBIC SPLINE TABLE ELECTROSTATICS */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+             H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+             VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+             velec            = _fjsp_mul_v2r8(qq31,VV);
+             FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+             felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq31,FF),_fjsp_mul_v2r8(vftabscale,rinv31)));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             /* Update vectorial force */
+             fix3             = _fjsp_madd_v2r8(dx31,fscal,fix3);
+             fiy3             = _fjsp_madd_v2r8(dy31,fscal,fiy3);
+             fiz3             = _fjsp_madd_v2r8(dz31,fscal,fiz3);
+             
+             fjx1             = _fjsp_madd_v2r8(dx31,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy31,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz31,fscal,fjz1);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r32              = _fjsp_mul_v2r8(rsq32,rinv32);
+             /* Calculate table index by multiplying r with table scale and truncate to integer */
+             rt               = _fjsp_mul_v2r8(r32,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 4;
+             vfconv.i[1]     *= 4;
+             /* CUBIC SPLINE TABLE ELECTROSTATICS */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+             H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+             VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+             velec            = _fjsp_mul_v2r8(qq32,VV);
+             FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+             felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq32,FF),_fjsp_mul_v2r8(vftabscale,rinv32)));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             /* Update vectorial force */
+             fix3             = _fjsp_madd_v2r8(dx32,fscal,fix3);
+             fiy3             = _fjsp_madd_v2r8(dy32,fscal,fiy3);
+             fiz3             = _fjsp_madd_v2r8(dz32,fscal,fiz3);
+             
+             fjx2             = _fjsp_madd_v2r8(dx32,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy32,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz32,fscal,fjz2);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r33              = _fjsp_mul_v2r8(rsq33,rinv33);
+             /* Calculate table index by multiplying r with table scale and truncate to integer */
+             rt               = _fjsp_mul_v2r8(r33,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 4;
+             vfconv.i[1]     *= 4;
+             /* CUBIC SPLINE TABLE ELECTROSTATICS */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+             H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+             VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+             velec            = _fjsp_mul_v2r8(qq33,VV);
+             FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+             felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq33,FF),_fjsp_mul_v2r8(vftabscale,rinv33)));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             /* Update vectorial force */
+             fix3             = _fjsp_madd_v2r8(dx33,fscal,fix3);
+             fiy3             = _fjsp_madd_v2r8(dy33,fscal,fiy3);
+             fiz3             = _fjsp_madd_v2r8(dz33,fscal,fiz3);
+             
+             fjx3             = _fjsp_madd_v2r8(dx33,fscal,fjx3);
+             fjy3             = _fjsp_madd_v2r8(dy33,fscal,fjy3);
+             fjz3             = _fjsp_madd_v2r8(dz33,fscal,fjz3);
+             gmx_fjsp_decrement_3rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA+DIM,f+j_coord_offsetB+DIM,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+             /* Inner loop uses 414 flops */
+         }
+         if(jidx<j_index_end)
+         {
+             jnrA             = jjnr[jidx];
+             j_coord_offsetA  = DIM*jnrA;
+             /* load j atom coordinates */
+             gmx_fjsp_load_3rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA+DIM,
+                                               &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
+             /* Calculate displacement vector */
+             dx11             = _fjsp_sub_v2r8(ix1,jx1);
+             dy11             = _fjsp_sub_v2r8(iy1,jy1);
+             dz11             = _fjsp_sub_v2r8(iz1,jz1);
+             dx12             = _fjsp_sub_v2r8(ix1,jx2);
+             dy12             = _fjsp_sub_v2r8(iy1,jy2);
+             dz12             = _fjsp_sub_v2r8(iz1,jz2);
+             dx13             = _fjsp_sub_v2r8(ix1,jx3);
+             dy13             = _fjsp_sub_v2r8(iy1,jy3);
+             dz13             = _fjsp_sub_v2r8(iz1,jz3);
+             dx21             = _fjsp_sub_v2r8(ix2,jx1);
+             dy21             = _fjsp_sub_v2r8(iy2,jy1);
+             dz21             = _fjsp_sub_v2r8(iz2,jz1);
+             dx22             = _fjsp_sub_v2r8(ix2,jx2);
+             dy22             = _fjsp_sub_v2r8(iy2,jy2);
+             dz22             = _fjsp_sub_v2r8(iz2,jz2);
+             dx23             = _fjsp_sub_v2r8(ix2,jx3);
+             dy23             = _fjsp_sub_v2r8(iy2,jy3);
+             dz23             = _fjsp_sub_v2r8(iz2,jz3);
+             dx31             = _fjsp_sub_v2r8(ix3,jx1);
+             dy31             = _fjsp_sub_v2r8(iy3,jy1);
+             dz31             = _fjsp_sub_v2r8(iz3,jz1);
+             dx32             = _fjsp_sub_v2r8(ix3,jx2);
+             dy32             = _fjsp_sub_v2r8(iy3,jy2);
+             dz32             = _fjsp_sub_v2r8(iz3,jz2);
+             dx33             = _fjsp_sub_v2r8(ix3,jx3);
+             dy33             = _fjsp_sub_v2r8(iy3,jy3);
+             dz33             = _fjsp_sub_v2r8(iz3,jz3);
+             /* Calculate squared distance and things based on it */
+             rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+             rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+             rsq13            = gmx_fjsp_calc_rsq_v2r8(dx13,dy13,dz13);
+             rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+             rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+             rsq23            = gmx_fjsp_calc_rsq_v2r8(dx23,dy23,dz23);
+             rsq31            = gmx_fjsp_calc_rsq_v2r8(dx31,dy31,dz31);
+             rsq32            = gmx_fjsp_calc_rsq_v2r8(dx32,dy32,dz32);
+             rsq33            = gmx_fjsp_calc_rsq_v2r8(dx33,dy33,dz33);
+             rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+             rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+             rinv13           = gmx_fjsp_invsqrt_v2r8(rsq13);
+             rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+             rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+             rinv23           = gmx_fjsp_invsqrt_v2r8(rsq23);
+             rinv31           = gmx_fjsp_invsqrt_v2r8(rsq31);
+             rinv32           = gmx_fjsp_invsqrt_v2r8(rsq32);
+             rinv33           = gmx_fjsp_invsqrt_v2r8(rsq33);
+             fjx1             = _fjsp_setzero_v2r8();
+             fjy1             = _fjsp_setzero_v2r8();
+             fjz1             = _fjsp_setzero_v2r8();
+             fjx2             = _fjsp_setzero_v2r8();
+             fjy2             = _fjsp_setzero_v2r8();
+             fjz2             = _fjsp_setzero_v2r8();
+             fjx3             = _fjsp_setzero_v2r8();
+             fjy3             = _fjsp_setzero_v2r8();
+             fjz3             = _fjsp_setzero_v2r8();
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r11              = _fjsp_mul_v2r8(rsq11,rinv11);
+             /* Calculate table index by multiplying r with table scale and truncate to integer */
+             rt               = _fjsp_mul_v2r8(r11,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 4;
+             vfconv.i[1]     *= 4;
+             /* CUBIC SPLINE TABLE ELECTROSTATICS */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+             H                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+             VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+             velec            = _fjsp_mul_v2r8(qq11,VV);
+             FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+             felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq11,FF),_fjsp_mul_v2r8(vftabscale,rinv11)));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+             
+             fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r12              = _fjsp_mul_v2r8(rsq12,rinv12);
+             /* Calculate table index by multiplying r with table scale and truncate to integer */
+             rt               = _fjsp_mul_v2r8(r12,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 4;
+             vfconv.i[1]     *= 4;
+             /* CUBIC SPLINE TABLE ELECTROSTATICS */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+             H                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+             VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+             velec            = _fjsp_mul_v2r8(qq12,VV);
+             FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+             felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq12,FF),_fjsp_mul_v2r8(vftabscale,rinv12)));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+             
+             fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r13              = _fjsp_mul_v2r8(rsq13,rinv13);
+             /* Calculate table index by multiplying r with table scale and truncate to integer */
+             rt               = _fjsp_mul_v2r8(r13,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 4;
+             vfconv.i[1]     *= 4;
+             /* CUBIC SPLINE TABLE ELECTROSTATICS */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+             H                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+             VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+             velec            = _fjsp_mul_v2r8(qq13,VV);
+             FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+             felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq13,FF),_fjsp_mul_v2r8(vftabscale,rinv13)));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx13,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy13,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz13,fscal,fiz1);
+             
+             fjx3             = _fjsp_madd_v2r8(dx13,fscal,fjx3);
+             fjy3             = _fjsp_madd_v2r8(dy13,fscal,fjy3);
+             fjz3             = _fjsp_madd_v2r8(dz13,fscal,fjz3);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r21              = _fjsp_mul_v2r8(rsq21,rinv21);
+             /* Calculate table index by multiplying r with table scale and truncate to integer */
+             rt               = _fjsp_mul_v2r8(r21,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 4;
+             vfconv.i[1]     *= 4;
+             /* CUBIC SPLINE TABLE ELECTROSTATICS */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+             H                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+             VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+             velec            = _fjsp_mul_v2r8(qq21,VV);
+             FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+             felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq21,FF),_fjsp_mul_v2r8(vftabscale,rinv21)));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+             
+             fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r22              = _fjsp_mul_v2r8(rsq22,rinv22);
+             /* Calculate table index by multiplying r with table scale and truncate to integer */
+             rt               = _fjsp_mul_v2r8(r22,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 4;
+             vfconv.i[1]     *= 4;
+             /* CUBIC SPLINE TABLE ELECTROSTATICS */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+             H                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+             VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+             velec            = _fjsp_mul_v2r8(qq22,VV);
+             FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+             felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq22,FF),_fjsp_mul_v2r8(vftabscale,rinv22)));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+             
+             fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r23              = _fjsp_mul_v2r8(rsq23,rinv23);
+             /* Calculate table index by multiplying r with table scale and truncate to integer */
+             rt               = _fjsp_mul_v2r8(r23,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 4;
+             vfconv.i[1]     *= 4;
+             /* CUBIC SPLINE TABLE ELECTROSTATICS */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+             H                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+             VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+             velec            = _fjsp_mul_v2r8(qq23,VV);
+             FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+             felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq23,FF),_fjsp_mul_v2r8(vftabscale,rinv23)));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx23,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy23,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz23,fscal,fiz2);
+             
+             fjx3             = _fjsp_madd_v2r8(dx23,fscal,fjx3);
+             fjy3             = _fjsp_madd_v2r8(dy23,fscal,fjy3);
+             fjz3             = _fjsp_madd_v2r8(dz23,fscal,fjz3);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r31              = _fjsp_mul_v2r8(rsq31,rinv31);
+             /* Calculate table index by multiplying r with table scale and truncate to integer */
+             rt               = _fjsp_mul_v2r8(r31,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 4;
+             vfconv.i[1]     *= 4;
+             /* CUBIC SPLINE TABLE ELECTROSTATICS */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+             H                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+             VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+             velec            = _fjsp_mul_v2r8(qq31,VV);
+             FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+             felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq31,FF),_fjsp_mul_v2r8(vftabscale,rinv31)));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix3             = _fjsp_madd_v2r8(dx31,fscal,fix3);
+             fiy3             = _fjsp_madd_v2r8(dy31,fscal,fiy3);
+             fiz3             = _fjsp_madd_v2r8(dz31,fscal,fiz3);
+             
+             fjx1             = _fjsp_madd_v2r8(dx31,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy31,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz31,fscal,fjz1);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r32              = _fjsp_mul_v2r8(rsq32,rinv32);
+             /* Calculate table index by multiplying r with table scale and truncate to integer */
+             rt               = _fjsp_mul_v2r8(r32,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 4;
+             vfconv.i[1]     *= 4;
+             /* CUBIC SPLINE TABLE ELECTROSTATICS */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+             H                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+             VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+             velec            = _fjsp_mul_v2r8(qq32,VV);
+             FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+             felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq32,FF),_fjsp_mul_v2r8(vftabscale,rinv32)));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix3             = _fjsp_madd_v2r8(dx32,fscal,fix3);
+             fiy3             = _fjsp_madd_v2r8(dy32,fscal,fiy3);
+             fiz3             = _fjsp_madd_v2r8(dz32,fscal,fiz3);
+             
+             fjx2             = _fjsp_madd_v2r8(dx32,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy32,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz32,fscal,fjz2);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r33              = _fjsp_mul_v2r8(rsq33,rinv33);
+             /* Calculate table index by multiplying r with table scale and truncate to integer */
+             rt               = _fjsp_mul_v2r8(r33,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 4;
+             vfconv.i[1]     *= 4;
+             /* CUBIC SPLINE TABLE ELECTROSTATICS */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+             H                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+             VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+             velec            = _fjsp_mul_v2r8(qq33,VV);
+             FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+             felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq33,FF),_fjsp_mul_v2r8(vftabscale,rinv33)));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix3             = _fjsp_madd_v2r8(dx33,fscal,fix3);
+             fiy3             = _fjsp_madd_v2r8(dy33,fscal,fiy3);
+             fiz3             = _fjsp_madd_v2r8(dz33,fscal,fiz3);
+             
+             fjx3             = _fjsp_madd_v2r8(dx33,fscal,fjx3);
+             fjy3             = _fjsp_madd_v2r8(dy33,fscal,fjy3);
+             fjz3             = _fjsp_madd_v2r8(dz33,fscal,fjz3);
+             gmx_fjsp_decrement_3rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA+DIM,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+             /* Inner loop uses 414 flops */
+         }
+         /* End of innermost loop */
+         gmx_fjsp_update_iforce_3atom_swizzle_v2r8(fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                               f+i_coord_offset+DIM,fshift+i_shift_offset);
+         ggid                        = gid[iidx];
+         /* Update potential energies */
+         gmx_fjsp_update_1pot_v2r8(velecsum,kernel_data->energygrp_elec+ggid);
+         /* Increment number of inner iterations */
+         inneriter                  += j_index_end - j_index_start;
+         /* Outer loop uses 19 flops */
+     }
+     /* Increment number of outer iterations */
+     outeriter        += nri;
+     /* Update outer/inner flops */
+     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4W4_VF,outeriter*19 + inneriter*414);
+ }
+ /*
+  * Gromacs nonbonded kernel:   nb_kernel_ElecCSTab_VdwNone_GeomW4W4_F_sparc64_hpc_ace_double
+  * Electrostatics interaction: CubicSplineTable
+  * VdW interaction:            None
+  * Geometry:                   Water4-Water4
+  * Calculate force/pot:        Force
+  */
+ void
+ nb_kernel_ElecCSTab_VdwNone_GeomW4W4_F_sparc64_hpc_ace_double
+                     (t_nblist * gmx_restrict                nlist,
+                      rvec * gmx_restrict                    xx,
+                      rvec * gmx_restrict                    ff,
+                      t_forcerec * gmx_restrict              fr,
+                      t_mdatoms * gmx_restrict               mdatoms,
+                      nb_kernel_data_t * gmx_restrict        kernel_data,
+                      t_nrnb * gmx_restrict                  nrnb)
+ {
+     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+      * just 0 for non-waters.
+      * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+      * jnr indices corresponding to data put in the two positions in the SIMD register.
+      */
+     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+     int              jnrA,jnrB;
+     int              j_coord_offsetA,j_coord_offsetB;
+     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+     real             rcutoff_scalar;
+     real             *shiftvec,*fshift,*x,*f;
+     _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+     int              vdwioffset1;
+     _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+     int              vdwioffset2;
+     _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+     int              vdwioffset3;
+     _fjsp_v2r8       ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+     int              vdwjidx1A,vdwjidx1B;
+     _fjsp_v2r8       jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+     int              vdwjidx2A,vdwjidx2B;
+     _fjsp_v2r8       jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+     int              vdwjidx3A,vdwjidx3B;
+     _fjsp_v2r8       jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
+     _fjsp_v2r8       dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+     _fjsp_v2r8       dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+     _fjsp_v2r8       dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
+     _fjsp_v2r8       dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+     _fjsp_v2r8       dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+     _fjsp_v2r8       dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
+     _fjsp_v2r8       dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
+     _fjsp_v2r8       dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
+     _fjsp_v2r8       dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
+     _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+     real             *charge;
+     _fjsp_v2r8       rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF,twovfeps;
+     real             *vftab;
+     _fjsp_v2r8       itab_tmp;
+     _fjsp_v2r8       dummy_mask,cutoff_mask;
+     _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+     _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+     union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+     x                = xx[0];
+     f                = ff[0];
+     nri              = nlist->nri;
+     iinr             = nlist->iinr;
+     jindex           = nlist->jindex;
+     jjnr             = nlist->jjnr;
+     shiftidx         = nlist->shift;
+     gid              = nlist->gid;
+     shiftvec         = fr->shift_vec[0];
+     fshift           = fr->fshift[0];
+     facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+     charge           = mdatoms->chargeA;
+     vftab            = kernel_data->table_elec->data;
+     vftabscale       = gmx_fjsp_set1_v2r8(kernel_data->table_elec->scale);
+     /* Setup water-specific parameters */
+     inr              = nlist->iinr[0];
+     iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+     iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+     iq3              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+3]));
+     jq1              = gmx_fjsp_set1_v2r8(charge[inr+1]);
+     jq2              = gmx_fjsp_set1_v2r8(charge[inr+2]);
+     jq3              = gmx_fjsp_set1_v2r8(charge[inr+3]);
+     qq11             = _fjsp_mul_v2r8(iq1,jq1);
+     qq12             = _fjsp_mul_v2r8(iq1,jq2);
+     qq13             = _fjsp_mul_v2r8(iq1,jq3);
+     qq21             = _fjsp_mul_v2r8(iq2,jq1);
+     qq22             = _fjsp_mul_v2r8(iq2,jq2);
+     qq23             = _fjsp_mul_v2r8(iq2,jq3);
+     qq31             = _fjsp_mul_v2r8(iq3,jq1);
+     qq32             = _fjsp_mul_v2r8(iq3,jq2);
+     qq33             = _fjsp_mul_v2r8(iq3,jq3);
+     /* Avoid stupid compiler warnings */
+     jnrA = jnrB = 0;
+     j_coord_offsetA = 0;
+     j_coord_offsetB = 0;
+     outeriter        = 0;
+     inneriter        = 0;
+     /* Start outer loop over neighborlists */
+     for(iidx=0; iidx<nri; iidx++)
+     {
+         /* Load shift vector for this list */
+         i_shift_offset   = DIM*shiftidx[iidx];
+         /* Load limits for loop over neighbors */
+         j_index_start    = jindex[iidx];
+         j_index_end      = jindex[iidx+1];
+         /* Get outer coordinate index */
+         inr              = iinr[iidx];
+         i_coord_offset   = DIM*inr;
+         /* Load i particle coords and add shift vector */
+         gmx_fjsp_load_shift_and_3rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset+DIM,
+                                                  &ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+         fix1             = _fjsp_setzero_v2r8();
+         fiy1             = _fjsp_setzero_v2r8();
+         fiz1             = _fjsp_setzero_v2r8();
+         fix2             = _fjsp_setzero_v2r8();
+         fiy2             = _fjsp_setzero_v2r8();
+         fiz2             = _fjsp_setzero_v2r8();
+         fix3             = _fjsp_setzero_v2r8();
+         fiy3             = _fjsp_setzero_v2r8();
+         fiz3             = _fjsp_setzero_v2r8();
+         /* Start inner kernel loop */
+         for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+         {
+             /* Get j neighbor index, and coordinate index */
+             jnrA             = jjnr[jidx];
+             jnrB             = jjnr[jidx+1];
+             j_coord_offsetA  = DIM*jnrA;
+             j_coord_offsetB  = DIM*jnrB;
+             /* load j atom coordinates */
+             gmx_fjsp_load_3rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA+DIM,x+j_coord_offsetB+DIM,
+                                               &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
+             /* Calculate displacement vector */
+             dx11             = _fjsp_sub_v2r8(ix1,jx1);
+             dy11             = _fjsp_sub_v2r8(iy1,jy1);
+             dz11             = _fjsp_sub_v2r8(iz1,jz1);
+             dx12             = _fjsp_sub_v2r8(ix1,jx2);
+             dy12             = _fjsp_sub_v2r8(iy1,jy2);
+             dz12             = _fjsp_sub_v2r8(iz1,jz2);
+             dx13             = _fjsp_sub_v2r8(ix1,jx3);
+             dy13             = _fjsp_sub_v2r8(iy1,jy3);
+             dz13             = _fjsp_sub_v2r8(iz1,jz3);
+             dx21             = _fjsp_sub_v2r8(ix2,jx1);
+             dy21             = _fjsp_sub_v2r8(iy2,jy1);
+             dz21             = _fjsp_sub_v2r8(iz2,jz1);
+             dx22             = _fjsp_sub_v2r8(ix2,jx2);
+             dy22             = _fjsp_sub_v2r8(iy2,jy2);
+             dz22             = _fjsp_sub_v2r8(iz2,jz2);
+             dx23             = _fjsp_sub_v2r8(ix2,jx3);
+             dy23             = _fjsp_sub_v2r8(iy2,jy3);
+             dz23             = _fjsp_sub_v2r8(iz2,jz3);
+             dx31             = _fjsp_sub_v2r8(ix3,jx1);
+             dy31             = _fjsp_sub_v2r8(iy3,jy1);
+             dz31             = _fjsp_sub_v2r8(iz3,jz1);
+             dx32             = _fjsp_sub_v2r8(ix3,jx2);
+             dy32             = _fjsp_sub_v2r8(iy3,jy2);
+             dz32             = _fjsp_sub_v2r8(iz3,jz2);
+             dx33             = _fjsp_sub_v2r8(ix3,jx3);
+             dy33             = _fjsp_sub_v2r8(iy3,jy3);
+             dz33             = _fjsp_sub_v2r8(iz3,jz3);
+             /* Calculate squared distance and things based on it */
+             rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+             rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+             rsq13            = gmx_fjsp_calc_rsq_v2r8(dx13,dy13,dz13);
+             rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+             rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+             rsq23            = gmx_fjsp_calc_rsq_v2r8(dx23,dy23,dz23);
+             rsq31            = gmx_fjsp_calc_rsq_v2r8(dx31,dy31,dz31);
+             rsq32            = gmx_fjsp_calc_rsq_v2r8(dx32,dy32,dz32);
+             rsq33            = gmx_fjsp_calc_rsq_v2r8(dx33,dy33,dz33);
+             rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+             rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+             rinv13           = gmx_fjsp_invsqrt_v2r8(rsq13);
+             rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+             rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+             rinv23           = gmx_fjsp_invsqrt_v2r8(rsq23);
+             rinv31           = gmx_fjsp_invsqrt_v2r8(rsq31);
+             rinv32           = gmx_fjsp_invsqrt_v2r8(rsq32);
+             rinv33           = gmx_fjsp_invsqrt_v2r8(rsq33);
+             fjx1             = _fjsp_setzero_v2r8();
+             fjy1             = _fjsp_setzero_v2r8();
+             fjz1             = _fjsp_setzero_v2r8();
+             fjx2             = _fjsp_setzero_v2r8();
+             fjy2             = _fjsp_setzero_v2r8();
+             fjz2             = _fjsp_setzero_v2r8();
+             fjx3             = _fjsp_setzero_v2r8();
+             fjy3             = _fjsp_setzero_v2r8();
+             fjz3             = _fjsp_setzero_v2r8();
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r11              = _fjsp_mul_v2r8(rsq11,rinv11);
+             /* Calculate table index by multiplying r with table scale and truncate to integer */
+             rt               = _fjsp_mul_v2r8(r11,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 4;
+             vfconv.i[1]     *= 4;
+             /* CUBIC SPLINE TABLE ELECTROSTATICS */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+             H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+             FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+             felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq11,FF),_fjsp_mul_v2r8(vftabscale,rinv11)));
+             fscal            = felec;
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+             
+             fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r12              = _fjsp_mul_v2r8(rsq12,rinv12);
+             /* Calculate table index by multiplying r with table scale and truncate to integer */
+             rt               = _fjsp_mul_v2r8(r12,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 4;
+             vfconv.i[1]     *= 4;
+             /* CUBIC SPLINE TABLE ELECTROSTATICS */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+             H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+             FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+             felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq12,FF),_fjsp_mul_v2r8(vftabscale,rinv12)));
+             fscal            = felec;
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+             
+             fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r13              = _fjsp_mul_v2r8(rsq13,rinv13);
+             /* Calculate table index by multiplying r with table scale and truncate to integer */
+             rt               = _fjsp_mul_v2r8(r13,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 4;
+             vfconv.i[1]     *= 4;
+             /* CUBIC SPLINE TABLE ELECTROSTATICS */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+             H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+             FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+             felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq13,FF),_fjsp_mul_v2r8(vftabscale,rinv13)));
+             fscal            = felec;
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx13,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy13,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz13,fscal,fiz1);
+             
+             fjx3             = _fjsp_madd_v2r8(dx13,fscal,fjx3);
+             fjy3             = _fjsp_madd_v2r8(dy13,fscal,fjy3);
+             fjz3             = _fjsp_madd_v2r8(dz13,fscal,fjz3);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r21              = _fjsp_mul_v2r8(rsq21,rinv21);
+             /* Calculate table index by multiplying r with table scale and truncate to integer */
+             rt               = _fjsp_mul_v2r8(r21,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 4;
+             vfconv.i[1]     *= 4;
+             /* CUBIC SPLINE TABLE ELECTROSTATICS */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+             H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+             FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+             felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq21,FF),_fjsp_mul_v2r8(vftabscale,rinv21)));
+             fscal            = felec;
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+             
+             fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r22              = _fjsp_mul_v2r8(rsq22,rinv22);
+             /* Calculate table index by multiplying r with table scale and truncate to integer */
+             rt               = _fjsp_mul_v2r8(r22,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 4;
+             vfconv.i[1]     *= 4;
+             /* CUBIC SPLINE TABLE ELECTROSTATICS */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+             H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+             FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+             felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq22,FF),_fjsp_mul_v2r8(vftabscale,rinv22)));
+             fscal            = felec;
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+             
+             fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r23              = _fjsp_mul_v2r8(rsq23,rinv23);
+             /* Calculate table index by multiplying r with table scale and truncate to integer */
+             rt               = _fjsp_mul_v2r8(r23,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 4;
+             vfconv.i[1]     *= 4;
+             /* CUBIC SPLINE TABLE ELECTROSTATICS */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+             H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+             FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+             felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq23,FF),_fjsp_mul_v2r8(vftabscale,rinv23)));
+             fscal            = felec;
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx23,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy23,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz23,fscal,fiz2);
+             
+             fjx3             = _fjsp_madd_v2r8(dx23,fscal,fjx3);
+             fjy3             = _fjsp_madd_v2r8(dy23,fscal,fjy3);
+             fjz3             = _fjsp_madd_v2r8(dz23,fscal,fjz3);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r31              = _fjsp_mul_v2r8(rsq31,rinv31);
+             /* Calculate table index by multiplying r with table scale and truncate to integer */
+             rt               = _fjsp_mul_v2r8(r31,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 4;
+             vfconv.i[1]     *= 4;
+             /* CUBIC SPLINE TABLE ELECTROSTATICS */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+             H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+             FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+             felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq31,FF),_fjsp_mul_v2r8(vftabscale,rinv31)));
+             fscal            = felec;
+             /* Update vectorial force */
+             fix3             = _fjsp_madd_v2r8(dx31,fscal,fix3);
+             fiy3             = _fjsp_madd_v2r8(dy31,fscal,fiy3);
+             fiz3             = _fjsp_madd_v2r8(dz31,fscal,fiz3);
+             
+             fjx1             = _fjsp_madd_v2r8(dx31,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy31,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz31,fscal,fjz1);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r32              = _fjsp_mul_v2r8(rsq32,rinv32);
+             /* Calculate table index by multiplying r with table scale and truncate to integer */
+             rt               = _fjsp_mul_v2r8(r32,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 4;
+             vfconv.i[1]     *= 4;
+             /* CUBIC SPLINE TABLE ELECTROSTATICS */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+             H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+             FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+             felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq32,FF),_fjsp_mul_v2r8(vftabscale,rinv32)));
+             fscal            = felec;
+             /* Update vectorial force */
+             fix3             = _fjsp_madd_v2r8(dx32,fscal,fix3);
+             fiy3             = _fjsp_madd_v2r8(dy32,fscal,fiy3);
+             fiz3             = _fjsp_madd_v2r8(dz32,fscal,fiz3);
+             
+             fjx2             = _fjsp_madd_v2r8(dx32,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy32,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz32,fscal,fjz2);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r33              = _fjsp_mul_v2r8(rsq33,rinv33);
+             /* Calculate table index by multiplying r with table scale and truncate to integer */
+             rt               = _fjsp_mul_v2r8(r33,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 4;
+             vfconv.i[1]     *= 4;
+             /* CUBIC SPLINE TABLE ELECTROSTATICS */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+             H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+             FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+             felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq33,FF),_fjsp_mul_v2r8(vftabscale,rinv33)));
+             fscal            = felec;
+             /* Update vectorial force */
+             fix3             = _fjsp_madd_v2r8(dx33,fscal,fix3);
+             fiy3             = _fjsp_madd_v2r8(dy33,fscal,fiy3);
+             fiz3             = _fjsp_madd_v2r8(dz33,fscal,fiz3);
+             
+             fjx3             = _fjsp_madd_v2r8(dx33,fscal,fjx3);
+             fjy3             = _fjsp_madd_v2r8(dy33,fscal,fjy3);
+             fjz3             = _fjsp_madd_v2r8(dz33,fscal,fjz3);
+             gmx_fjsp_decrement_3rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA+DIM,f+j_coord_offsetB+DIM,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+             /* Inner loop uses 378 flops */
+         }
+         if(jidx<j_index_end)
+         {
+             jnrA             = jjnr[jidx];
+             j_coord_offsetA  = DIM*jnrA;
+             /* load j atom coordinates */
+             gmx_fjsp_load_3rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA+DIM,
+                                               &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
+             /* Calculate displacement vector */
+             dx11             = _fjsp_sub_v2r8(ix1,jx1);
+             dy11             = _fjsp_sub_v2r8(iy1,jy1);
+             dz11             = _fjsp_sub_v2r8(iz1,jz1);
+             dx12             = _fjsp_sub_v2r8(ix1,jx2);
+             dy12             = _fjsp_sub_v2r8(iy1,jy2);
+             dz12             = _fjsp_sub_v2r8(iz1,jz2);
+             dx13             = _fjsp_sub_v2r8(ix1,jx3);
+             dy13             = _fjsp_sub_v2r8(iy1,jy3);
+             dz13             = _fjsp_sub_v2r8(iz1,jz3);
+             dx21             = _fjsp_sub_v2r8(ix2,jx1);
+             dy21             = _fjsp_sub_v2r8(iy2,jy1);
+             dz21             = _fjsp_sub_v2r8(iz2,jz1);
+             dx22             = _fjsp_sub_v2r8(ix2,jx2);
+             dy22             = _fjsp_sub_v2r8(iy2,jy2);
+             dz22             = _fjsp_sub_v2r8(iz2,jz2);
+             dx23             = _fjsp_sub_v2r8(ix2,jx3);
+             dy23             = _fjsp_sub_v2r8(iy2,jy3);
+             dz23             = _fjsp_sub_v2r8(iz2,jz3);
+             dx31             = _fjsp_sub_v2r8(ix3,jx1);
+             dy31             = _fjsp_sub_v2r8(iy3,jy1);
+             dz31             = _fjsp_sub_v2r8(iz3,jz1);
+             dx32             = _fjsp_sub_v2r8(ix3,jx2);
+             dy32             = _fjsp_sub_v2r8(iy3,jy2);
+             dz32             = _fjsp_sub_v2r8(iz3,jz2);
+             dx33             = _fjsp_sub_v2r8(ix3,jx3);
+             dy33             = _fjsp_sub_v2r8(iy3,jy3);
+             dz33             = _fjsp_sub_v2r8(iz3,jz3);
+             /* Calculate squared distance and things based on it */
+             rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+             rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+             rsq13            = gmx_fjsp_calc_rsq_v2r8(dx13,dy13,dz13);
+             rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+             rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+             rsq23            = gmx_fjsp_calc_rsq_v2r8(dx23,dy23,dz23);
+             rsq31            = gmx_fjsp_calc_rsq_v2r8(dx31,dy31,dz31);
+             rsq32            = gmx_fjsp_calc_rsq_v2r8(dx32,dy32,dz32);
+             rsq33            = gmx_fjsp_calc_rsq_v2r8(dx33,dy33,dz33);
+             rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+             rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+             rinv13           = gmx_fjsp_invsqrt_v2r8(rsq13);
+             rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+             rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+             rinv23           = gmx_fjsp_invsqrt_v2r8(rsq23);
+             rinv31           = gmx_fjsp_invsqrt_v2r8(rsq31);
+             rinv32           = gmx_fjsp_invsqrt_v2r8(rsq32);
+             rinv33           = gmx_fjsp_invsqrt_v2r8(rsq33);
+             fjx1             = _fjsp_setzero_v2r8();
+             fjy1             = _fjsp_setzero_v2r8();
+             fjz1             = _fjsp_setzero_v2r8();
+             fjx2             = _fjsp_setzero_v2r8();
+             fjy2             = _fjsp_setzero_v2r8();
+             fjz2             = _fjsp_setzero_v2r8();
+             fjx3             = _fjsp_setzero_v2r8();
+             fjy3             = _fjsp_setzero_v2r8();
+             fjz3             = _fjsp_setzero_v2r8();
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r11              = _fjsp_mul_v2r8(rsq11,rinv11);
+             /* Calculate table index by multiplying r with table scale and truncate to integer */
+             rt               = _fjsp_mul_v2r8(r11,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 4;
+             vfconv.i[1]     *= 4;
+             /* CUBIC SPLINE TABLE ELECTROSTATICS */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+             H                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+             FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+             felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq11,FF),_fjsp_mul_v2r8(vftabscale,rinv11)));
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+             
+             fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r12              = _fjsp_mul_v2r8(rsq12,rinv12);
+             /* Calculate table index by multiplying r with table scale and truncate to integer */
+             rt               = _fjsp_mul_v2r8(r12,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 4;
+             vfconv.i[1]     *= 4;
+             /* CUBIC SPLINE TABLE ELECTROSTATICS */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+             H                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+             FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+             felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq12,FF),_fjsp_mul_v2r8(vftabscale,rinv12)));
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+             
+             fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r13              = _fjsp_mul_v2r8(rsq13,rinv13);
+             /* Calculate table index by multiplying r with table scale and truncate to integer */
+             rt               = _fjsp_mul_v2r8(r13,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 4;
+             vfconv.i[1]     *= 4;
+             /* CUBIC SPLINE TABLE ELECTROSTATICS */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+             H                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+             FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+             felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq13,FF),_fjsp_mul_v2r8(vftabscale,rinv13)));
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx13,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy13,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz13,fscal,fiz1);
+             
+             fjx3             = _fjsp_madd_v2r8(dx13,fscal,fjx3);
+             fjy3             = _fjsp_madd_v2r8(dy13,fscal,fjy3);
+             fjz3             = _fjsp_madd_v2r8(dz13,fscal,fjz3);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r21              = _fjsp_mul_v2r8(rsq21,rinv21);
+             /* Calculate table index by multiplying r with table scale and truncate to integer */
+             rt               = _fjsp_mul_v2r8(r21,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 4;
+             vfconv.i[1]     *= 4;
+             /* CUBIC SPLINE TABLE ELECTROSTATICS */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+             H                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+             FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+             felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq21,FF),_fjsp_mul_v2r8(vftabscale,rinv21)));
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+             
+             fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r22              = _fjsp_mul_v2r8(rsq22,rinv22);
+             /* Calculate table index by multiplying r with table scale and truncate to integer */
+             rt               = _fjsp_mul_v2r8(r22,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 4;
+             vfconv.i[1]     *= 4;
+             /* CUBIC SPLINE TABLE ELECTROSTATICS */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+             H                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+             FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+             felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq22,FF),_fjsp_mul_v2r8(vftabscale,rinv22)));
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+             
+             fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r23              = _fjsp_mul_v2r8(rsq23,rinv23);
+             /* Calculate table index by multiplying r with table scale and truncate to integer */
+             rt               = _fjsp_mul_v2r8(r23,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 4;
+             vfconv.i[1]     *= 4;
+             /* CUBIC SPLINE TABLE ELECTROSTATICS */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+             H                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+             FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+             felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq23,FF),_fjsp_mul_v2r8(vftabscale,rinv23)));
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx23,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy23,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz23,fscal,fiz2);
+             
+             fjx3             = _fjsp_madd_v2r8(dx23,fscal,fjx3);
+             fjy3             = _fjsp_madd_v2r8(dy23,fscal,fjy3);
+             fjz3             = _fjsp_madd_v2r8(dz23,fscal,fjz3);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r31              = _fjsp_mul_v2r8(rsq31,rinv31);
+             /* Calculate table index by multiplying r with table scale and truncate to integer */
+             rt               = _fjsp_mul_v2r8(r31,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 4;
+             vfconv.i[1]     *= 4;
+             /* CUBIC SPLINE TABLE ELECTROSTATICS */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+             H                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+             FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+             felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq31,FF),_fjsp_mul_v2r8(vftabscale,rinv31)));
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix3             = _fjsp_madd_v2r8(dx31,fscal,fix3);
+             fiy3             = _fjsp_madd_v2r8(dy31,fscal,fiy3);
+             fiz3             = _fjsp_madd_v2r8(dz31,fscal,fiz3);
+             
+             fjx1             = _fjsp_madd_v2r8(dx31,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy31,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz31,fscal,fjz1);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r32              = _fjsp_mul_v2r8(rsq32,rinv32);
+             /* Calculate table index by multiplying r with table scale and truncate to integer */
+             rt               = _fjsp_mul_v2r8(r32,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 4;
+             vfconv.i[1]     *= 4;
+             /* CUBIC SPLINE TABLE ELECTROSTATICS */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+             H                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+             FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+             felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq32,FF),_fjsp_mul_v2r8(vftabscale,rinv32)));
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix3             = _fjsp_madd_v2r8(dx32,fscal,fix3);
+             fiy3             = _fjsp_madd_v2r8(dy32,fscal,fiy3);
+             fiz3             = _fjsp_madd_v2r8(dz32,fscal,fiz3);
+             
+             fjx2             = _fjsp_madd_v2r8(dx32,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy32,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz32,fscal,fjz2);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r33              = _fjsp_mul_v2r8(rsq33,rinv33);
+             /* Calculate table index by multiplying r with table scale and truncate to integer */
+             rt               = _fjsp_mul_v2r8(r33,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 4;
+             vfconv.i[1]     *= 4;
+             /* CUBIC SPLINE TABLE ELECTROSTATICS */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+             H                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+             FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+             felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq33,FF),_fjsp_mul_v2r8(vftabscale,rinv33)));
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix3             = _fjsp_madd_v2r8(dx33,fscal,fix3);
+             fiy3             = _fjsp_madd_v2r8(dy33,fscal,fiy3);
+             fiz3             = _fjsp_madd_v2r8(dz33,fscal,fiz3);
+             
+             fjx3             = _fjsp_madd_v2r8(dx33,fscal,fjx3);
+             fjy3             = _fjsp_madd_v2r8(dy33,fscal,fjy3);
+             fjz3             = _fjsp_madd_v2r8(dz33,fscal,fjz3);
+             gmx_fjsp_decrement_3rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA+DIM,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+             /* Inner loop uses 378 flops */
+         }
+         /* End of innermost loop */
+         gmx_fjsp_update_iforce_3atom_swizzle_v2r8(fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                               f+i_coord_offset+DIM,fshift+i_shift_offset);
+         /* Increment number of inner iterations */
+         inneriter                  += j_index_end - j_index_start;
+         /* Outer loop uses 18 flops */
+     }
+     /* Increment number of outer iterations */
+     outeriter        += nri;
+     /* Update outer/inner flops */
+     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4W4_F,outeriter*18 + inneriter*378);
+ }
index 0000000000000000000000000000000000000000,6acc0a7e731814efeb6b59ee50b3662c8ff6420e..6acc0a7e731814efeb6b59ee50b3662c8ff6420e
mode 000000,100644..100644
--- /dev/null
@@@ -1,0 -1,679 +1,679 @@@
+ /*
+  * This file is part of the GROMACS molecular simulation package.
+  *
+  * Copyright (c) 2012, by the GROMACS development team, led by
+  * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+  * others, as listed in the AUTHORS file in the top-level source
+  * directory and at http://www.gromacs.org.
+  *
+  * GROMACS is free software; you can redistribute it and/or
+  * modify it under the terms of the GNU Lesser General Public License
+  * as published by the Free Software Foundation; either version 2.1
+  * of the License, or (at your option) any later version.
+  *
+  * GROMACS is distributed in the hope that it will be useful,
+  * but WITHOUT ANY WARRANTY; without even the implied warranty of
+  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  * Lesser General Public License for more details.
+  *
+  * You should have received a copy of the GNU Lesser General Public
+  * License along with GROMACS; if not, see
+  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+  *
+  * If you want to redistribute modifications to GROMACS, please
+  * consider that scientific software is very special. Version
+  * control is crucial - bugs must be traceable. We will be happy to
+  * consider code for inclusion in the official distribution, but
+  * derived work must not be called official GROMACS. Details are found
+  * in the README & COPYING files - if they are missing, get the
+  * official version at http://www.gromacs.org.
+  *
+  * To help us fund GROMACS development, we humbly ask that you cite
+  * the research papers on the package. Check out http://www.gromacs.org.
+  */
+ /*
+  * Note: this file was generated by the GROMACS sparc64_hpc_ace_double kernel generator.
+  */
+ #ifdef HAVE_CONFIG_H
+ #include <config.h>
+ #endif
+ #include <math.h>
+ #include "../nb_kernel.h"
+ #include "types/simple.h"
+ #include "vec.h"
+ #include "nrnb.h"
+ #include "kernelutil_sparc64_hpc_ace_double.h"
+ /*
+  * Gromacs nonbonded kernel:   nb_kernel_ElecCoul_VdwCSTab_GeomP1P1_VF_sparc64_hpc_ace_double
+  * Electrostatics interaction: Coulomb
+  * VdW interaction:            CubicSplineTable
+  * Geometry:                   Particle-Particle
+  * Calculate force/pot:        PotentialAndForce
+  */
+ void
+ nb_kernel_ElecCoul_VdwCSTab_GeomP1P1_VF_sparc64_hpc_ace_double
+                     (t_nblist * gmx_restrict                nlist,
+                      rvec * gmx_restrict                    xx,
+                      rvec * gmx_restrict                    ff,
+                      t_forcerec * gmx_restrict              fr,
+                      t_mdatoms * gmx_restrict               mdatoms,
+                      nb_kernel_data_t * gmx_restrict        kernel_data,
+                      t_nrnb * gmx_restrict                  nrnb)
+ {
+     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+      * just 0 for non-waters.
+      * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+      * jnr indices corresponding to data put in the two positions in the SIMD register.
+      */
+     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+     int              jnrA,jnrB;
+     int              j_coord_offsetA,j_coord_offsetB;
+     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+     real             rcutoff_scalar;
+     real             *shiftvec,*fshift,*x,*f;
+     _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+     int              vdwioffset0;
+     _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+     int              vdwjidx0A,vdwjidx0B;
+     _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+     _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+     _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+     real             *charge;
+     int              nvdwtype;
+     _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+     int              *vdwtype;
+     real             *vdwparam;
+     _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+     _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+     _fjsp_v2r8       rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF,twovfeps;
+     real             *vftab;
+     _fjsp_v2r8       itab_tmp;
+     _fjsp_v2r8       dummy_mask,cutoff_mask;
+     _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+     _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+     union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+     x                = xx[0];
+     f                = ff[0];
+     nri              = nlist->nri;
+     iinr             = nlist->iinr;
+     jindex           = nlist->jindex;
+     jjnr             = nlist->jjnr;
+     shiftidx         = nlist->shift;
+     gid              = nlist->gid;
+     shiftvec         = fr->shift_vec[0];
+     fshift           = fr->fshift[0];
+     facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+     charge           = mdatoms->chargeA;
+     nvdwtype         = fr->ntype;
+     vdwparam         = fr->nbfp;
+     vdwtype          = mdatoms->typeA;
+     vftab            = kernel_data->table_vdw->data;
+     vftabscale       = gmx_fjsp_set1_v2r8(kernel_data->table_vdw->scale);
+     /* Avoid stupid compiler warnings */
+     jnrA = jnrB = 0;
+     j_coord_offsetA = 0;
+     j_coord_offsetB = 0;
+     outeriter        = 0;
+     inneriter        = 0;
+     /* Start outer loop over neighborlists */
+     for(iidx=0; iidx<nri; iidx++)
+     {
+         /* Load shift vector for this list */
+         i_shift_offset   = DIM*shiftidx[iidx];
+         /* Load limits for loop over neighbors */
+         j_index_start    = jindex[iidx];
+         j_index_end      = jindex[iidx+1];
+         /* Get outer coordinate index */
+         inr              = iinr[iidx];
+         i_coord_offset   = DIM*inr;
+         /* Load i particle coords and add shift vector */
+         gmx_fjsp_load_shift_and_1rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
+         fix0             = _fjsp_setzero_v2r8();
+         fiy0             = _fjsp_setzero_v2r8();
+         fiz0             = _fjsp_setzero_v2r8();
+         /* Load parameters for i particles */
+         iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_load1_v2r8(charge+inr+0));
+         vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+         /* Reset potential sums */
+         velecsum         = _fjsp_setzero_v2r8();
+         vvdwsum          = _fjsp_setzero_v2r8();
+         /* Start inner kernel loop */
+         for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+         {
+             /* Get j neighbor index, and coordinate index */
+             jnrA             = jjnr[jidx];
+             jnrB             = jjnr[jidx+1];
+             j_coord_offsetA  = DIM*jnrA;
+             j_coord_offsetB  = DIM*jnrB;
+             /* load j atom coordinates */
+             gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                               &jx0,&jy0,&jz0);
+             /* Calculate displacement vector */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             /* Calculate squared distance and things based on it */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+             rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+             /* Load parameters for j particles */
+             jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+             vdwjidx0A        = 2*vdwtype[jnrA+0];
+             vdwjidx0B        = 2*vdwtype[jnrB+0];
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+             /* Compute parameters for interactions between i and j atoms */
+             qq00             = _fjsp_mul_v2r8(iq0,jq0);
+             gmx_fjsp_load_2pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,
+                                          vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
+             /* Calculate table index by multiplying r with table scale and truncate to integer */
+             rt               = _fjsp_mul_v2r8(r00,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 8;
+             vfconv.i[1]     *= 8;
+             /* COULOMB ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq00,rinv00);
+             felec            = _fjsp_mul_v2r8(velec,rinvsq00);
+             /* CUBIC SPLINE TABLE DISPERSION */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 2 );
+             H                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 2 );
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+             VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+             vvdw6            = _fjsp_mul_v2r8(c6_00,VV);
+             FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+             fvdw6            = _fjsp_mul_v2r8(c6_00,FF);
+             /* CUBIC SPLINE TABLE REPULSION */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 4 );
+             F                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 4 );
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 6 );
+             H                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 6 );
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+             VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+             vvdw12           = _fjsp_mul_v2r8(c12_00,VV);
+             FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+             fvdw12           = _fjsp_mul_v2r8(c12_00,FF);
+             vvdw             = _fjsp_add_v2r8(vvdw12,vvdw6);
+             fvdw             = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_add_v2r8(fvdw6,fvdw12),_fjsp_mul_v2r8(vftabscale,rinv00)));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+             fscal            = _fjsp_add_v2r8(felec,fvdw);
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             
+             gmx_fjsp_decrement_fma_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fscal,dx00,dy00,dz00);
+             /* Inner loop uses 66 flops */
+         }
+         if(jidx<j_index_end)
+         {
+             jnrA             = jjnr[jidx];
+             j_coord_offsetA  = DIM*jnrA;
+             /* load j atom coordinates */
+             gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                               &jx0,&jy0,&jz0);
+             /* Calculate displacement vector */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             /* Calculate squared distance and things based on it */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+             rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+             /* Load parameters for j particles */
+             jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
+             vdwjidx0A        = 2*vdwtype[jnrA+0];
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+             /* Compute parameters for interactions between i and j atoms */
+             qq00             = _fjsp_mul_v2r8(iq0,jq0);
+             gmx_fjsp_load_1pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
+             /* Calculate table index by multiplying r with table scale and truncate to integer */
+             rt               = _fjsp_mul_v2r8(r00,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 8;
+             vfconv.i[1]     *= 8;
+             /* COULOMB ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq00,rinv00);
+             felec            = _fjsp_mul_v2r8(velec,rinvsq00);
+             /* CUBIC SPLINE TABLE DISPERSION */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 2 );
+             H                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+             VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+             vvdw6            = _fjsp_mul_v2r8(c6_00,VV);
+             FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+             fvdw6            = _fjsp_mul_v2r8(c6_00,FF);
+             /* CUBIC SPLINE TABLE REPULSION */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 4 );
+             F                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 6 );
+             H                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+             VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+             vvdw12           = _fjsp_mul_v2r8(c12_00,VV);
+             FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+             fvdw12           = _fjsp_mul_v2r8(c12_00,FF);
+             vvdw             = _fjsp_add_v2r8(vvdw12,vvdw6);
+             fvdw             = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_add_v2r8(fvdw6,fvdw12),_fjsp_mul_v2r8(vftabscale,rinv00)));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             vvdw             = _fjsp_unpacklo_v2r8(vvdw,_fjsp_setzero_v2r8());
+             vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+             fscal            = _fjsp_add_v2r8(felec,fvdw);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             
+             gmx_fjsp_decrement_fma_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fscal,dx00,dy00,dz00);
+             /* Inner loop uses 66 flops */
+         }
+         /* End of innermost loop */
+         gmx_fjsp_update_iforce_1atom_swizzle_v2r8(fix0,fiy0,fiz0,
+                                               f+i_coord_offset,fshift+i_shift_offset);
+         ggid                        = gid[iidx];
+         /* Update potential energies */
+         gmx_fjsp_update_1pot_v2r8(velecsum,kernel_data->energygrp_elec+ggid);
+         gmx_fjsp_update_1pot_v2r8(vvdwsum,kernel_data->energygrp_vdw+ggid);
+         /* Increment number of inner iterations */
+         inneriter                  += j_index_end - j_index_start;
+         /* Outer loop uses 9 flops */
+     }
+     /* Increment number of outer iterations */
+     outeriter        += nri;
+     /* Update outer/inner flops */
+     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_VF,outeriter*9 + inneriter*66);
+ }
+ /*
+  * Gromacs nonbonded kernel:   nb_kernel_ElecCoul_VdwCSTab_GeomP1P1_F_sparc64_hpc_ace_double
+  * Electrostatics interaction: Coulomb
+  * VdW interaction:            CubicSplineTable
+  * Geometry:                   Particle-Particle
+  * Calculate force/pot:        Force
+  *
+  * Force-only kernel: for every i entry of the neighbor list the j entries are
+  * processed two at a time (SIMD lanes A and B), with a separate epilogue for a
+  * trailing odd entry. For each pair the scalar force felec+fvdw is accumulated
+  * along the displacement vector into the i force accumulators and handed to the
+  * j-force decrement helper. No potential sums are accumulated in this kernel.
+  *
+  * nlist        neighbor list; nri, iinr, jindex, jjnr and shift are used
+  *              (the gid pointer is fetched but not dereferenced here)
+  * xx           coordinates, accessed as a flat real array through xx[0]
+  * ff           forces, accessed as a flat real array through ff[0] and
+  *              passed to the force update helpers
+  * fr           supplies shift_vec, fshift, epsfac, ntype and nbfp
+  * mdatoms      supplies chargeA and typeA
+  * kernel_data  supplies table_vdw (data and scale)
+  * nrnb         flop accounting, updated once through inc_nrnb() at the end
+  */
+ void
+ nb_kernel_ElecCoul_VdwCSTab_GeomP1P1_F_sparc64_hpc_ace_double
+                     (t_nblist * gmx_restrict                nlist,
+                      rvec * gmx_restrict                    xx,
+                      rvec * gmx_restrict                    ff,
+                      t_forcerec * gmx_restrict              fr,
+                      t_mdatoms * gmx_restrict               mdatoms,
+                      nb_kernel_data_t * gmx_restrict        kernel_data,
+                      t_nrnb * gmx_restrict                  nrnb)
+ {
+     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+      * just 0 for non-waters.
+      * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+      * jnr indices corresponding to data put in the two positions in the SIMD register.
+      *
+      * NOTE(review): a number of the locals declared below are never referenced in this
+      * force-only kernel (e.g. ggid, rcutoff_scalar, tx/ty/tz, velecsum, vvdwsum, VV,
+      * one_sixth, one_twelfth, one, two, gbconv, ewconv). They look like leftovers of a
+      * shared generator template - confirm against the generator before removing any.
+      */
+     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+     int              jnrA,jnrB;
+     int              j_coord_offsetA,j_coord_offsetB;
+     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+     real             rcutoff_scalar;
+     real             *shiftvec,*fshift,*x,*f;
+     _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+     int              vdwioffset0;
+     _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+     int              vdwjidx0A,vdwjidx0B;
+     _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+     _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+     _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+     real             *charge;
+     int              nvdwtype;
+     _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+     int              *vdwtype;
+     real             *vdwparam;
+     _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+     _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+     _fjsp_v2r8       rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF,twovfeps;
+     real             *vftab;
+     _fjsp_v2r8       itab_tmp;
+     _fjsp_v2r8       dummy_mask,cutoff_mask;
+     _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+     _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+     union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+     x                = xx[0];
+     f                = ff[0];
+     nri              = nlist->nri;
+     iinr             = nlist->iinr;
+     jindex           = nlist->jindex;
+     jjnr             = nlist->jjnr;
+     shiftidx         = nlist->shift;
+     gid              = nlist->gid;
+     shiftvec         = fr->shift_vec[0];
+     fshift           = fr->fshift[0];
+     facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+     charge           = mdatoms->chargeA;
+     nvdwtype         = fr->ntype;
+     vdwparam         = fr->nbfp;
+     vdwtype          = mdatoms->typeA;
+     vftab            = kernel_data->table_vdw->data;
+     vftabscale       = gmx_fjsp_set1_v2r8(kernel_data->table_vdw->scale);
+     /* Avoid compiler warnings about possibly uninitialized variables: every use of
+      * these j indices and offsets below is preceded by an assignment inside the loops.
+      */
+     jnrA = jnrB = 0;
+     j_coord_offsetA = 0;
+     j_coord_offsetB = 0;
+     outeriter        = 0;
+     inneriter        = 0;
+     /* Start outer loop over neighborlists: one iteration per i entry, nri in total */
+     for(iidx=0; iidx<nri; iidx++)
+     {
+         /* Load shift vector for this list (offset into the flat array, DIM reals per shift index) */
+         i_shift_offset   = DIM*shiftidx[iidx];
+         /* Load limits for loop over neighbors: this list's j entries are
+          * jjnr[j_index_start] .. jjnr[j_index_end-1]
+          */
+         j_index_start    = jindex[iidx];
+         j_index_end      = jindex[iidx+1];
+         /* Get outer coordinate index */
+         inr              = iinr[iidx];
+         i_coord_offset   = DIM*inr;
+         /* Load i particle coords and add shift vector (judging by the helper's name the
+          * result is broadcast to both SIMD lanes)
+          */
+         gmx_fjsp_load_shift_and_1rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
+         fix0             = _fjsp_setzero_v2r8();
+         fiy0             = _fjsp_setzero_v2r8();
+         fiz0             = _fjsp_setzero_v2r8();
+         /* Load parameters for i particles: the charge pre-multiplied by facel (fr->epsfac),
+          * and the offset of this atom type's row in vdwparam, from which c6/c12 are read below
+          */
+         iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_load1_v2r8(charge+inr+0));
+         vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+         /* Start inner kernel loop: j entries are consumed in pairs (lanes A and B);
+          * a trailing odd entry is left for the epilogue after the loop
+          */
+         for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+         {
+             /* Get j neighbor index, and coordinate index */
+             jnrA             = jjnr[jidx];
+             jnrB             = jjnr[jidx+1];
+             j_coord_offsetA  = DIM*jnrA;
+             j_coord_offsetB  = DIM*jnrB;
+             /* load j atom coordinates */
+             gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                               &jx0,&jy0,&jz0);
+             /* Calculate displacement vector (i minus j) */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             /* Calculate squared distance and things based on it (1/r and 1/r^2) */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+             rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+             /* Load parameters for j particles: charges and per-type offsets into the vdwparam row */
+             jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+             vdwjidx0A        = 2*vdwtype[jnrA+0];
+             vdwjidx0B        = 2*vdwtype[jnrB+0];
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+             /* Compute parameters for interactions between i and j atoms */
+             qq00             = _fjsp_mul_v2r8(iq0,jq0);
+             gmx_fjsp_load_2pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,
+                                          vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
+             /* Calculate table index by multiplying r with table scale and truncate to integer.
+              * vfeps is the remainder rt minus the converted-back index. The per-lane index is
+              * multiplied by 8 to get the offset into vftab; the loads below read at relative
+              * offsets 0 and 2 for dispersion and 4 and 6 for repulsion.
+              */
+             rt               = _fjsp_mul_v2r8(r00,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 8;
+             vfconv.i[1]     *= 8;
+             /* COULOMB ELECTROSTATICS: velec = qq/r is only an intermediate here, felec = velec/r^2 */
+             velec            = _fjsp_mul_v2r8(qq00,rinv00);
+             felec            = _fjsp_mul_v2r8(velec,rinvsq00);
+             /* CUBIC SPLINE TABLE DISPERSION
+              * Table data is loaded per lane and rearranged with GMX_FJSP_TRANSPOSE2_V2R8
+              * (presumably leaving one coefficient for both lanes in each of Y,F,G,H).
+              * Only the force polynomial FF is evaluated, no potential; assuming
+              * _fjsp_madd_v2r8(a,b,c) computes a*b+c, FF = Fp + vfeps*(G + 2*vfeps*H)
+              * with Fp = F + vfeps*(G + vfeps*H).
+              */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 2 );
+             H                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 2 );
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+             FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+             fvdw6            = _fjsp_mul_v2r8(c6_00,FF);
+             /* CUBIC SPLINE TABLE REPULSION
+              * Same evaluation with the table data at offsets 4 and 6, scaled by c12.
+              * Both contributions are then combined as
+              * fvdw = -(fvdw6+fvdw12)*vftabscale*rinv00 and added to felec.
+              */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 4 );
+             F                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 4 );
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 6 );
+             H                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 6 );
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+             FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+             fvdw12           = _fjsp_mul_v2r8(c12_00,FF);
+             fvdw             = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_add_v2r8(fvdw6,fvdw12),_fjsp_mul_v2r8(vftabscale,rinv00)));
+             fscal            = _fjsp_add_v2r8(felec,fvdw);
+             /* Update vectorial force: accumulate into the i force accumulators, then pass fscal
+              * and the displacement to the helper that updates the two j forces
+              */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             
+             gmx_fjsp_decrement_fma_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fscal,dx00,dy00,dz00);
+             /* Inner loop uses 57 flops */
+         }
+         if(jidx<j_index_end)
+         {
+             jnrA             = jjnr[jidx];
+             j_coord_offsetA  = DIM*jnrA;
+             /* Epilogue for a trailing odd j entry: only the A entry is a real particle and
+              * the single-pointer helper variants are used throughout.
+              * load j atom coordinates
+              */
+             gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                               &jx0,&jy0,&jz0);
+             /* Calculate displacement vector (i minus j) */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             /* Calculate squared distance and things based on it (1/r and 1/r^2) */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+             rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+             /* Load parameters for j particles (the single charge is merged into a zero
+              * register, presumably into the low lane)
+              */
+             jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
+             vdwjidx0A        = 2*vdwtype[jnrA+0];
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+             /* Compute parameters for interactions between i and j atoms */
+             qq00             = _fjsp_mul_v2r8(iq0,jq0);
+             gmx_fjsp_load_1pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
+             /* Calculate table index by multiplying r with table scale and truncate to integer.
+              * vfconv.i[1] is scaled as well but is not used for any table load in this branch.
+              */
+             rt               = _fjsp_mul_v2r8(r00,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 8;
+             vfconv.i[1]     *= 8;
+             /* COULOMB ELECTROSTATICS: velec = qq/r is only an intermediate here, felec = velec/r^2 */
+             velec            = _fjsp_mul_v2r8(qq00,rinv00);
+             felec            = _fjsp_mul_v2r8(velec,rinvsq00);
+             /* CUBIC SPLINE TABLE DISPERSION
+              * Only the table entry for vfconv.i[0] is loaded; F and H start as zero
+              * registers before the transpose. Force polynomial as in the paired loop.
+              */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 2 );
+             H                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+             FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+             fvdw6            = _fjsp_mul_v2r8(c6_00,FF);
+             /* CUBIC SPLINE TABLE REPULSION
+              * Same evaluation with the table data at offsets 4 and 6, scaled by c12.
+              */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 4 );
+             F                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 6 );
+             H                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+             FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+             fvdw12           = _fjsp_mul_v2r8(c12_00,FF);
+             fvdw             = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_add_v2r8(fvdw6,fvdw12),_fjsp_mul_v2r8(vftabscale,rinv00)));
+             fscal            = _fjsp_add_v2r8(felec,fvdw);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force. fscal was just combined with a zero register
+              * (presumably clearing the lane that holds no particle) so that lane adds
+              * nothing to the i force accumulators.
+              */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             
+             gmx_fjsp_decrement_fma_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fscal,dx00,dy00,dz00);
+             /* Inner loop uses 57 flops */
+         }
+         /* End of innermost loop: hand the i force accumulators to the update helper together
+          * with the addresses of this i particle's force and this list's shift force
+          */
+         gmx_fjsp_update_iforce_1atom_swizzle_v2r8(fix0,fiy0,fiz0,
+                                               f+i_coord_offset,fshift+i_shift_offset);
+         /* Increment number of inner iterations */
+         inneriter                  += j_index_end - j_index_start;
+         /* Outer loop uses 7 flops */
+     }
+     /* Increment number of outer iterations */
+     outeriter        += nri;
+     /* Update outer/inner flops: 7 per outer and 57 per inner iteration, booked under
+      * eNR_NBKERNEL_ELEC_VDW_F
+      */
+     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_F,outeriter*7 + inneriter*57);
+ }
index 0000000000000000000000000000000000000000,c48a7555b5c42d7d894991c743f3e7fb1592ccc0..c48a7555b5c42d7d894991c743f3e7fb1592ccc0
mode 000000,100644..100644
--- /dev/null
@@@ -1,0 -1,989 +1,989 @@@
+ /*
+  * This file is part of the GROMACS molecular simulation package.
+  *
+  * Copyright (c) 2012, by the GROMACS development team, led by
+  * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+  * others, as listed in the AUTHORS file in the top-level source
+  * directory and at http://www.gromacs.org.
+  *
+  * GROMACS is free software; you can redistribute it and/or
+  * modify it under the terms of the GNU Lesser General Public License
+  * as published by the Free Software Foundation; either version 2.1
+  * of the License, or (at your option) any later version.
+  *
+  * GROMACS is distributed in the hope that it will be useful,
+  * but WITHOUT ANY WARRANTY; without even the implied warranty of
+  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  * Lesser General Public License for more details.
+  *
+  * You should have received a copy of the GNU Lesser General Public
+  * License along with GROMACS; if not, see
+  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+  *
+  * If you want to redistribute modifications to GROMACS, please
+  * consider that scientific software is very special. Version
+  * control is crucial - bugs must be traceable. We will be happy to
+  * consider code for inclusion in the official distribution, but
+  * derived work must not be called official GROMACS. Details are found
+  * in the README & COPYING files - if they are missing, get the
+  * official version at http://www.gromacs.org.
+  *
+  * To help us fund GROMACS development, we humbly ask that you cite
+  * the research papers on the package. Check out http://www.gromacs.org.
+  */
+ /*
+  * Note: this file was generated by the GROMACS sparc64_hpc_ace_double kernel generator.
+  */
+ #ifdef HAVE_CONFIG_H
+ #include <config.h>
+ #endif
+ #include <math.h>
+ #include "../nb_kernel.h"
+ #include "types/simple.h"
+ #include "vec.h"
+ #include "nrnb.h"
+ #include "kernelutil_sparc64_hpc_ace_double.h"
+ /*
+  * Gromacs nonbonded kernel:   nb_kernel_ElecCoul_VdwCSTab_GeomW3P1_VF_sparc64_hpc_ace_double
+  * Electrostatics interaction: Coulomb
+  * VdW interaction:            CubicSplineTable
+  * Geometry:                   Water3-Particle
+  * Calculate force/pot:        PotentialAndForce
+  */
+ void
+ nb_kernel_ElecCoul_VdwCSTab_GeomW3P1_VF_sparc64_hpc_ace_double
+                     (t_nblist * gmx_restrict                nlist,
+                      rvec * gmx_restrict                    xx,
+                      rvec * gmx_restrict                    ff,
+                      t_forcerec * gmx_restrict              fr,
+                      t_mdatoms * gmx_restrict               mdatoms,
+                      nb_kernel_data_t * gmx_restrict        kernel_data,
+                      t_nrnb * gmx_restrict                  nrnb)
+ {
+     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+      * just 0 for non-waters.
+      * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+      * jnr indices corresponding to data put in the two positions in the SIMD register.
+      */
+     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+     int              jnrA,jnrB;
+     int              j_coord_offsetA,j_coord_offsetB;
+     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+     real             rcutoff_scalar;
+     real             *shiftvec,*fshift,*x,*f;
+     _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+     int              vdwioffset0;
+     _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+     int              vdwioffset1;
+     _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+     int              vdwioffset2;
+     _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+     int              vdwjidx0A,vdwjidx0B;
+     _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+     _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+     _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+     _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+     _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+     real             *charge;
+     int              nvdwtype;
+     _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+     int              *vdwtype;
+     real             *vdwparam;
+     _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+     _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+     _fjsp_v2r8       rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF,twovfeps;
+     real             *vftab;
+     _fjsp_v2r8       itab_tmp;
+     _fjsp_v2r8       dummy_mask,cutoff_mask;
+     _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+     _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+     union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+     x                = xx[0];
+     f                = ff[0];
+     nri              = nlist->nri;
+     iinr             = nlist->iinr;
+     jindex           = nlist->jindex;
+     jjnr             = nlist->jjnr;
+     shiftidx         = nlist->shift;
+     gid              = nlist->gid;
+     shiftvec         = fr->shift_vec[0];
+     fshift           = fr->fshift[0];
+     facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+     charge           = mdatoms->chargeA;
+     nvdwtype         = fr->ntype;
+     vdwparam         = fr->nbfp;
+     vdwtype          = mdatoms->typeA;
+     vftab            = kernel_data->table_vdw->data;
+     vftabscale       = gmx_fjsp_set1_v2r8(kernel_data->table_vdw->scale);
+     /* Setup water-specific parameters */
+     inr              = nlist->iinr[0];
+     iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+0]));
+     iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+     iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+     vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+     /* Avoid stupid compiler warnings */
+     jnrA = jnrB = 0;
+     j_coord_offsetA = 0;
+     j_coord_offsetB = 0;
+     outeriter        = 0;
+     inneriter        = 0;
+     /* Start outer loop over neighborlists */
+     for(iidx=0; iidx<nri; iidx++)
+     {
+         /* Load shift vector for this list */
+         i_shift_offset   = DIM*shiftidx[iidx];
+         /* Load limits for loop over neighbors */
+         j_index_start    = jindex[iidx];
+         j_index_end      = jindex[iidx+1];
+         /* Get outer coordinate index */
+         inr              = iinr[iidx];
+         i_coord_offset   = DIM*inr;
+         /* Load i particle coords and add shift vector */
+         gmx_fjsp_load_shift_and_3rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                  &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+         fix0             = _fjsp_setzero_v2r8();
+         fiy0             = _fjsp_setzero_v2r8();
+         fiz0             = _fjsp_setzero_v2r8();
+         fix1             = _fjsp_setzero_v2r8();
+         fiy1             = _fjsp_setzero_v2r8();
+         fiz1             = _fjsp_setzero_v2r8();
+         fix2             = _fjsp_setzero_v2r8();
+         fiy2             = _fjsp_setzero_v2r8();
+         fiz2             = _fjsp_setzero_v2r8();
+         /* Reset potential sums */
+         velecsum         = _fjsp_setzero_v2r8();
+         vvdwsum          = _fjsp_setzero_v2r8();
+         /* Start inner kernel loop */
+         for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+         {
+             /* Get j neighbor index, and coordinate index */
+             jnrA             = jjnr[jidx];
+             jnrB             = jjnr[jidx+1];
+             j_coord_offsetA  = DIM*jnrA;
+             j_coord_offsetB  = DIM*jnrB;
+             /* load j atom coordinates */
+             gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                               &jx0,&jy0,&jz0);
+             /* Calculate displacement vector */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             dx10             = _fjsp_sub_v2r8(ix1,jx0);
+             dy10             = _fjsp_sub_v2r8(iy1,jy0);
+             dz10             = _fjsp_sub_v2r8(iz1,jz0);
+             dx20             = _fjsp_sub_v2r8(ix2,jx0);
+             dy20             = _fjsp_sub_v2r8(iy2,jy0);
+             dz20             = _fjsp_sub_v2r8(iz2,jz0);
+             /* Calculate squared distance and things based on it */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+             rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+             rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+             rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+             rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+             rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+             rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+             rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+             /* Load parameters for j particles */
+             jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+             vdwjidx0A        = 2*vdwtype[jnrA+0];
+             vdwjidx0B        = 2*vdwtype[jnrB+0];
+             fjx0             = _fjsp_setzero_v2r8();
+             fjy0             = _fjsp_setzero_v2r8();
+             fjz0             = _fjsp_setzero_v2r8();
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+             /* Compute parameters for interactions between i and j atoms */
+             qq00             = _fjsp_mul_v2r8(iq0,jq0);
+             gmx_fjsp_load_2pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,
+                                          vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
+             /* Calculate table index by multiplying r with table scale and truncate to integer */
+             rt               = _fjsp_mul_v2r8(r00,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 8;
+             vfconv.i[1]     *= 8;
+             /* COULOMB ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq00,rinv00);
+             felec            = _fjsp_mul_v2r8(velec,rinvsq00);
+             /* CUBIC SPLINE TABLE DISPERSION */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 2 );
+             H                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 2 );
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+             VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+             vvdw6            = _fjsp_mul_v2r8(c6_00,VV);
+             FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+             fvdw6            = _fjsp_mul_v2r8(c6_00,FF);
+             /* CUBIC SPLINE TABLE REPULSION */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 4 );
+             F                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 4 );
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 6 );
+             H                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 6 );
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+             VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+             vvdw12           = _fjsp_mul_v2r8(c12_00,VV);
+             FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+             fvdw12           = _fjsp_mul_v2r8(c12_00,FF);
+             vvdw             = _fjsp_add_v2r8(vvdw12,vvdw6);
+             fvdw             = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_add_v2r8(fvdw6,fvdw12),_fjsp_mul_v2r8(vftabscale,rinv00)));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+             fscal            = _fjsp_add_v2r8(felec,fvdw);
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             
+             fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* Compute parameters for interactions between i and j atoms */
+             qq10             = _fjsp_mul_v2r8(iq1,jq0);
+             /* COULOMB ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq10,rinv10);
+             felec            = _fjsp_mul_v2r8(velec,rinvsq10);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+             
+             fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* Compute parameters for interactions between i and j atoms */
+             qq20             = _fjsp_mul_v2r8(iq2,jq0);
+             /* COULOMB ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq20,rinv20);
+             felec            = _fjsp_mul_v2r8(velec,rinvsq20);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+             
+             fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+             gmx_fjsp_decrement_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0);
+             /* Inner loop uses 131 flops */
+         }
+         if(jidx<j_index_end)
+         {
+             jnrA             = jjnr[jidx];
+             j_coord_offsetA  = DIM*jnrA;
+             /* load j atom coordinates */
+             gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                               &jx0,&jy0,&jz0);
+             /* Calculate displacement vector */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             dx10             = _fjsp_sub_v2r8(ix1,jx0);
+             dy10             = _fjsp_sub_v2r8(iy1,jy0);
+             dz10             = _fjsp_sub_v2r8(iz1,jz0);
+             dx20             = _fjsp_sub_v2r8(ix2,jx0);
+             dy20             = _fjsp_sub_v2r8(iy2,jy0);
+             dz20             = _fjsp_sub_v2r8(iz2,jz0);
+             /* Calculate squared distance and things based on it */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+             rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+             rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+             rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+             rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+             rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+             rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+             rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+             /* Load parameters for j particles */
+             jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
+             vdwjidx0A        = 2*vdwtype[jnrA+0];
+             fjx0             = _fjsp_setzero_v2r8();
+             fjy0             = _fjsp_setzero_v2r8();
+             fjz0             = _fjsp_setzero_v2r8();
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+             /* Compute parameters for interactions between i and j atoms */
+             qq00             = _fjsp_mul_v2r8(iq0,jq0);
+             gmx_fjsp_load_1pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
+             /* Calculate table index by multiplying r with table scale and truncate to integer */
+             rt               = _fjsp_mul_v2r8(r00,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 8;
+             vfconv.i[1]     *= 8;
+             /* COULOMB ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq00,rinv00);
+             felec            = _fjsp_mul_v2r8(velec,rinvsq00);
+             /* CUBIC SPLINE TABLE DISPERSION */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 2 );
+             H                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+             VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+             vvdw6            = _fjsp_mul_v2r8(c6_00,VV);
+             FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+             fvdw6            = _fjsp_mul_v2r8(c6_00,FF);
+             /* CUBIC SPLINE TABLE REPULSION */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 4 );
+             F                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 6 );
+             H                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+             VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+             vvdw12           = _fjsp_mul_v2r8(c12_00,VV);
+             FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+             fvdw12           = _fjsp_mul_v2r8(c12_00,FF);
+             vvdw             = _fjsp_add_v2r8(vvdw12,vvdw6);
+             fvdw             = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_add_v2r8(fvdw6,fvdw12),_fjsp_mul_v2r8(vftabscale,rinv00)));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             vvdw             = _fjsp_unpacklo_v2r8(vvdw,_fjsp_setzero_v2r8());
+             vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+             fscal            = _fjsp_add_v2r8(felec,fvdw);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             
+             fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* Compute parameters for interactions between i and j atoms */
+             qq10             = _fjsp_mul_v2r8(iq1,jq0);
+             /* COULOMB ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq10,rinv10);
+             felec            = _fjsp_mul_v2r8(velec,rinvsq10);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+             
+             fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* Compute parameters for interactions between i and j atoms */
+             qq20             = _fjsp_mul_v2r8(iq2,jq0);
+             /* COULOMB ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq20,rinv20);
+             felec            = _fjsp_mul_v2r8(velec,rinvsq20);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+             
+             fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+             gmx_fjsp_decrement_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0);
+             /* Inner loop uses 131 flops */
+         }
+         /* End of innermost loop */
+         gmx_fjsp_update_iforce_3atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                               f+i_coord_offset,fshift+i_shift_offset);
+         ggid                        = gid[iidx];
+         /* Update potential energies */
+         gmx_fjsp_update_1pot_v2r8(velecsum,kernel_data->energygrp_elec+ggid);
+         gmx_fjsp_update_1pot_v2r8(vvdwsum,kernel_data->energygrp_vdw+ggid);
+         /* Increment number of inner iterations */
+         inneriter                  += j_index_end - j_index_start;
+         /* Outer loop uses 20 flops */
+     }
+     /* Increment number of outer iterations */
+     outeriter        += nri;
+     /* Update outer/inner flops */
+     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3_VF,outeriter*20 + inneriter*131);
+ }
+ /*
+  * Gromacs nonbonded kernel:   nb_kernel_ElecCoul_VdwCSTab_GeomW3P1_F_sparc64_hpc_ace_double
+  * Electrostatics interaction: Coulomb
+  * VdW interaction:            CubicSplineTable
+  * Geometry:                   Water3-Particle
+  * Calculate force/pot:        Force
+  *
+  * Force-only variant: no potential energy sums are accumulated or stored.
+  * For every neighborlist entry the three i atoms (suffixes 0,1,2) interact
+  * with each j atom of that entry. Only the 0-0 pair gets a tabulated
+  * Van der Waals contribution on top of Coulomb; the 1-0 and 2-0 pairs are
+  * Coulomb only.
+  *
+  * nlist        neighborlist; nri, iinr, jindex, jjnr and shift are used,
+  *              gid is fetched but not used in this variant
+  * xx           coordinates, accessed as a flat real array through xx[0]
+  * ff           forces, accessed as a flat real array through ff[0] and
+  *              handed to the decrement/update helpers
+  * fr           source of shift_vec, fshift, epsfac, ntype and nbfp
+  * mdatoms      source of chargeA and typeA
+  * kernel_data  source of table_vdw (data and scale)
+  * nrnb         flop accounting, updated once through inc_nrnb() at the end
+  */
+ void
+ nb_kernel_ElecCoul_VdwCSTab_GeomW3P1_F_sparc64_hpc_ace_double
+                     (t_nblist * gmx_restrict                nlist,
+                      rvec * gmx_restrict                    xx,
+                      rvec * gmx_restrict                    ff,
+                      t_forcerec * gmx_restrict              fr,
+                      t_mdatoms * gmx_restrict               mdatoms,
+                      nb_kernel_data_t * gmx_restrict        kernel_data,
+                      t_nrnb * gmx_restrict                  nrnb)
+ {
+     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+      * just 0 for non-waters.
+      * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+      * jnr indices corresponding to data put in the two positions in the SIMD register.
+      * Several of the locals declared below are never referenced in this particular kernel.
+      */
+     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+     int              jnrA,jnrB;
+     int              j_coord_offsetA,j_coord_offsetB;
+     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+     real             rcutoff_scalar;
+     real             *shiftvec,*fshift,*x,*f;
+     _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+     int              vdwioffset0;
+     _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+     int              vdwioffset1;
+     _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+     int              vdwioffset2;
+     _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+     int              vdwjidx0A,vdwjidx0B;
+     _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+     _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+     _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+     _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+     _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+     real             *charge;
+     int              nvdwtype;
+     _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+     int              *vdwtype;
+     real             *vdwparam;
+     _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+     _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+     _fjsp_v2r8       rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF,twovfeps;
+     real             *vftab;
+     _fjsp_v2r8       itab_tmp;
+     _fjsp_v2r8       dummy_mask,cutoff_mask;
+     _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+     _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+     union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv; /* vfconv exposes the stored table indices as integers; gbconv and ewconv are not used here */
+     x                = xx[0];
+     f                = ff[0];
+     nri              = nlist->nri;
+     iinr             = nlist->iinr;
+     jindex           = nlist->jindex;
+     jjnr             = nlist->jjnr;
+     shiftidx         = nlist->shift;
+     gid              = nlist->gid;
+     shiftvec         = fr->shift_vec[0];
+     fshift           = fr->fshift[0];
+     facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+     charge           = mdatoms->chargeA;
+     nvdwtype         = fr->ntype;
+     vdwparam         = fr->nbfp;
+     vdwtype          = mdatoms->typeA;
+     vftab            = kernel_data->table_vdw->data;
+     vftabscale       = gmx_fjsp_set1_v2r8(kernel_data->table_vdw->scale);
+     /* Setup water-specific parameters.
+      * The i charges (pre-multiplied by facel) and the VdW parameter offset are
+      * taken from the first list entry only (iinr[0]) and reused unchanged for
+      * every outer iteration.
+      * NOTE(review): iinr[0] is read before the nri loop bound is checked;
+      * presumably callers never invoke the kernel with an empty list - confirm.
+      */
+     inr              = nlist->iinr[0];
+     iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+0]));
+     iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+     iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+     vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+     /* Avoid stupid compiler warnings */
+     jnrA = jnrB = 0;
+     j_coord_offsetA = 0;
+     j_coord_offsetB = 0;
+     outeriter        = 0;
+     inneriter        = 0;
+     /* Start outer loop over neighborlists */
+     for(iidx=0; iidx<nri; iidx++)
+     {
+         /* Load shift vector for this list */
+         i_shift_offset   = DIM*shiftidx[iidx];
+         /* Load limits for loop over neighbors */
+         j_index_start    = jindex[iidx];
+         j_index_end      = jindex[iidx+1];
+         /* Get outer coordinate index */
+         inr              = iinr[iidx];
+         i_coord_offset   = DIM*inr;
+         /* Load i particle coords and add shift vector */
+         gmx_fjsp_load_shift_and_3rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                  &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+         fix0             = _fjsp_setzero_v2r8();
+         fiy0             = _fjsp_setzero_v2r8();
+         fiz0             = _fjsp_setzero_v2r8();
+         fix1             = _fjsp_setzero_v2r8();
+         fiy1             = _fjsp_setzero_v2r8();
+         fiz1             = _fjsp_setzero_v2r8();
+         fix2             = _fjsp_setzero_v2r8();
+         fiy2             = _fjsp_setzero_v2r8();
+         fiz2             = _fjsp_setzero_v2r8();
+         /* Start inner kernel loop.
+          * Two j atoms (A and B) are handled per iteration; if the number of
+          * j atoms is odd, the last one is handled by the block after this loop.
+          */
+         for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+         {
+             /* Get j neighbor index, and coordinate index */
+             jnrA             = jjnr[jidx];
+             jnrB             = jjnr[jidx+1];
+             j_coord_offsetA  = DIM*jnrA;
+             j_coord_offsetB  = DIM*jnrB;
+             /* load j atom coordinates */
+             gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                               &jx0,&jy0,&jz0);
+             /* Calculate displacement vector */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             dx10             = _fjsp_sub_v2r8(ix1,jx0);
+             dy10             = _fjsp_sub_v2r8(iy1,jy0);
+             dz10             = _fjsp_sub_v2r8(iz1,jz0);
+             dx20             = _fjsp_sub_v2r8(ix2,jx0);
+             dy20             = _fjsp_sub_v2r8(iy2,jy0);
+             dz20             = _fjsp_sub_v2r8(iz2,jz0);
+             /* Calculate squared distance and things based on it */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+             rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+             rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+             rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+             rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+             rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+             rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+             rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+             /* Load parameters for j particles */
+             jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+             vdwjidx0A        = 2*vdwtype[jnrA+0];
+             vdwjidx0B        = 2*vdwtype[jnrB+0];
+             fjx0             = _fjsp_setzero_v2r8();
+             fjy0             = _fjsp_setzero_v2r8();
+             fjz0             = _fjsp_setzero_v2r8();
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+             /* Compute parameters for interactions between i and j atoms */
+             qq00             = _fjsp_mul_v2r8(iq0,jq0);
+             gmx_fjsp_load_2pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,
+                                          vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
+             /* Calculate table index by multiplying r with table scale and truncate to integer.
+              * The index is then scaled by 8 because each table point is read as 8 reals:
+              * Y,F,G,H for dispersion (offsets 0..3) followed by Y,F,G,H for repulsion (offsets 4..7).
+              */
+             rt               = _fjsp_mul_v2r8(r00,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 8;
+             vfconv.i[1]     *= 8;
+             /* COULOMB ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq00,rinv00);
+             felec            = _fjsp_mul_v2r8(velec,rinvsq00);
+             /* CUBIC SPLINE TABLE DISPERSION */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 2 );
+             H                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 2 );
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+             FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+             fvdw6            = _fjsp_mul_v2r8(c6_00,FF);
+             /* CUBIC SPLINE TABLE REPULSION */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 4 );
+             F                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 4 );
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 6 );
+             H                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 6 );
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+             FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+             fvdw12           = _fjsp_mul_v2r8(c12_00,FF);
+             fvdw             = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_add_v2r8(fvdw6,fvdw12),_fjsp_mul_v2r8(vftabscale,rinv00)));
+             fscal            = _fjsp_add_v2r8(felec,fvdw);
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             
+             fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* Compute parameters for interactions between i and j atoms */
+             qq10             = _fjsp_mul_v2r8(iq1,jq0);
+             /* COULOMB ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq10,rinv10);
+             felec            = _fjsp_mul_v2r8(velec,rinvsq10);
+             fscal            = felec;
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+             
+             fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* Compute parameters for interactions between i and j atoms */
+             qq20             = _fjsp_mul_v2r8(iq2,jq0);
+             /* COULOMB ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq20,rinv20);
+             felec            = _fjsp_mul_v2r8(velec,rinvsq20);
+             fscal            = felec;
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+             
+             fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+             gmx_fjsp_decrement_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0);
+             /* Inner loop uses 120 flops */
+         }
+         if(jidx<j_index_end) /* odd number of j atoms: handle the last one (A only) on its own */
+         {
+             jnrA             = jjnr[jidx];
+             j_coord_offsetA  = DIM*jnrA;
+             /* load j atom coordinates */
+             gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                               &jx0,&jy0,&jz0);
+             /* Calculate displacement vector */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             dx10             = _fjsp_sub_v2r8(ix1,jx0);
+             dy10             = _fjsp_sub_v2r8(iy1,jy0);
+             dz10             = _fjsp_sub_v2r8(iz1,jz0);
+             dx20             = _fjsp_sub_v2r8(ix2,jx0);
+             dy20             = _fjsp_sub_v2r8(iy2,jy0);
+             dz20             = _fjsp_sub_v2r8(iz2,jz0);
+             /* Calculate squared distance and things based on it */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+             rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+             rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+             rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+             rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+             rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+             rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+             rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+             /* Load parameters for j particles */
+             jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
+             vdwjidx0A        = 2*vdwtype[jnrA+0];
+             fjx0             = _fjsp_setzero_v2r8();
+             fjy0             = _fjsp_setzero_v2r8();
+             fjz0             = _fjsp_setzero_v2r8();
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+             /* Compute parameters for interactions between i and j atoms */
+             qq00             = _fjsp_mul_v2r8(iq0,jq0);
+             gmx_fjsp_load_1pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
+             /* Calculate table index by multiplying r with table scale and truncate to integer.
+              * Only vfconv.i[0] is used for table loads in this single-atom block.
+              */
+             rt               = _fjsp_mul_v2r8(r00,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 8;
+             vfconv.i[1]     *= 8;
+             /* COULOMB ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq00,rinv00);
+             felec            = _fjsp_mul_v2r8(velec,rinvsq00);
+             /* CUBIC SPLINE TABLE DISPERSION (the B-side inputs F and H are zero instead of table loads) */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 2 );
+             H                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+             FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+             fvdw6            = _fjsp_mul_v2r8(c6_00,FF);
+             /* CUBIC SPLINE TABLE REPULSION (the B-side inputs F and H are zero instead of table loads) */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 4 );
+             F                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 6 );
+             H                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+             FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+             fvdw12           = _fjsp_mul_v2r8(c12_00,FF);
+             fvdw             = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_add_v2r8(fvdw6,fvdw12),_fjsp_mul_v2r8(vftabscale,rinv00)));
+             fscal            = _fjsp_add_v2r8(felec,fvdw);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             
+             fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* Compute parameters for interactions between i and j atoms */
+             qq10             = _fjsp_mul_v2r8(iq1,jq0);
+             /* COULOMB ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq10,rinv10);
+             felec            = _fjsp_mul_v2r8(velec,rinvsq10);
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+             
+             fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* Compute parameters for interactions between i and j atoms */
+             qq20             = _fjsp_mul_v2r8(iq2,jq0);
+             /* COULOMB ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq20,rinv20);
+             felec            = _fjsp_mul_v2r8(velec,rinvsq20);
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+             
+             fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+             gmx_fjsp_decrement_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0);
+             /* Inner loop uses 120 flops */
+         }
+         /* End of innermost loop */
+         gmx_fjsp_update_iforce_3atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                               f+i_coord_offset,fshift+i_shift_offset);
+         /* Increment number of inner iterations */
+         inneriter                  += j_index_end - j_index_start;
+         /* Outer loop uses 18 flops */
+     }
+     /* Increment number of outer iterations */
+     outeriter        += nri;
+     /* Update outer/inner flops (18 per outer iteration, 120 per j atom, as noted above) */
+     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3_F,outeriter*18 + inneriter*120);
+ }
index 0000000000000000000000000000000000000000,3786c761a30ccc27f998666cc57012bfe5e5696e..3786c761a30ccc27f998666cc57012bfe5e5696e
mode 000000,100644..100644
--- /dev/null
@@@ -1,0 -1,1671 +1,1671 @@@
+ /*
+  * This file is part of the GROMACS molecular simulation package.
+  *
+  * Copyright (c) 2012, by the GROMACS development team, led by
+  * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+  * others, as listed in the AUTHORS file in the top-level source
+  * directory and at http://www.gromacs.org.
+  *
+  * GROMACS is free software; you can redistribute it and/or
+  * modify it under the terms of the GNU Lesser General Public License
+  * as published by the Free Software Foundation; either version 2.1
+  * of the License, or (at your option) any later version.
+  *
+  * GROMACS is distributed in the hope that it will be useful,
+  * but WITHOUT ANY WARRANTY; without even the implied warranty of
+  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  * Lesser General Public License for more details.
+  *
+  * You should have received a copy of the GNU Lesser General Public
+  * License along with GROMACS; if not, see
+  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+  *
+  * If you want to redistribute modifications to GROMACS, please
+  * consider that scientific software is very special. Version
+  * control is crucial - bugs must be traceable. We will be happy to
+  * consider code for inclusion in the official distribution, but
+  * derived work must not be called official GROMACS. Details are found
+  * in the README & COPYING files - if they are missing, get the
+  * official version at http://www.gromacs.org.
+  *
+  * To help us fund GROMACS development, we humbly ask that you cite
+  * the research papers on the package. Check out http://www.gromacs.org.
+  */
+ /*
+  * Note: this file was generated by the GROMACS sparc64_hpc_ace_double kernel generator.
+  */
+ #ifdef HAVE_CONFIG_H
+ #include <config.h>
+ #endif
+ #include <math.h>
+ #include "../nb_kernel.h"
+ #include "types/simple.h"
+ #include "vec.h"
+ #include "nrnb.h"
+ #include "kernelutil_sparc64_hpc_ace_double.h"
+ /*
+  * Gromacs nonbonded kernel:   nb_kernel_ElecCoul_VdwCSTab_GeomW3W3_VF_sparc64_hpc_ace_double
+  * Electrostatics interaction: Coulomb
+  * VdW interaction:            CubicSplineTable
+  * Geometry:                   Water3-Water3
+  * Calculate force/pot:        PotentialAndForce
+  */
+ void
+ nb_kernel_ElecCoul_VdwCSTab_GeomW3W3_VF_sparc64_hpc_ace_double
+                     (t_nblist * gmx_restrict                nlist,
+                      rvec * gmx_restrict                    xx,
+                      rvec * gmx_restrict                    ff,
+                      t_forcerec * gmx_restrict              fr,
+                      t_mdatoms * gmx_restrict               mdatoms,
+                      nb_kernel_data_t * gmx_restrict        kernel_data,
+                      t_nrnb * gmx_restrict                  nrnb)
+ {
+     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+      * just 0 for non-waters.
+      * Suffixes A,B refer to j loop unrolling done with double precision SIMD, i.e. the two different
+      * jnr indices corresponding to data put in the two positions in the SIMD register.
+      */
+     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+     int              jnrA,jnrB;
+     int              j_coord_offsetA,j_coord_offsetB;
+     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+     real             rcutoff_scalar;
+     real             *shiftvec,*fshift,*x,*f;
+     _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+     int              vdwioffset0;
+     _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+     int              vdwioffset1;
+     _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+     int              vdwioffset2;
+     _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+     int              vdwjidx0A,vdwjidx0B;
+     _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+     int              vdwjidx1A,vdwjidx1B;
+     _fjsp_v2r8       jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+     int              vdwjidx2A,vdwjidx2B;
+     _fjsp_v2r8       jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+     _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+     _fjsp_v2r8       dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
+     _fjsp_v2r8       dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
+     _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+     _fjsp_v2r8       dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+     _fjsp_v2r8       dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+     _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+     _fjsp_v2r8       dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+     _fjsp_v2r8       dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+     _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+     real             *charge;
+     int              nvdwtype;
+     _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+     int              *vdwtype;
+     real             *vdwparam;
+     _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+     _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+     _fjsp_v2r8       rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF,twovfeps;
+     real             *vftab;
+     _fjsp_v2r8       itab_tmp;
+     _fjsp_v2r8       dummy_mask,cutoff_mask;
+     _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+     _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+     union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+     x                = xx[0];
+     f                = ff[0];
+     nri              = nlist->nri;
+     iinr             = nlist->iinr;
+     jindex           = nlist->jindex;
+     jjnr             = nlist->jjnr;
+     shiftidx         = nlist->shift;
+     gid              = nlist->gid;
+     shiftvec         = fr->shift_vec[0];
+     fshift           = fr->fshift[0];
+     facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+     charge           = mdatoms->chargeA;
+     nvdwtype         = fr->ntype;
+     vdwparam         = fr->nbfp;
+     vdwtype          = mdatoms->typeA;
+     vftab            = kernel_data->table_vdw->data;
+     vftabscale       = gmx_fjsp_set1_v2r8(kernel_data->table_vdw->scale);
+     /* Setup water-specific parameters */
+     inr              = nlist->iinr[0];
+     iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+0]));
+     iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+     iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+     vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+     jq0              = gmx_fjsp_set1_v2r8(charge[inr+0]);
+     jq1              = gmx_fjsp_set1_v2r8(charge[inr+1]);
+     jq2              = gmx_fjsp_set1_v2r8(charge[inr+2]);
+     vdwjidx0A        = 2*vdwtype[inr+0];
+     qq00             = _fjsp_mul_v2r8(iq0,jq0);
+     c6_00            = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset0+vdwjidx0A]);
+     c12_00           = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset0+vdwjidx0A+1]);
+     qq01             = _fjsp_mul_v2r8(iq0,jq1);
+     qq02             = _fjsp_mul_v2r8(iq0,jq2);
+     qq10             = _fjsp_mul_v2r8(iq1,jq0);
+     qq11             = _fjsp_mul_v2r8(iq1,jq1);
+     qq12             = _fjsp_mul_v2r8(iq1,jq2);
+     qq20             = _fjsp_mul_v2r8(iq2,jq0);
+     qq21             = _fjsp_mul_v2r8(iq2,jq1);
+     qq22             = _fjsp_mul_v2r8(iq2,jq2);
+     /* Avoid stupid compiler warnings */
+     jnrA = jnrB = 0;
+     j_coord_offsetA = 0;
+     j_coord_offsetB = 0;
+     outeriter        = 0;
+     inneriter        = 0;
+     /* Start outer loop over neighborlists */
+     for(iidx=0; iidx<nri; iidx++)
+     {
+         /* Load shift vector for this list */
+         i_shift_offset   = DIM*shiftidx[iidx];
+         /* Load limits for loop over neighbors */
+         j_index_start    = jindex[iidx];
+         j_index_end      = jindex[iidx+1];
+         /* Get outer coordinate index */
+         inr              = iinr[iidx];
+         i_coord_offset   = DIM*inr;
+         /* Load i particle coords and add shift vector */
+         gmx_fjsp_load_shift_and_3rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                  &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+         fix0             = _fjsp_setzero_v2r8();
+         fiy0             = _fjsp_setzero_v2r8();
+         fiz0             = _fjsp_setzero_v2r8();
+         fix1             = _fjsp_setzero_v2r8();
+         fiy1             = _fjsp_setzero_v2r8();
+         fiz1             = _fjsp_setzero_v2r8();
+         fix2             = _fjsp_setzero_v2r8();
+         fiy2             = _fjsp_setzero_v2r8();
+         fiz2             = _fjsp_setzero_v2r8();
+         /* Reset potential sums */
+         velecsum         = _fjsp_setzero_v2r8();
+         vvdwsum          = _fjsp_setzero_v2r8();
+         /* Start inner kernel loop */
+         for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+         {
+             /* Get j neighbor index, and coordinate index */
+             jnrA             = jjnr[jidx];
+             jnrB             = jjnr[jidx+1];
+             j_coord_offsetA  = DIM*jnrA;
+             j_coord_offsetB  = DIM*jnrB;
+             /* load j atom coordinates */
+             gmx_fjsp_load_3rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+             /* Calculate displacement vector */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             dx01             = _fjsp_sub_v2r8(ix0,jx1);
+             dy01             = _fjsp_sub_v2r8(iy0,jy1);
+             dz01             = _fjsp_sub_v2r8(iz0,jz1);
+             dx02             = _fjsp_sub_v2r8(ix0,jx2);
+             dy02             = _fjsp_sub_v2r8(iy0,jy2);
+             dz02             = _fjsp_sub_v2r8(iz0,jz2);
+             dx10             = _fjsp_sub_v2r8(ix1,jx0);
+             dy10             = _fjsp_sub_v2r8(iy1,jy0);
+             dz10             = _fjsp_sub_v2r8(iz1,jz0);
+             dx11             = _fjsp_sub_v2r8(ix1,jx1);
+             dy11             = _fjsp_sub_v2r8(iy1,jy1);
+             dz11             = _fjsp_sub_v2r8(iz1,jz1);
+             dx12             = _fjsp_sub_v2r8(ix1,jx2);
+             dy12             = _fjsp_sub_v2r8(iy1,jy2);
+             dz12             = _fjsp_sub_v2r8(iz1,jz2);
+             dx20             = _fjsp_sub_v2r8(ix2,jx0);
+             dy20             = _fjsp_sub_v2r8(iy2,jy0);
+             dz20             = _fjsp_sub_v2r8(iz2,jz0);
+             dx21             = _fjsp_sub_v2r8(ix2,jx1);
+             dy21             = _fjsp_sub_v2r8(iy2,jy1);
+             dz21             = _fjsp_sub_v2r8(iz2,jz1);
+             dx22             = _fjsp_sub_v2r8(ix2,jx2);
+             dy22             = _fjsp_sub_v2r8(iy2,jy2);
+             dz22             = _fjsp_sub_v2r8(iz2,jz2);
+             /* Calculate squared distance and things based on it */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rsq01            = gmx_fjsp_calc_rsq_v2r8(dx01,dy01,dz01);
+             rsq02            = gmx_fjsp_calc_rsq_v2r8(dx02,dy02,dz02);
+             rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+             rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+             rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+             rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+             rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+             rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+             rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+             rinv01           = gmx_fjsp_invsqrt_v2r8(rsq01);
+             rinv02           = gmx_fjsp_invsqrt_v2r8(rsq02);
+             rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+             rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+             rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+             rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+             rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+             rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+             rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+             rinvsq01         = _fjsp_mul_v2r8(rinv01,rinv01);
+             rinvsq02         = _fjsp_mul_v2r8(rinv02,rinv02);
+             rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+             rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+             rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+             rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+             rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+             rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+             fjx0             = _fjsp_setzero_v2r8();
+             fjy0             = _fjsp_setzero_v2r8();
+             fjz0             = _fjsp_setzero_v2r8();
+             fjx1             = _fjsp_setzero_v2r8();
+             fjy1             = _fjsp_setzero_v2r8();
+             fjz1             = _fjsp_setzero_v2r8();
+             fjx2             = _fjsp_setzero_v2r8();
+             fjy2             = _fjsp_setzero_v2r8();
+             fjz2             = _fjsp_setzero_v2r8();
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+             /* Calculate table index by multiplying r with table scale and truncate to integer */
+             rt               = _fjsp_mul_v2r8(r00,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 8;
+             vfconv.i[1]     *= 8;
+             /* COULOMB ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq00,rinv00);
+             felec            = _fjsp_mul_v2r8(velec,rinvsq00);
+             /* CUBIC SPLINE TABLE DISPERSION */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 2 );
+             H                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 2 );
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+             VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+             vvdw6            = _fjsp_mul_v2r8(c6_00,VV);
+             FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+             fvdw6            = _fjsp_mul_v2r8(c6_00,FF);
+             /* CUBIC SPLINE TABLE REPULSION */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 4 );
+             F                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 4 );
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 6 );
+             H                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 6 );
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+             VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+             vvdw12           = _fjsp_mul_v2r8(c12_00,VV);
+             FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+             fvdw12           = _fjsp_mul_v2r8(c12_00,FF);
+             vvdw             = _fjsp_add_v2r8(vvdw12,vvdw6);
+             fvdw             = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_add_v2r8(fvdw6,fvdw12),_fjsp_mul_v2r8(vftabscale,rinv00)));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+             fscal            = _fjsp_add_v2r8(felec,fvdw);
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             
+             fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* COULOMB ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq01,rinv01);
+             felec            = _fjsp_mul_v2r8(velec,rinvsq01);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx01,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy01,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz01,fscal,fiz0);
+             
+             fjx1             = _fjsp_madd_v2r8(dx01,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy01,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz01,fscal,fjz1);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* COULOMB ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq02,rinv02);
+             felec            = _fjsp_mul_v2r8(velec,rinvsq02);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx02,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy02,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz02,fscal,fiz0);
+             
+             fjx2             = _fjsp_madd_v2r8(dx02,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy02,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz02,fscal,fjz2);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* COULOMB ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq10,rinv10);
+             felec            = _fjsp_mul_v2r8(velec,rinvsq10);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+             
+             fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* COULOMB ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq11,rinv11);
+             felec            = _fjsp_mul_v2r8(velec,rinvsq11);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+             
+             fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* COULOMB ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq12,rinv12);
+             felec            = _fjsp_mul_v2r8(velec,rinvsq12);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+             
+             fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* COULOMB ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq20,rinv20);
+             felec            = _fjsp_mul_v2r8(velec,rinvsq20);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+             
+             fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* COULOMB ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq21,rinv21);
+             felec            = _fjsp_mul_v2r8(velec,rinvsq21);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+             
+             fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* COULOMB ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq22,rinv22);
+             felec            = _fjsp_mul_v2r8(velec,rinvsq22);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+             
+             fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+             gmx_fjsp_decrement_3rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+             /* Inner loop uses 314 flops */
+         }
+         if(jidx<j_index_end)
+         {
+             jnrA             = jjnr[jidx];
+             j_coord_offsetA  = DIM*jnrA;
+             /* load j atom coordinates */
+             gmx_fjsp_load_3rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+             /* Calculate displacement vector */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             dx01             = _fjsp_sub_v2r8(ix0,jx1);
+             dy01             = _fjsp_sub_v2r8(iy0,jy1);
+             dz01             = _fjsp_sub_v2r8(iz0,jz1);
+             dx02             = _fjsp_sub_v2r8(ix0,jx2);
+             dy02             = _fjsp_sub_v2r8(iy0,jy2);
+             dz02             = _fjsp_sub_v2r8(iz0,jz2);
+             dx10             = _fjsp_sub_v2r8(ix1,jx0);
+             dy10             = _fjsp_sub_v2r8(iy1,jy0);
+             dz10             = _fjsp_sub_v2r8(iz1,jz0);
+             dx11             = _fjsp_sub_v2r8(ix1,jx1);
+             dy11             = _fjsp_sub_v2r8(iy1,jy1);
+             dz11             = _fjsp_sub_v2r8(iz1,jz1);
+             dx12             = _fjsp_sub_v2r8(ix1,jx2);
+             dy12             = _fjsp_sub_v2r8(iy1,jy2);
+             dz12             = _fjsp_sub_v2r8(iz1,jz2);
+             dx20             = _fjsp_sub_v2r8(ix2,jx0);
+             dy20             = _fjsp_sub_v2r8(iy2,jy0);
+             dz20             = _fjsp_sub_v2r8(iz2,jz0);
+             dx21             = _fjsp_sub_v2r8(ix2,jx1);
+             dy21             = _fjsp_sub_v2r8(iy2,jy1);
+             dz21             = _fjsp_sub_v2r8(iz2,jz1);
+             dx22             = _fjsp_sub_v2r8(ix2,jx2);
+             dy22             = _fjsp_sub_v2r8(iy2,jy2);
+             dz22             = _fjsp_sub_v2r8(iz2,jz2);
+             /* Calculate squared distance and things based on it */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rsq01            = gmx_fjsp_calc_rsq_v2r8(dx01,dy01,dz01);
+             rsq02            = gmx_fjsp_calc_rsq_v2r8(dx02,dy02,dz02);
+             rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+             rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+             rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+             rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+             rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+             rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+             rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+             rinv01           = gmx_fjsp_invsqrt_v2r8(rsq01);
+             rinv02           = gmx_fjsp_invsqrt_v2r8(rsq02);
+             rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+             rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+             rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+             rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+             rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+             rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+             rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+             rinvsq01         = _fjsp_mul_v2r8(rinv01,rinv01);
+             rinvsq02         = _fjsp_mul_v2r8(rinv02,rinv02);
+             rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+             rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+             rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+             rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+             rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+             rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+             fjx0             = _fjsp_setzero_v2r8();
+             fjy0             = _fjsp_setzero_v2r8();
+             fjz0             = _fjsp_setzero_v2r8();
+             fjx1             = _fjsp_setzero_v2r8();
+             fjy1             = _fjsp_setzero_v2r8();
+             fjz1             = _fjsp_setzero_v2r8();
+             fjx2             = _fjsp_setzero_v2r8();
+             fjy2             = _fjsp_setzero_v2r8();
+             fjz2             = _fjsp_setzero_v2r8();
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+             /* Calculate table index by multiplying r with table scale and truncate to integer */
+             rt               = _fjsp_mul_v2r8(r00,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 8;
+             vfconv.i[1]     *= 8;
+             /* COULOMB ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq00,rinv00);
+             felec            = _fjsp_mul_v2r8(velec,rinvsq00);
+             /* CUBIC SPLINE TABLE DISPERSION */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 2 );
+             H                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+             VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+             vvdw6            = _fjsp_mul_v2r8(c6_00,VV);
+             FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+             fvdw6            = _fjsp_mul_v2r8(c6_00,FF);
+             /* CUBIC SPLINE TABLE REPULSION */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 4 );
+             F                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 6 );
+             H                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+             VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+             vvdw12           = _fjsp_mul_v2r8(c12_00,VV);
+             FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+             fvdw12           = _fjsp_mul_v2r8(c12_00,FF);
+             vvdw             = _fjsp_add_v2r8(vvdw12,vvdw6);
+             fvdw             = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_add_v2r8(fvdw6,fvdw12),_fjsp_mul_v2r8(vftabscale,rinv00)));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             vvdw             = _fjsp_unpacklo_v2r8(vvdw,_fjsp_setzero_v2r8());
+             vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+             fscal            = _fjsp_add_v2r8(felec,fvdw);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             
+             fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* COULOMB ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq01,rinv01);
+             felec            = _fjsp_mul_v2r8(velec,rinvsq01);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx01,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy01,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz01,fscal,fiz0);
+             
+             fjx1             = _fjsp_madd_v2r8(dx01,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy01,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz01,fscal,fjz1);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* COULOMB ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq02,rinv02);
+             felec            = _fjsp_mul_v2r8(velec,rinvsq02);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx02,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy02,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz02,fscal,fiz0);
+             
+             fjx2             = _fjsp_madd_v2r8(dx02,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy02,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz02,fscal,fjz2);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* COULOMB ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq10,rinv10);
+             felec            = _fjsp_mul_v2r8(velec,rinvsq10);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+             
+             fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* COULOMB ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq11,rinv11);
+             felec            = _fjsp_mul_v2r8(velec,rinvsq11);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+             
+             fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* COULOMB ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq12,rinv12);
+             felec            = _fjsp_mul_v2r8(velec,rinvsq12);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+             
+             fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* COULOMB ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq20,rinv20);
+             felec            = _fjsp_mul_v2r8(velec,rinvsq20);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+             
+             fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* COULOMB ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq21,rinv21);
+             felec            = _fjsp_mul_v2r8(velec,rinvsq21);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+             
+             fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* COULOMB ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq22,rinv22);
+             felec            = _fjsp_mul_v2r8(velec,rinvsq22);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+             
+             fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+             gmx_fjsp_decrement_3rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+             /* Inner loop uses 314 flops */
+         }
+         /* End of innermost loop */
+         gmx_fjsp_update_iforce_3atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                               f+i_coord_offset,fshift+i_shift_offset);
+         ggid                        = gid[iidx];
+         /* Update potential energies */
+         gmx_fjsp_update_1pot_v2r8(velecsum,kernel_data->energygrp_elec+ggid);
+         gmx_fjsp_update_1pot_v2r8(vvdwsum,kernel_data->energygrp_vdw+ggid);
+         /* Increment number of inner iterations */
+         inneriter                  += j_index_end - j_index_start;
+         /* Outer loop uses 20 flops */
+     }
+     /* Increment number of outer iterations */
+     outeriter        += nri;
+     /* Update outer/inner flops */
+     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_VF,outeriter*20 + inneriter*314);
+ }
+ /*
+  * Gromacs nonbonded kernel:   nb_kernel_ElecCoul_VdwCSTab_GeomW3W3_F_sparc64_hpc_ace_double
+  * Electrostatics interaction: Coulomb
+  * VdW interaction:            CubicSplineTable
+  * Geometry:                   Water3-Water3
+  * Calculate force/pot:        Force
+  */
+ void
+ nb_kernel_ElecCoul_VdwCSTab_GeomW3W3_F_sparc64_hpc_ace_double
+                     (t_nblist * gmx_restrict                nlist,
+                      rvec * gmx_restrict                    xx,
+                      rvec * gmx_restrict                    ff,
+                      t_forcerec * gmx_restrict              fr,
+                      t_mdatoms * gmx_restrict               mdatoms,
+                      nb_kernel_data_t * gmx_restrict        kernel_data,
+                      t_nrnb * gmx_restrict                  nrnb)
+ {
+     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+      * just 0 for non-waters.
+      * Suffixes A,B refer to j loop unrolling done with double precision SIMD, i.e. for the two different
+      * jnr indices corresponding to data put in the two positions in the SIMD register.
+      */
+     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+     int              jnrA,jnrB;
+     int              j_coord_offsetA,j_coord_offsetB;
+     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+     real             rcutoff_scalar;
+     real             *shiftvec,*fshift,*x,*f;
+     _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+     int              vdwioffset0;
+     _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+     int              vdwioffset1;
+     _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+     int              vdwioffset2;
+     _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+     int              vdwjidx0A,vdwjidx0B;
+     _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+     int              vdwjidx1A,vdwjidx1B;
+     _fjsp_v2r8       jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+     int              vdwjidx2A,vdwjidx2B;
+     _fjsp_v2r8       jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+     _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+     _fjsp_v2r8       dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
+     _fjsp_v2r8       dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
+     _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+     _fjsp_v2r8       dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+     _fjsp_v2r8       dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+     _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+     _fjsp_v2r8       dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+     _fjsp_v2r8       dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+     _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+     real             *charge;
+     int              nvdwtype;
+     _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+     int              *vdwtype;
+     real             *vdwparam;
+     _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+     _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+     _fjsp_v2r8       rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF,twovfeps;
+     real             *vftab;
+     _fjsp_v2r8       itab_tmp;
+     _fjsp_v2r8       dummy_mask,cutoff_mask;
+     _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+     _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+     union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+     x                = xx[0];
+     f                = ff[0];
+     nri              = nlist->nri;
+     iinr             = nlist->iinr;
+     jindex           = nlist->jindex;
+     jjnr             = nlist->jjnr;
+     shiftidx         = nlist->shift;
+     gid              = nlist->gid;
+     shiftvec         = fr->shift_vec[0];
+     fshift           = fr->fshift[0];
+     facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+     charge           = mdatoms->chargeA;
+     nvdwtype         = fr->ntype;
+     vdwparam         = fr->nbfp;
+     vdwtype          = mdatoms->typeA;
+     vftab            = kernel_data->table_vdw->data;
+     vftabscale       = gmx_fjsp_set1_v2r8(kernel_data->table_vdw->scale);
+     /* Setup water-specific parameters */
+     inr              = nlist->iinr[0];
+     iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+0]));
+     iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+     iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+     vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+     jq0              = gmx_fjsp_set1_v2r8(charge[inr+0]);
+     jq1              = gmx_fjsp_set1_v2r8(charge[inr+1]);
+     jq2              = gmx_fjsp_set1_v2r8(charge[inr+2]);
+     vdwjidx0A        = 2*vdwtype[inr+0];
+     qq00             = _fjsp_mul_v2r8(iq0,jq0);
+     c6_00            = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset0+vdwjidx0A]);
+     c12_00           = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset0+vdwjidx0A+1]);
+     qq01             = _fjsp_mul_v2r8(iq0,jq1);
+     qq02             = _fjsp_mul_v2r8(iq0,jq2);
+     qq10             = _fjsp_mul_v2r8(iq1,jq0);
+     qq11             = _fjsp_mul_v2r8(iq1,jq1);
+     qq12             = _fjsp_mul_v2r8(iq1,jq2);
+     qq20             = _fjsp_mul_v2r8(iq2,jq0);
+     qq21             = _fjsp_mul_v2r8(iq2,jq1);
+     qq22             = _fjsp_mul_v2r8(iq2,jq2);
+     /* Avoid stupid compiler warnings */
+     jnrA = jnrB = 0;
+     j_coord_offsetA = 0;
+     j_coord_offsetB = 0;
+     outeriter        = 0;
+     inneriter        = 0;
+     /* Start outer loop over neighborlists */
+     for(iidx=0; iidx<nri; iidx++)
+     {
+         /* Load shift vector for this list */
+         i_shift_offset   = DIM*shiftidx[iidx];
+         /* Load limits for loop over neighbors */
+         j_index_start    = jindex[iidx];
+         j_index_end      = jindex[iidx+1];
+         /* Get outer coordinate index */
+         inr              = iinr[iidx];
+         i_coord_offset   = DIM*inr;
+         /* Load i particle coords and add shift vector */
+         gmx_fjsp_load_shift_and_3rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                  &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+         fix0             = _fjsp_setzero_v2r8();
+         fiy0             = _fjsp_setzero_v2r8();
+         fiz0             = _fjsp_setzero_v2r8();
+         fix1             = _fjsp_setzero_v2r8();
+         fiy1             = _fjsp_setzero_v2r8();
+         fiz1             = _fjsp_setzero_v2r8();
+         fix2             = _fjsp_setzero_v2r8();
+         fiy2             = _fjsp_setzero_v2r8();
+         fiz2             = _fjsp_setzero_v2r8();
+         /* Start inner kernel loop */
+         for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+         {
+             /* Get j neighbor index, and coordinate index */
+             jnrA             = jjnr[jidx];
+             jnrB             = jjnr[jidx+1];
+             j_coord_offsetA  = DIM*jnrA;
+             j_coord_offsetB  = DIM*jnrB;
+             /* load j atom coordinates */
+             gmx_fjsp_load_3rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+             /* Calculate displacement vector */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             dx01             = _fjsp_sub_v2r8(ix0,jx1);
+             dy01             = _fjsp_sub_v2r8(iy0,jy1);
+             dz01             = _fjsp_sub_v2r8(iz0,jz1);
+             dx02             = _fjsp_sub_v2r8(ix0,jx2);
+             dy02             = _fjsp_sub_v2r8(iy0,jy2);
+             dz02             = _fjsp_sub_v2r8(iz0,jz2);
+             dx10             = _fjsp_sub_v2r8(ix1,jx0);
+             dy10             = _fjsp_sub_v2r8(iy1,jy0);
+             dz10             = _fjsp_sub_v2r8(iz1,jz0);
+             dx11             = _fjsp_sub_v2r8(ix1,jx1);
+             dy11             = _fjsp_sub_v2r8(iy1,jy1);
+             dz11             = _fjsp_sub_v2r8(iz1,jz1);
+             dx12             = _fjsp_sub_v2r8(ix1,jx2);
+             dy12             = _fjsp_sub_v2r8(iy1,jy2);
+             dz12             = _fjsp_sub_v2r8(iz1,jz2);
+             dx20             = _fjsp_sub_v2r8(ix2,jx0);
+             dy20             = _fjsp_sub_v2r8(iy2,jy0);
+             dz20             = _fjsp_sub_v2r8(iz2,jz0);
+             dx21             = _fjsp_sub_v2r8(ix2,jx1);
+             dy21             = _fjsp_sub_v2r8(iy2,jy1);
+             dz21             = _fjsp_sub_v2r8(iz2,jz1);
+             dx22             = _fjsp_sub_v2r8(ix2,jx2);
+             dy22             = _fjsp_sub_v2r8(iy2,jy2);
+             dz22             = _fjsp_sub_v2r8(iz2,jz2);
+             /* Calculate squared distance and things based on it */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rsq01            = gmx_fjsp_calc_rsq_v2r8(dx01,dy01,dz01);
+             rsq02            = gmx_fjsp_calc_rsq_v2r8(dx02,dy02,dz02);
+             rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+             rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+             rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+             rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+             rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+             rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+             rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+             rinv01           = gmx_fjsp_invsqrt_v2r8(rsq01);
+             rinv02           = gmx_fjsp_invsqrt_v2r8(rsq02);
+             rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+             rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+             rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+             rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+             rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+             rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+             rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+             rinvsq01         = _fjsp_mul_v2r8(rinv01,rinv01);
+             rinvsq02         = _fjsp_mul_v2r8(rinv02,rinv02);
+             rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+             rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+             rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+             rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+             rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+             rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+             fjx0             = _fjsp_setzero_v2r8();
+             fjy0             = _fjsp_setzero_v2r8();
+             fjz0             = _fjsp_setzero_v2r8();
+             fjx1             = _fjsp_setzero_v2r8();
+             fjy1             = _fjsp_setzero_v2r8();
+             fjz1             = _fjsp_setzero_v2r8();
+             fjx2             = _fjsp_setzero_v2r8();
+             fjy2             = _fjsp_setzero_v2r8();
+             fjz2             = _fjsp_setzero_v2r8();
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+             /* Calculate table index by multiplying r with table scale and truncate to integer */
+             rt               = _fjsp_mul_v2r8(r00,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 8;
+             vfconv.i[1]     *= 8;
+             /* COULOMB ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq00,rinv00);
+             felec            = _fjsp_mul_v2r8(velec,rinvsq00);
+             /* CUBIC SPLINE TABLE DISPERSION */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 2 );
+             H                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 2 );
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+             FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+             fvdw6            = _fjsp_mul_v2r8(c6_00,FF);
+             /* CUBIC SPLINE TABLE REPULSION */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 4 );
+             F                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 4 );
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 6 );
+             H                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 6 );
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+             FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+             fvdw12           = _fjsp_mul_v2r8(c12_00,FF);
+             fvdw             = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_add_v2r8(fvdw6,fvdw12),_fjsp_mul_v2r8(vftabscale,rinv00)));
+             fscal            = _fjsp_add_v2r8(felec,fvdw);
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             
+             fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* COULOMB ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq01,rinv01);
+             felec            = _fjsp_mul_v2r8(velec,rinvsq01);
+             fscal            = felec;
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx01,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy01,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz01,fscal,fiz0);
+             
+             fjx1             = _fjsp_madd_v2r8(dx01,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy01,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz01,fscal,fjz1);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* COULOMB ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq02,rinv02);
+             felec            = _fjsp_mul_v2r8(velec,rinvsq02);
+             fscal            = felec;
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx02,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy02,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz02,fscal,fiz0);
+             
+             fjx2             = _fjsp_madd_v2r8(dx02,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy02,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz02,fscal,fjz2);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* COULOMB ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq10,rinv10);
+             felec            = _fjsp_mul_v2r8(velec,rinvsq10);
+             fscal            = felec;
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+             
+             fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* COULOMB ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq11,rinv11);
+             felec            = _fjsp_mul_v2r8(velec,rinvsq11);
+             fscal            = felec;
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+             
+             fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* COULOMB ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq12,rinv12);
+             felec            = _fjsp_mul_v2r8(velec,rinvsq12);
+             fscal            = felec;
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+             
+             fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* COULOMB ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq20,rinv20);
+             felec            = _fjsp_mul_v2r8(velec,rinvsq20);
+             fscal            = felec;
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+             
+             fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* COULOMB ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq21,rinv21);
+             felec            = _fjsp_mul_v2r8(velec,rinvsq21);
+             fscal            = felec;
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+             
+             fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* COULOMB ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq22,rinv22);
+             felec            = _fjsp_mul_v2r8(velec,rinvsq22);
+             fscal            = felec;
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+             
+             fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+             gmx_fjsp_decrement_3rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+             /* Inner loop uses 297 flops */
+         }
+         if(jidx<j_index_end)
+         {
+             jnrA             = jjnr[jidx];
+             j_coord_offsetA  = DIM*jnrA;
+             /* load j atom coordinates */
+             gmx_fjsp_load_3rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+             /* Calculate displacement vector */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             dx01             = _fjsp_sub_v2r8(ix0,jx1);
+             dy01             = _fjsp_sub_v2r8(iy0,jy1);
+             dz01             = _fjsp_sub_v2r8(iz0,jz1);
+             dx02             = _fjsp_sub_v2r8(ix0,jx2);
+             dy02             = _fjsp_sub_v2r8(iy0,jy2);
+             dz02             = _fjsp_sub_v2r8(iz0,jz2);
+             dx10             = _fjsp_sub_v2r8(ix1,jx0);
+             dy10             = _fjsp_sub_v2r8(iy1,jy0);
+             dz10             = _fjsp_sub_v2r8(iz1,jz0);
+             dx11             = _fjsp_sub_v2r8(ix1,jx1);
+             dy11             = _fjsp_sub_v2r8(iy1,jy1);
+             dz11             = _fjsp_sub_v2r8(iz1,jz1);
+             dx12             = _fjsp_sub_v2r8(ix1,jx2);
+             dy12             = _fjsp_sub_v2r8(iy1,jy2);
+             dz12             = _fjsp_sub_v2r8(iz1,jz2);
+             dx20             = _fjsp_sub_v2r8(ix2,jx0);
+             dy20             = _fjsp_sub_v2r8(iy2,jy0);
+             dz20             = _fjsp_sub_v2r8(iz2,jz0);
+             dx21             = _fjsp_sub_v2r8(ix2,jx1);
+             dy21             = _fjsp_sub_v2r8(iy2,jy1);
+             dz21             = _fjsp_sub_v2r8(iz2,jz1);
+             dx22             = _fjsp_sub_v2r8(ix2,jx2);
+             dy22             = _fjsp_sub_v2r8(iy2,jy2);
+             dz22             = _fjsp_sub_v2r8(iz2,jz2);
+             /* Calculate squared distance and things based on it */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rsq01            = gmx_fjsp_calc_rsq_v2r8(dx01,dy01,dz01);
+             rsq02            = gmx_fjsp_calc_rsq_v2r8(dx02,dy02,dz02);
+             rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+             rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+             rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+             rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+             rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+             rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+             rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+             rinv01           = gmx_fjsp_invsqrt_v2r8(rsq01);
+             rinv02           = gmx_fjsp_invsqrt_v2r8(rsq02);
+             rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+             rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+             rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+             rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+             rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+             rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+             rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+             rinvsq01         = _fjsp_mul_v2r8(rinv01,rinv01);
+             rinvsq02         = _fjsp_mul_v2r8(rinv02,rinv02);
+             rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+             rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+             rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+             rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+             rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+             rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+             fjx0             = _fjsp_setzero_v2r8();
+             fjy0             = _fjsp_setzero_v2r8();
+             fjz0             = _fjsp_setzero_v2r8();
+             fjx1             = _fjsp_setzero_v2r8();
+             fjy1             = _fjsp_setzero_v2r8();
+             fjz1             = _fjsp_setzero_v2r8();
+             fjx2             = _fjsp_setzero_v2r8();
+             fjy2             = _fjsp_setzero_v2r8();
+             fjz2             = _fjsp_setzero_v2r8();
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+             /* Calculate table index by multiplying r with table scale and truncate to integer */
+             rt               = _fjsp_mul_v2r8(r00,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 8;
+             vfconv.i[1]     *= 8;
+             /* COULOMB ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq00,rinv00);
+             felec            = _fjsp_mul_v2r8(velec,rinvsq00);
+             /* CUBIC SPLINE TABLE DISPERSION */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 2 );
+             H                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+             FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+             fvdw6            = _fjsp_mul_v2r8(c6_00,FF);
+             /* CUBIC SPLINE TABLE REPULSION */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 4 );
+             F                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 6 );
+             H                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+             FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+             fvdw12           = _fjsp_mul_v2r8(c12_00,FF);
+             fvdw             = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_add_v2r8(fvdw6,fvdw12),_fjsp_mul_v2r8(vftabscale,rinv00)));
+             fscal            = _fjsp_add_v2r8(felec,fvdw);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             
+             fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* COULOMB ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq01,rinv01);
+             felec            = _fjsp_mul_v2r8(velec,rinvsq01);
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx01,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy01,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz01,fscal,fiz0);
+             
+             fjx1             = _fjsp_madd_v2r8(dx01,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy01,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz01,fscal,fjz1);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* COULOMB ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq02,rinv02);
+             felec            = _fjsp_mul_v2r8(velec,rinvsq02);
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx02,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy02,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz02,fscal,fiz0);
+             
+             fjx2             = _fjsp_madd_v2r8(dx02,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy02,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz02,fscal,fjz2);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* COULOMB ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq10,rinv10);
+             felec            = _fjsp_mul_v2r8(velec,rinvsq10);
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+             
+             fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* COULOMB ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq11,rinv11);
+             felec            = _fjsp_mul_v2r8(velec,rinvsq11);
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+             
+             fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* COULOMB ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq12,rinv12);
+             felec            = _fjsp_mul_v2r8(velec,rinvsq12);
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+             
+             fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* COULOMB ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq20,rinv20);
+             felec            = _fjsp_mul_v2r8(velec,rinvsq20);
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+             
+             fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* COULOMB ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq21,rinv21);
+             felec            = _fjsp_mul_v2r8(velec,rinvsq21);
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+             
+             fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* COULOMB ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq22,rinv22);
+             felec            = _fjsp_mul_v2r8(velec,rinvsq22);
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+             
+             fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+             gmx_fjsp_decrement_3rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+             /* Inner loop uses 297 flops */
+         }
+         /* End of innermost loop */
+         gmx_fjsp_update_iforce_3atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                               f+i_coord_offset,fshift+i_shift_offset);
+         /* Increment number of inner iterations */
+         inneriter                  += j_index_end - j_index_start;
+         /* Outer loop uses 18 flops */
+     }
+     /* Increment number of outer iterations */
+     outeriter        += nri;
+     /* Update outer/inner flops */
+     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_F,outeriter*18 + inneriter*297);
+ }
index 0000000000000000000000000000000000000000,d299daf5db7f014b9d1f3c861320b5133605506e..d299daf5db7f014b9d1f3c861320b5133605506e
mode 000000,100644..100644
--- /dev/null
@@@ -1,0 -1,1097 +1,1097 @@@
+ /*
+  * This file is part of the GROMACS molecular simulation package.
+  *
+  * Copyright (c) 2012, by the GROMACS development team, led by
+  * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+  * others, as listed in the AUTHORS file in the top-level source
+  * directory and at http://www.gromacs.org.
+  *
+  * GROMACS is free software; you can redistribute it and/or
+  * modify it under the terms of the GNU Lesser General Public License
+  * as published by the Free Software Foundation; either version 2.1
+  * of the License, or (at your option) any later version.
+  *
+  * GROMACS is distributed in the hope that it will be useful,
+  * but WITHOUT ANY WARRANTY; without even the implied warranty of
+  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  * Lesser General Public License for more details.
+  *
+  * You should have received a copy of the GNU Lesser General Public
+  * License along with GROMACS; if not, see
+  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+  *
+  * If you want to redistribute modifications to GROMACS, please
+  * consider that scientific software is very special. Version
+  * control is crucial - bugs must be traceable. We will be happy to
+  * consider code for inclusion in the official distribution, but
+  * derived work must not be called official GROMACS. Details are found
+  * in the README & COPYING files - if they are missing, get the
+  * official version at http://www.gromacs.org.
+  *
+  * To help us fund GROMACS development, we humbly ask that you cite
+  * the research papers on the package. Check out http://www.gromacs.org.
+  */
+ /*
+  * Note: this file was generated by the GROMACS sparc64_hpc_ace_double kernel generator.
+  */
+ #ifdef HAVE_CONFIG_H
+ #include <config.h>
+ #endif
+ #include <math.h>
+ #include "../nb_kernel.h"
+ #include "types/simple.h"
+ #include "vec.h"
+ #include "nrnb.h"
+ #include "kernelutil_sparc64_hpc_ace_double.h"
+ /*
+  * Gromacs nonbonded kernel:   nb_kernel_ElecCoul_VdwCSTab_GeomW4P1_VF_sparc64_hpc_ace_double
+  * Electrostatics interaction: Coulomb
+  * VdW interaction:            CubicSplineTable
+  * Geometry:                   Water4-Particle
+  * Calculate force/pot:        PotentialAndForce
+  */
+ void
+ nb_kernel_ElecCoul_VdwCSTab_GeomW4P1_VF_sparc64_hpc_ace_double
+                     (t_nblist * gmx_restrict                nlist,
+                      rvec * gmx_restrict                    xx,
+                      rvec * gmx_restrict                    ff,
+                      t_forcerec * gmx_restrict              fr,
+                      t_mdatoms * gmx_restrict               mdatoms,
+                      nb_kernel_data_t * gmx_restrict        kernel_data,
+                      t_nrnb * gmx_restrict                  nrnb)
+ {
+     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+      * just 0 for non-waters.
+      * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+      * jnr indices corresponding to data put in the two positions in the SIMD register.
+      */
+     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+     int              jnrA,jnrB;
+     int              j_coord_offsetA,j_coord_offsetB;
+     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+     real             rcutoff_scalar;
+     real             *shiftvec,*fshift,*x,*f;
+     _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+     int              vdwioffset0;
+     _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+     int              vdwioffset1;
+     _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+     int              vdwioffset2;
+     _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+     int              vdwioffset3;
+     _fjsp_v2r8       ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+     int              vdwjidx0A,vdwjidx0B;
+     _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+     _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+     _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+     _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+     _fjsp_v2r8       dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30;
+     _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+     real             *charge;
+     int              nvdwtype;
+     _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+     int              *vdwtype;
+     real             *vdwparam;
+     _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+     _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+     _fjsp_v2r8       rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF,twovfeps;
+     real             *vftab;
+     _fjsp_v2r8       itab_tmp;
+     _fjsp_v2r8       dummy_mask,cutoff_mask;
+     _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+     _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+     union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+     x                = xx[0];
+     f                = ff[0];
+     nri              = nlist->nri;
+     iinr             = nlist->iinr;
+     jindex           = nlist->jindex;
+     jjnr             = nlist->jjnr;
+     shiftidx         = nlist->shift;
+     gid              = nlist->gid;
+     shiftvec         = fr->shift_vec[0];
+     fshift           = fr->fshift[0];
+     facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+     charge           = mdatoms->chargeA;
+     nvdwtype         = fr->ntype;
+     vdwparam         = fr->nbfp;
+     vdwtype          = mdatoms->typeA;
+     vftab            = kernel_data->table_vdw->data;
+     vftabscale       = gmx_fjsp_set1_v2r8(kernel_data->table_vdw->scale);
+     /* Setup water-specific parameters */
+     inr              = nlist->iinr[0];
+     iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+     iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+     iq3              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+3]));
+     vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+     /* Avoid stupid compiler warnings */
+     jnrA = jnrB = 0;
+     j_coord_offsetA = 0;
+     j_coord_offsetB = 0;
+     outeriter        = 0;
+     inneriter        = 0;
+     /* Start outer loop over neighborlists */
+     for(iidx=0; iidx<nri; iidx++)
+     {
+         /* Load shift vector for this list */
+         i_shift_offset   = DIM*shiftidx[iidx];
+         /* Load limits for loop over neighbors */
+         j_index_start    = jindex[iidx];
+         j_index_end      = jindex[iidx+1];
+         /* Get outer coordinate index */
+         inr              = iinr[iidx];
+         i_coord_offset   = DIM*inr;
+         /* Load i particle coords and add shift vector */
+         gmx_fjsp_load_shift_and_4rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                  &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+         fix0             = _fjsp_setzero_v2r8();
+         fiy0             = _fjsp_setzero_v2r8();
+         fiz0             = _fjsp_setzero_v2r8();
+         fix1             = _fjsp_setzero_v2r8();
+         fiy1             = _fjsp_setzero_v2r8();
+         fiz1             = _fjsp_setzero_v2r8();
+         fix2             = _fjsp_setzero_v2r8();
+         fiy2             = _fjsp_setzero_v2r8();
+         fiz2             = _fjsp_setzero_v2r8();
+         fix3             = _fjsp_setzero_v2r8();
+         fiy3             = _fjsp_setzero_v2r8();
+         fiz3             = _fjsp_setzero_v2r8();
+         /* Reset potential sums */
+         velecsum         = _fjsp_setzero_v2r8();
+         vvdwsum          = _fjsp_setzero_v2r8();
+         /* Start inner kernel loop */
+         for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+         {
+             /* Get j neighbor index, and coordinate index */
+             jnrA             = jjnr[jidx];
+             jnrB             = jjnr[jidx+1];
+             j_coord_offsetA  = DIM*jnrA;
+             j_coord_offsetB  = DIM*jnrB;
+             /* load j atom coordinates */
+             gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                               &jx0,&jy0,&jz0);
+             /* Calculate displacement vector */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             dx10             = _fjsp_sub_v2r8(ix1,jx0);
+             dy10             = _fjsp_sub_v2r8(iy1,jy0);
+             dz10             = _fjsp_sub_v2r8(iz1,jz0);
+             dx20             = _fjsp_sub_v2r8(ix2,jx0);
+             dy20             = _fjsp_sub_v2r8(iy2,jy0);
+             dz20             = _fjsp_sub_v2r8(iz2,jz0);
+             dx30             = _fjsp_sub_v2r8(ix3,jx0);
+             dy30             = _fjsp_sub_v2r8(iy3,jy0);
+             dz30             = _fjsp_sub_v2r8(iz3,jz0);
+             /* Calculate squared distance and things based on it */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+             rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+             rsq30            = gmx_fjsp_calc_rsq_v2r8(dx30,dy30,dz30);
+             rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+             rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+             rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+             rinv30           = gmx_fjsp_invsqrt_v2r8(rsq30);
+             rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+             rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+             rinvsq30         = _fjsp_mul_v2r8(rinv30,rinv30);
+             /* Load parameters for j particles */
+             jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+             vdwjidx0A        = 2*vdwtype[jnrA+0];
+             vdwjidx0B        = 2*vdwtype[jnrB+0];
+             fjx0             = _fjsp_setzero_v2r8();
+             fjy0             = _fjsp_setzero_v2r8();
+             fjz0             = _fjsp_setzero_v2r8();
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+             /* Compute parameters for interactions between i and j atoms */
+             gmx_fjsp_load_2pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,
+                                          vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
+             /* Calculate table index by multiplying r with table scale and truncate to integer */
+             rt               = _fjsp_mul_v2r8(r00,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 8;
+             vfconv.i[1]     *= 8;
+             /* CUBIC SPLINE TABLE DISPERSION */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 2 );
+             H                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 2 );
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+             VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+             vvdw6            = _fjsp_mul_v2r8(c6_00,VV);
+             FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+             fvdw6            = _fjsp_mul_v2r8(c6_00,FF);
+             /* CUBIC SPLINE TABLE REPULSION */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 4 );
+             F                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 4 );
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 6 );
+             H                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 6 );
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+             VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+             vvdw12           = _fjsp_mul_v2r8(c12_00,VV);
+             FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+             fvdw12           = _fjsp_mul_v2r8(c12_00,FF);
+             vvdw             = _fjsp_add_v2r8(vvdw12,vvdw6);
+             fvdw             = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_add_v2r8(fvdw6,fvdw12),_fjsp_mul_v2r8(vftabscale,rinv00)));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+             fscal            = fvdw;
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             
+             fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* Compute parameters for interactions between i and j atoms */
+             qq10             = _fjsp_mul_v2r8(iq1,jq0);
+             /* COULOMB ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq10,rinv10);
+             felec            = _fjsp_mul_v2r8(velec,rinvsq10);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+             
+             fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* Compute parameters for interactions between i and j atoms */
+             qq20             = _fjsp_mul_v2r8(iq2,jq0);
+             /* COULOMB ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq20,rinv20);
+             felec            = _fjsp_mul_v2r8(velec,rinvsq20);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+             
+             fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* Compute parameters for interactions between i and j atoms */
+             qq30             = _fjsp_mul_v2r8(iq3,jq0);
+             /* COULOMB ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq30,rinv30);
+             felec            = _fjsp_mul_v2r8(velec,rinvsq30);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             /* Update vectorial force */
+             fix3             = _fjsp_madd_v2r8(dx30,fscal,fix3);
+             fiy3             = _fjsp_madd_v2r8(dy30,fscal,fiy3);
+             fiz3             = _fjsp_madd_v2r8(dz30,fscal,fiz3);
+             
+             fjx0             = _fjsp_madd_v2r8(dx30,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy30,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz30,fscal,fjz0);
+             gmx_fjsp_decrement_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0);
+             /* Inner loop uses 155 flops */
+         }
+         if(jidx<j_index_end)
+         {
+             jnrA             = jjnr[jidx];
+             j_coord_offsetA  = DIM*jnrA;
+             /* load j atom coordinates */
+             gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                               &jx0,&jy0,&jz0);
+             /* Calculate displacement vector */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             dx10             = _fjsp_sub_v2r8(ix1,jx0);
+             dy10             = _fjsp_sub_v2r8(iy1,jy0);
+             dz10             = _fjsp_sub_v2r8(iz1,jz0);
+             dx20             = _fjsp_sub_v2r8(ix2,jx0);
+             dy20             = _fjsp_sub_v2r8(iy2,jy0);
+             dz20             = _fjsp_sub_v2r8(iz2,jz0);
+             dx30             = _fjsp_sub_v2r8(ix3,jx0);
+             dy30             = _fjsp_sub_v2r8(iy3,jy0);
+             dz30             = _fjsp_sub_v2r8(iz3,jz0);
+             /* Calculate squared distance and things based on it */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+             rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+             rsq30            = gmx_fjsp_calc_rsq_v2r8(dx30,dy30,dz30);
+             rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+             rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+             rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+             rinv30           = gmx_fjsp_invsqrt_v2r8(rsq30);
+             rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+             rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+             rinvsq30         = _fjsp_mul_v2r8(rinv30,rinv30);
+             /* Load parameters for j particles */
+             jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
+             vdwjidx0A        = 2*vdwtype[jnrA+0];
+             fjx0             = _fjsp_setzero_v2r8();
+             fjy0             = _fjsp_setzero_v2r8();
+             fjz0             = _fjsp_setzero_v2r8();
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+             /* Compute parameters for interactions between i and j atoms */
+             gmx_fjsp_load_1pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
+             /* Calculate table index by multiplying r with table scale and truncate to integer */
+             rt               = _fjsp_mul_v2r8(r00,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 8;
+             vfconv.i[1]     *= 8;
+             /* CUBIC SPLINE TABLE DISPERSION */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 2 );
+             H                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+             VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+             vvdw6            = _fjsp_mul_v2r8(c6_00,VV);
+             FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+             fvdw6            = _fjsp_mul_v2r8(c6_00,FF);
+             /* CUBIC SPLINE TABLE REPULSION */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 4 );
+             F                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 6 );
+             H                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+             VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+             vvdw12           = _fjsp_mul_v2r8(c12_00,VV);
+             FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+             fvdw12           = _fjsp_mul_v2r8(c12_00,FF);
+             vvdw             = _fjsp_add_v2r8(vvdw12,vvdw6);
+             fvdw             = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_add_v2r8(fvdw6,fvdw12),_fjsp_mul_v2r8(vftabscale,rinv00)));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             vvdw             = _fjsp_unpacklo_v2r8(vvdw,_fjsp_setzero_v2r8());
+             vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+             fscal            = fvdw;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             
+             fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* Compute parameters for interactions between i and j atoms */
+             qq10             = _fjsp_mul_v2r8(iq1,jq0);
+             /* COULOMB ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq10,rinv10);
+             felec            = _fjsp_mul_v2r8(velec,rinvsq10);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+             
+             fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* Compute parameters for interactions between i and j atoms */
+             qq20             = _fjsp_mul_v2r8(iq2,jq0);
+             /* COULOMB ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq20,rinv20);
+             felec            = _fjsp_mul_v2r8(velec,rinvsq20);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+             
+             fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* Compute parameters for interactions between i and j atoms */
+             qq30             = _fjsp_mul_v2r8(iq3,jq0);
+             /* COULOMB ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq30,rinv30);
+             felec            = _fjsp_mul_v2r8(velec,rinvsq30);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix3             = _fjsp_madd_v2r8(dx30,fscal,fix3);
+             fiy3             = _fjsp_madd_v2r8(dy30,fscal,fiy3);
+             fiz3             = _fjsp_madd_v2r8(dz30,fscal,fiz3);
+             
+             fjx0             = _fjsp_madd_v2r8(dx30,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy30,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz30,fscal,fjz0);
+             gmx_fjsp_decrement_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0);
+             /* Inner loop uses 155 flops */
+         }
+         /* End of innermost loop */
+         gmx_fjsp_update_iforce_4atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                               f+i_coord_offset,fshift+i_shift_offset);
+         ggid                        = gid[iidx];
+         /* Update potential energies */
+         gmx_fjsp_update_1pot_v2r8(velecsum,kernel_data->energygrp_elec+ggid);
+         gmx_fjsp_update_1pot_v2r8(vvdwsum,kernel_data->energygrp_vdw+ggid);
+         /* Increment number of inner iterations */
+         inneriter                  += j_index_end - j_index_start;
+         /* Outer loop uses 26 flops */
+     }
+     /* Increment number of outer iterations */
+     outeriter        += nri;
+     /* Update outer/inner flops */
+     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_VF,outeriter*26 + inneriter*155);
+ }
+ /*
+  * Gromacs nonbonded kernel:   nb_kernel_ElecCoul_VdwCSTab_GeomW4P1_F_sparc64_hpc_ace_double
+  * Electrostatics interaction: Coulomb
+  * VdW interaction:            CubicSplineTable
+  * Geometry:                   Water4-Particle
+  * Calculate force/pot:        Force
+  *
+  * For every i entry of the neighbor list this kernel loads four i-atom
+  * positions (suffixes 0..3) and walks the j atoms two at a time, with a
+  * separate single-j pass for an odd leftover.
+  * i atom 0 interacts with each j atom through the tabulated dispersion and
+  * repulsion terms only; i atoms 1..3 interact through plain Coulomb only.
+  * Only forces are accumulated; no potential energy is summed or stored here.
+  *
+  * nlist:       neighbor list; nri, iinr, jindex, jjnr, shift and gid are read
+  * xx:          coordinates, accessed through the real pointer xx[0]
+  * ff:          forces, accessed through the real pointer ff[0]
+  * fr:          supplies shift_vec, fshift, epsfac, ntype and nbfp
+  * mdatoms:     supplies chargeA and typeA
+  * kernel_data: supplies table_vdw (data and scale)
+  * nrnb:        flop counter, incremented once at the end via inc_nrnb()
+  */
+ void
+ nb_kernel_ElecCoul_VdwCSTab_GeomW4P1_F_sparc64_hpc_ace_double
+                     (t_nblist * gmx_restrict                nlist,
+                      rvec * gmx_restrict                    xx,
+                      rvec * gmx_restrict                    ff,
+                      t_forcerec * gmx_restrict              fr,
+                      t_mdatoms * gmx_restrict               mdatoms,
+                      nb_kernel_data_t * gmx_restrict        kernel_data,
+                      t_nrnb * gmx_restrict                  nrnb)
+ {
+     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+      * just 0 for non-waters.
+      * Suffixes A,B refer to the two j atoms handled together in one unrolled j iteration,
+      * i.e. the two jnr indices whose data occupy the two positions of a double precision
+      * SIMD register.
+      *
+      * Note: this is generated code; a number of the locals declared below (for example
+      * one_sixth, one_twelfth, gbconv, ewconv) are not referenced in this kernel.
+      */
+     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+     int              jnrA,jnrB;
+     int              j_coord_offsetA,j_coord_offsetB;
+     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+     real             rcutoff_scalar;
+     real             *shiftvec,*fshift,*x,*f;
+     _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+     int              vdwioffset0;
+     _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+     int              vdwioffset1;
+     _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+     int              vdwioffset2;
+     _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+     int              vdwioffset3;
+     _fjsp_v2r8       ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+     int              vdwjidx0A,vdwjidx0B;
+     _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+     _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+     _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+     _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+     _fjsp_v2r8       dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30;
+     _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+     real             *charge;
+     int              nvdwtype;
+     _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+     int              *vdwtype;
+     real             *vdwparam;
+     _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+     _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+     _fjsp_v2r8       rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF,twovfeps;
+     real             *vftab;
+     _fjsp_v2r8       itab_tmp;
+     _fjsp_v2r8       dummy_mask,cutoff_mask;
+     _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+     _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+     /* vfconv is used to read the two integer table indices back out of a SIMD register */
+     union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+     /* Unpack the neighbor list, force record, atom data and VdW table into locals */
+     x                = xx[0];
+     f                = ff[0];
+     nri              = nlist->nri;
+     iinr             = nlist->iinr;
+     jindex           = nlist->jindex;
+     jjnr             = nlist->jjnr;
+     shiftidx         = nlist->shift;
+     gid              = nlist->gid;
+     shiftvec         = fr->shift_vec[0];
+     fshift           = fr->fshift[0];
+     facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+     charge           = mdatoms->chargeA;
+     nvdwtype         = fr->ntype;
+     vdwparam         = fr->nbfp;
+     vdwtype          = mdatoms->typeA;
+     vftab            = kernel_data->table_vdw->data;
+     vftabscale       = gmx_fjsp_set1_v2r8(kernel_data->table_vdw->scale);
+     /* Setup water-specific parameters.
+      * The charges of i atoms 1..3 (pre-multiplied by epsfac) and the VdW parameter row of
+      * i atom 0 are taken from the first i entry only and reused for every outer iteration;
+      * presumably all i waters in the list share the same parameters - TODO confirm.
+      * NOTE(review): iinr[0] is read here before the outer loop, i.e. also when nri is 0.
+      */
+     inr              = nlist->iinr[0];
+     iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+     iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+     iq3              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+3]));
+     vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+     /* Give these a defined value up front to silence uninitialized-variable compiler warnings */
+     jnrA = jnrB = 0;
+     j_coord_offsetA = 0;
+     j_coord_offsetB = 0;
+     outeriter        = 0;
+     inneriter        = 0;
+     /* Start outer loop over neighborlists */
+     for(iidx=0; iidx<nri; iidx++)
+     {
+         /* Load shift vector for this list */
+         i_shift_offset   = DIM*shiftidx[iidx];
+         /* Load limits for loop over neighbors */
+         j_index_start    = jindex[iidx];
+         j_index_end      = jindex[iidx+1];
+         /* Get outer coordinate index */
+         inr              = iinr[iidx];
+         i_coord_offset   = DIM*inr;
+         /* Load i particle coords and add shift vector */
+         gmx_fjsp_load_shift_and_4rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                  &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+         /* Clear the i-force accumulators for the four i atoms */
+         fix0             = _fjsp_setzero_v2r8();
+         fiy0             = _fjsp_setzero_v2r8();
+         fiz0             = _fjsp_setzero_v2r8();
+         fix1             = _fjsp_setzero_v2r8();
+         fiy1             = _fjsp_setzero_v2r8();
+         fiz1             = _fjsp_setzero_v2r8();
+         fix2             = _fjsp_setzero_v2r8();
+         fiy2             = _fjsp_setzero_v2r8();
+         fiz2             = _fjsp_setzero_v2r8();
+         fix3             = _fjsp_setzero_v2r8();
+         fiy3             = _fjsp_setzero_v2r8();
+         fiz3             = _fjsp_setzero_v2r8();
+         /* Start inner kernel loop: two j atoms (A and B) per iteration.
+          * The bound j_index_end-1 leaves a possible odd last j atom to the block after the loop.
+          */
+         for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+         {
+             /* Get j neighbor index, and coordinate index */
+             jnrA             = jjnr[jidx];
+             jnrB             = jjnr[jidx+1];
+             j_coord_offsetA  = DIM*jnrA;
+             j_coord_offsetB  = DIM*jnrB;
+             /* load j atom coordinates */
+             gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                               &jx0,&jy0,&jz0);
+             /* Calculate displacement vector */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             dx10             = _fjsp_sub_v2r8(ix1,jx0);
+             dy10             = _fjsp_sub_v2r8(iy1,jy0);
+             dz10             = _fjsp_sub_v2r8(iz1,jz0);
+             dx20             = _fjsp_sub_v2r8(ix2,jx0);
+             dy20             = _fjsp_sub_v2r8(iy2,jy0);
+             dz20             = _fjsp_sub_v2r8(iz2,jz0);
+             dx30             = _fjsp_sub_v2r8(ix3,jx0);
+             dy30             = _fjsp_sub_v2r8(iy3,jy0);
+             dz30             = _fjsp_sub_v2r8(iz3,jz0);
+             /* Calculate squared distance and things based on it */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+             rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+             rsq30            = gmx_fjsp_calc_rsq_v2r8(dx30,dy30,dz30);
+             rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+             rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+             rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+             rinv30           = gmx_fjsp_invsqrt_v2r8(rsq30);
+             /* rinvsq is only formed for the Coulomb pairs (i atoms 1..3) */
+             rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+             rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+             rinvsq30         = _fjsp_mul_v2r8(rinv30,rinv30);
+             /* Load parameters for j particles */
+             jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+             vdwjidx0A        = 2*vdwtype[jnrA+0];
+             vdwjidx0B        = 2*vdwtype[jnrB+0];
+             fjx0             = _fjsp_setzero_v2r8();
+             fjy0             = _fjsp_setzero_v2r8();
+             fjz0             = _fjsp_setzero_v2r8();
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* Pair (i atom 0, j): tabulated VdW only */
+             r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+             /* Compute parameters for interactions between i and j atoms */
+             gmx_fjsp_load_2pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,
+                                          vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
+             /* Calculate table index by multiplying r with table scale and truncate to integer */
+             rt               = _fjsp_mul_v2r8(r00,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             /* Table stride is 8 reals per point: offsets 0..3 are read for dispersion
+              * and offsets 4..7 for repulsion below.
+              */
+             vfconv.i[0]     *= 8;
+             vfconv.i[1]     *= 8;
+             /* CUBIC SPLINE TABLE DISPERSION */
+             /* One load per j atom, then a transpose; presumably this leaves Y,F (and G,H)
+              * holding the same coefficient for both j atoms - TODO confirm macro semantics.
+              */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 2 );
+             H                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 2 );
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             /* Force-only path: Y is loaded but the potential value VV is not evaluated */
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+             FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+             fvdw6            = _fjsp_mul_v2r8(c6_00,FF);
+             /* CUBIC SPLINE TABLE REPULSION */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 4 );
+             F                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 4 );
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 6 );
+             H                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 6 );
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+             FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+             fvdw12           = _fjsp_mul_v2r8(c12_00,FF);
+             /* Scalar force: negated sum of both table terms, scaled by tabscale and 1/r */
+             fvdw             = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_add_v2r8(fvdw6,fvdw12),_fjsp_mul_v2r8(vftabscale,rinv00)));
+             fscal            = fvdw;
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             
+             fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* Pair (i atom 1, j): Coulomb only */
+             /* Compute parameters for interactions between i and j atoms */
+             qq10             = _fjsp_mul_v2r8(iq1,jq0);
+             /* COULOMB ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq10,rinv10);
+             felec            = _fjsp_mul_v2r8(velec,rinvsq10);
+             fscal            = felec;
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+             
+             fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* Pair (i atom 2, j): Coulomb only */
+             /* Compute parameters for interactions between i and j atoms */
+             qq20             = _fjsp_mul_v2r8(iq2,jq0);
+             /* COULOMB ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq20,rinv20);
+             felec            = _fjsp_mul_v2r8(velec,rinvsq20);
+             fscal            = felec;
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+             
+             fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* Pair (i atom 3, j): Coulomb only */
+             /* Compute parameters for interactions between i and j atoms */
+             qq30             = _fjsp_mul_v2r8(iq3,jq0);
+             /* COULOMB ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq30,rinv30);
+             felec            = _fjsp_mul_v2r8(velec,rinvsq30);
+             fscal            = felec;
+             /* Update vectorial force */
+             fix3             = _fjsp_madd_v2r8(dx30,fscal,fix3);
+             fiy3             = _fjsp_madd_v2r8(dy30,fscal,fiy3);
+             fiz3             = _fjsp_madd_v2r8(dz30,fscal,fiz3);
+             
+             fjx0             = _fjsp_madd_v2r8(dx30,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy30,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz30,fscal,fjz0);
+             /* Hand the summed j contributions for atoms A and B to the force array helper */
+             gmx_fjsp_decrement_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0);
+             /* Inner loop uses 144 flops */
+         }
+         /* Remainder: a single leftover j atom (A only) when the j count is odd.
+          * Same sequence as the paired loop, but table coefficients for the second
+          * register position are set to zero and every scalar force is passed through
+          * unpacklo with zero before use, which presumably clears the unused upper
+          * position - TODO confirm intrinsic semantics.
+          */
+         if(jidx<j_index_end)
+         {
+             jnrA             = jjnr[jidx];
+             j_coord_offsetA  = DIM*jnrA;
+             /* load j atom coordinates */
+             gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                               &jx0,&jy0,&jz0);
+             /* Calculate displacement vector */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             dx10             = _fjsp_sub_v2r8(ix1,jx0);
+             dy10             = _fjsp_sub_v2r8(iy1,jy0);
+             dz10             = _fjsp_sub_v2r8(iz1,jz0);
+             dx20             = _fjsp_sub_v2r8(ix2,jx0);
+             dy20             = _fjsp_sub_v2r8(iy2,jy0);
+             dz20             = _fjsp_sub_v2r8(iz2,jz0);
+             dx30             = _fjsp_sub_v2r8(ix3,jx0);
+             dy30             = _fjsp_sub_v2r8(iy3,jy0);
+             dz30             = _fjsp_sub_v2r8(iz3,jz0);
+             /* Calculate squared distance and things based on it */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+             rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+             rsq30            = gmx_fjsp_calc_rsq_v2r8(dx30,dy30,dz30);
+             rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+             rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+             rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+             rinv30           = gmx_fjsp_invsqrt_v2r8(rsq30);
+             rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+             rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+             rinvsq30         = _fjsp_mul_v2r8(rinv30,rinv30);
+             /* Load parameters for j particles */
+             jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
+             vdwjidx0A        = 2*vdwtype[jnrA+0];
+             fjx0             = _fjsp_setzero_v2r8();
+             fjy0             = _fjsp_setzero_v2r8();
+             fjz0             = _fjsp_setzero_v2r8();
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* Pair (i atom 0, j): tabulated VdW only */
+             r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+             /* Compute parameters for interactions between i and j atoms */
+             gmx_fjsp_load_1pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
+             /* Calculate table index by multiplying r with table scale and truncate to integer */
+             rt               = _fjsp_mul_v2r8(r00,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 8;
+             /* vfconv.i[1] is scaled as in the paired loop but is not used for any load in this branch */
+             vfconv.i[1]     *= 8;
+             /* CUBIC SPLINE TABLE DISPERSION */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 2 );
+             H                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+             FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+             fvdw6            = _fjsp_mul_v2r8(c6_00,FF);
+             /* CUBIC SPLINE TABLE REPULSION */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 4 );
+             F                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 6 );
+             H                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+             FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+             fvdw12           = _fjsp_mul_v2r8(c12_00,FF);
+             fvdw             = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_add_v2r8(fvdw6,fvdw12),_fjsp_mul_v2r8(vftabscale,rinv00)));
+             fscal            = fvdw;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             
+             fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* Pair (i atom 1, j): Coulomb only */
+             /* Compute parameters for interactions between i and j atoms */
+             qq10             = _fjsp_mul_v2r8(iq1,jq0);
+             /* COULOMB ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq10,rinv10);
+             felec            = _fjsp_mul_v2r8(velec,rinvsq10);
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+             
+             fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* Pair (i atom 2, j): Coulomb only */
+             /* Compute parameters for interactions between i and j atoms */
+             qq20             = _fjsp_mul_v2r8(iq2,jq0);
+             /* COULOMB ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq20,rinv20);
+             felec            = _fjsp_mul_v2r8(velec,rinvsq20);
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+             
+             fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* Pair (i atom 3, j): Coulomb only */
+             /* Compute parameters for interactions between i and j atoms */
+             qq30             = _fjsp_mul_v2r8(iq3,jq0);
+             /* COULOMB ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq30,rinv30);
+             felec            = _fjsp_mul_v2r8(velec,rinvsq30);
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix3             = _fjsp_madd_v2r8(dx30,fscal,fix3);
+             fiy3             = _fjsp_madd_v2r8(dy30,fscal,fiy3);
+             fiz3             = _fjsp_madd_v2r8(dz30,fscal,fiz3);
+             
+             fjx0             = _fjsp_madd_v2r8(dx30,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy30,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz30,fscal,fjz0);
+             gmx_fjsp_decrement_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0);
+             /* Inner loop uses 144 flops */
+         }
+         /* End of innermost loop */
+         /* Pass the accumulated i forces, the i force location and the shift-force location to the update helper */
+         gmx_fjsp_update_iforce_4atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                               f+i_coord_offset,fshift+i_shift_offset);
+         /* Increment number of inner iterations */
+         inneriter                  += j_index_end - j_index_start;
+         /* Outer loop uses 24 flops */
+     }
+     /* Increment number of outer iterations */
+     outeriter        += nri;
+     /* Update outer/inner flops */
+     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_F,outeriter*24 + inneriter*144);
+ }
index 0000000000000000000000000000000000000000,54f9d59a45bc316935dd9d97578f15c3db68e9d1..54f9d59a45bc316935dd9d97578f15c3db68e9d1
mode 000000,100644..100644
--- /dev/null
@@@ -1,0 -1,1791 +1,1791 @@@
+ /*
+  * This file is part of the GROMACS molecular simulation package.
+  *
+  * Copyright (c) 2012, by the GROMACS development team, led by
+  * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+  * others, as listed in the AUTHORS file in the top-level source
+  * directory and at http://www.gromacs.org.
+  *
+  * GROMACS is free software; you can redistribute it and/or
+  * modify it under the terms of the GNU Lesser General Public License
+  * as published by the Free Software Foundation; either version 2.1
+  * of the License, or (at your option) any later version.
+  *
+  * GROMACS is distributed in the hope that it will be useful,
+  * but WITHOUT ANY WARRANTY; without even the implied warranty of
+  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  * Lesser General Public License for more details.
+  *
+  * You should have received a copy of the GNU Lesser General Public
+  * License along with GROMACS; if not, see
+  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+  *
+  * If you want to redistribute modifications to GROMACS, please
+  * consider that scientific software is very special. Version
+  * control is crucial - bugs must be traceable. We will be happy to
+  * consider code for inclusion in the official distribution, but
+  * derived work must not be called official GROMACS. Details are found
+  * in the README & COPYING files - if they are missing, get the
+  * official version at http://www.gromacs.org.
+  *
+  * To help us fund GROMACS development, we humbly ask that you cite
+  * the research papers on the package. Check out http://www.gromacs.org.
+  */
+ /*
+  * Note: this file was generated by the GROMACS sparc64_hpc_ace_double kernel generator.
+  */
+ #ifdef HAVE_CONFIG_H
+ #include <config.h>
+ #endif
+ #include <math.h>
+ #include "../nb_kernel.h"
+ #include "types/simple.h"
+ #include "vec.h"
+ #include "nrnb.h"
+ #include "kernelutil_sparc64_hpc_ace_double.h"
+ /*
+  * Gromacs nonbonded kernel:   nb_kernel_ElecCoul_VdwCSTab_GeomW4W4_VF_sparc64_hpc_ace_double
+  * Electrostatics interaction: Coulomb
+  * VdW interaction:            CubicSplineTable
+  * Geometry:                   Water4-Water4
+  * Calculate force/pot:        PotentialAndForce
+  */
+ void
+ nb_kernel_ElecCoul_VdwCSTab_GeomW4W4_VF_sparc64_hpc_ace_double
+                     (t_nblist * gmx_restrict                nlist,
+                      rvec * gmx_restrict                    xx,
+                      rvec * gmx_restrict                    ff,
+                      t_forcerec * gmx_restrict              fr,
+                      t_mdatoms * gmx_restrict               mdatoms,
+                      nb_kernel_data_t * gmx_restrict        kernel_data,
+                      t_nrnb * gmx_restrict                  nrnb)
+ {
+     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+      * just 0 for non-waters.
+      * Suffixes A,B refer to j loop unrolling done with double precision SIMD, i.e. the two different
+      * jnr indices corresponding to data put in the two positions in the SIMD register.
+      */
+     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+     int              jnrA,jnrB;
+     int              j_coord_offsetA,j_coord_offsetB;
+     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+     real             rcutoff_scalar;
+     real             *shiftvec,*fshift,*x,*f;
+     _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+     int              vdwioffset0;
+     _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+     int              vdwioffset1;
+     _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+     int              vdwioffset2;
+     _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+     int              vdwioffset3;
+     _fjsp_v2r8       ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+     int              vdwjidx0A,vdwjidx0B;
+     _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+     int              vdwjidx1A,vdwjidx1B;
+     _fjsp_v2r8       jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+     int              vdwjidx2A,vdwjidx2B;
+     _fjsp_v2r8       jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+     int              vdwjidx3A,vdwjidx3B;
+     _fjsp_v2r8       jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
+     _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+     _fjsp_v2r8       dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+     _fjsp_v2r8       dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+     _fjsp_v2r8       dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
+     _fjsp_v2r8       dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+     _fjsp_v2r8       dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+     _fjsp_v2r8       dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
+     _fjsp_v2r8       dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
+     _fjsp_v2r8       dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
+     _fjsp_v2r8       dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
+     _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+     real             *charge;
+     int              nvdwtype;
+     _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+     int              *vdwtype;
+     real             *vdwparam;
+     _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+     _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+     _fjsp_v2r8       rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF,twovfeps;
+     real             *vftab;
+     _fjsp_v2r8       itab_tmp;
+     _fjsp_v2r8       dummy_mask,cutoff_mask;
+     _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+     _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+     union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+     x                = xx[0];
+     f                = ff[0];
+     nri              = nlist->nri;
+     iinr             = nlist->iinr;
+     jindex           = nlist->jindex;
+     jjnr             = nlist->jjnr;
+     shiftidx         = nlist->shift;
+     gid              = nlist->gid;
+     shiftvec         = fr->shift_vec[0];
+     fshift           = fr->fshift[0];
+     facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+     charge           = mdatoms->chargeA;
+     nvdwtype         = fr->ntype;
+     vdwparam         = fr->nbfp;
+     vdwtype          = mdatoms->typeA;
+     vftab            = kernel_data->table_vdw->data;
+     vftabscale       = gmx_fjsp_set1_v2r8(kernel_data->table_vdw->scale);
+     /* Setup water-specific parameters */
+     inr              = nlist->iinr[0];
+     iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+     iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+     iq3              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+3]));
+     vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+     jq1              = gmx_fjsp_set1_v2r8(charge[inr+1]);
+     jq2              = gmx_fjsp_set1_v2r8(charge[inr+2]);
+     jq3              = gmx_fjsp_set1_v2r8(charge[inr+3]);
+     vdwjidx0A        = 2*vdwtype[inr+0];
+     c6_00            = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset0+vdwjidx0A]);
+     c12_00           = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset0+vdwjidx0A+1]);
+     qq11             = _fjsp_mul_v2r8(iq1,jq1);
+     qq12             = _fjsp_mul_v2r8(iq1,jq2);
+     qq13             = _fjsp_mul_v2r8(iq1,jq3);
+     qq21             = _fjsp_mul_v2r8(iq2,jq1);
+     qq22             = _fjsp_mul_v2r8(iq2,jq2);
+     qq23             = _fjsp_mul_v2r8(iq2,jq3);
+     qq31             = _fjsp_mul_v2r8(iq3,jq1);
+     qq32             = _fjsp_mul_v2r8(iq3,jq2);
+     qq33             = _fjsp_mul_v2r8(iq3,jq3);
+     /* Avoid stupid compiler warnings */
+     jnrA = jnrB = 0;
+     j_coord_offsetA = 0;
+     j_coord_offsetB = 0;
+     outeriter        = 0;
+     inneriter        = 0;
+     /* Start outer loop over neighborlists */
+     for(iidx=0; iidx<nri; iidx++)
+     {
+         /* Load shift vector for this list */
+         i_shift_offset   = DIM*shiftidx[iidx];
+         /* Load limits for loop over neighbors */
+         j_index_start    = jindex[iidx];
+         j_index_end      = jindex[iidx+1];
+         /* Get outer coordinate index */
+         inr              = iinr[iidx];
+         i_coord_offset   = DIM*inr;
+         /* Load i particle coords and add shift vector */
+         gmx_fjsp_load_shift_and_4rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                  &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+         fix0             = _fjsp_setzero_v2r8();
+         fiy0             = _fjsp_setzero_v2r8();
+         fiz0             = _fjsp_setzero_v2r8();
+         fix1             = _fjsp_setzero_v2r8();
+         fiy1             = _fjsp_setzero_v2r8();
+         fiz1             = _fjsp_setzero_v2r8();
+         fix2             = _fjsp_setzero_v2r8();
+         fiy2             = _fjsp_setzero_v2r8();
+         fiz2             = _fjsp_setzero_v2r8();
+         fix3             = _fjsp_setzero_v2r8();
+         fiy3             = _fjsp_setzero_v2r8();
+         fiz3             = _fjsp_setzero_v2r8();
+         /* Reset potential sums */
+         velecsum         = _fjsp_setzero_v2r8();
+         vvdwsum          = _fjsp_setzero_v2r8();
+         /* Start inner kernel loop */
+         for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+         {
+             /* Get j neighbor index, and coordinate index */
+             jnrA             = jjnr[jidx];
+             jnrB             = jjnr[jidx+1];
+             j_coord_offsetA  = DIM*jnrA;
+             j_coord_offsetB  = DIM*jnrB;
+             /* load j atom coordinates */
+             gmx_fjsp_load_4rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
+                                               &jy2,&jz2,&jx3,&jy3,&jz3);
+             /* Calculate displacement vector */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             dx11             = _fjsp_sub_v2r8(ix1,jx1);
+             dy11             = _fjsp_sub_v2r8(iy1,jy1);
+             dz11             = _fjsp_sub_v2r8(iz1,jz1);
+             dx12             = _fjsp_sub_v2r8(ix1,jx2);
+             dy12             = _fjsp_sub_v2r8(iy1,jy2);
+             dz12             = _fjsp_sub_v2r8(iz1,jz2);
+             dx13             = _fjsp_sub_v2r8(ix1,jx3);
+             dy13             = _fjsp_sub_v2r8(iy1,jy3);
+             dz13             = _fjsp_sub_v2r8(iz1,jz3);
+             dx21             = _fjsp_sub_v2r8(ix2,jx1);
+             dy21             = _fjsp_sub_v2r8(iy2,jy1);
+             dz21             = _fjsp_sub_v2r8(iz2,jz1);
+             dx22             = _fjsp_sub_v2r8(ix2,jx2);
+             dy22             = _fjsp_sub_v2r8(iy2,jy2);
+             dz22             = _fjsp_sub_v2r8(iz2,jz2);
+             dx23             = _fjsp_sub_v2r8(ix2,jx3);
+             dy23             = _fjsp_sub_v2r8(iy2,jy3);
+             dz23             = _fjsp_sub_v2r8(iz2,jz3);
+             dx31             = _fjsp_sub_v2r8(ix3,jx1);
+             dy31             = _fjsp_sub_v2r8(iy3,jy1);
+             dz31             = _fjsp_sub_v2r8(iz3,jz1);
+             dx32             = _fjsp_sub_v2r8(ix3,jx2);
+             dy32             = _fjsp_sub_v2r8(iy3,jy2);
+             dz32             = _fjsp_sub_v2r8(iz3,jz2);
+             dx33             = _fjsp_sub_v2r8(ix3,jx3);
+             dy33             = _fjsp_sub_v2r8(iy3,jy3);
+             dz33             = _fjsp_sub_v2r8(iz3,jz3);
+             /* Calculate squared distance and things based on it */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+             rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+             rsq13            = gmx_fjsp_calc_rsq_v2r8(dx13,dy13,dz13);
+             rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+             rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+             rsq23            = gmx_fjsp_calc_rsq_v2r8(dx23,dy23,dz23);
+             rsq31            = gmx_fjsp_calc_rsq_v2r8(dx31,dy31,dz31);
+             rsq32            = gmx_fjsp_calc_rsq_v2r8(dx32,dy32,dz32);
+             rsq33            = gmx_fjsp_calc_rsq_v2r8(dx33,dy33,dz33);
+             rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+             rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+             rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+             rinv13           = gmx_fjsp_invsqrt_v2r8(rsq13);
+             rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+             rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+             rinv23           = gmx_fjsp_invsqrt_v2r8(rsq23);
+             rinv31           = gmx_fjsp_invsqrt_v2r8(rsq31);
+             rinv32           = gmx_fjsp_invsqrt_v2r8(rsq32);
+             rinv33           = gmx_fjsp_invsqrt_v2r8(rsq33);
+             rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+             rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+             rinvsq13         = _fjsp_mul_v2r8(rinv13,rinv13);
+             rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+             rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+             rinvsq23         = _fjsp_mul_v2r8(rinv23,rinv23);
+             rinvsq31         = _fjsp_mul_v2r8(rinv31,rinv31);
+             rinvsq32         = _fjsp_mul_v2r8(rinv32,rinv32);
+             rinvsq33         = _fjsp_mul_v2r8(rinv33,rinv33);
+             fjx0             = _fjsp_setzero_v2r8();
+             fjy0             = _fjsp_setzero_v2r8();
+             fjz0             = _fjsp_setzero_v2r8();
+             fjx1             = _fjsp_setzero_v2r8();
+             fjy1             = _fjsp_setzero_v2r8();
+             fjz1             = _fjsp_setzero_v2r8();
+             fjx2             = _fjsp_setzero_v2r8();
+             fjy2             = _fjsp_setzero_v2r8();
+             fjz2             = _fjsp_setzero_v2r8();
+             fjx3             = _fjsp_setzero_v2r8();
+             fjy3             = _fjsp_setzero_v2r8();
+             fjz3             = _fjsp_setzero_v2r8();
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+             /* Calculate table index by multiplying r with table scale and truncate to integer */
+             rt               = _fjsp_mul_v2r8(r00,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 8;
+             vfconv.i[1]     *= 8;
+             /* CUBIC SPLINE TABLE DISPERSION */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 2 );
+             H                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 2 );
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+             VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+             vvdw6            = _fjsp_mul_v2r8(c6_00,VV);
+             FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+             fvdw6            = _fjsp_mul_v2r8(c6_00,FF);
+             /* CUBIC SPLINE TABLE REPULSION */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 4 );
+             F                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 4 );
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 6 );
+             H                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 6 );
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+             VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+             vvdw12           = _fjsp_mul_v2r8(c12_00,VV);
+             FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+             fvdw12           = _fjsp_mul_v2r8(c12_00,FF);
+             vvdw             = _fjsp_add_v2r8(vvdw12,vvdw6);
+             fvdw             = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_add_v2r8(fvdw6,fvdw12),_fjsp_mul_v2r8(vftabscale,rinv00)));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+             fscal            = fvdw;
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             
+             fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* COULOMB ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq11,rinv11);
+             felec            = _fjsp_mul_v2r8(velec,rinvsq11);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+             
+             fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* COULOMB ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq12,rinv12);
+             felec            = _fjsp_mul_v2r8(velec,rinvsq12);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+             
+             fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* COULOMB ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq13,rinv13);
+             felec            = _fjsp_mul_v2r8(velec,rinvsq13);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx13,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy13,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz13,fscal,fiz1);
+             
+             fjx3             = _fjsp_madd_v2r8(dx13,fscal,fjx3);
+             fjy3             = _fjsp_madd_v2r8(dy13,fscal,fjy3);
+             fjz3             = _fjsp_madd_v2r8(dz13,fscal,fjz3);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* COULOMB ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq21,rinv21);
+             felec            = _fjsp_mul_v2r8(velec,rinvsq21);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+             
+             fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* COULOMB ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq22,rinv22);
+             felec            = _fjsp_mul_v2r8(velec,rinvsq22);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+             
+             fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* COULOMB ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq23,rinv23);
+             felec            = _fjsp_mul_v2r8(velec,rinvsq23);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx23,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy23,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz23,fscal,fiz2);
+             
+             fjx3             = _fjsp_madd_v2r8(dx23,fscal,fjx3);
+             fjy3             = _fjsp_madd_v2r8(dy23,fscal,fjy3);
+             fjz3             = _fjsp_madd_v2r8(dz23,fscal,fjz3);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* COULOMB ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq31,rinv31);
+             felec            = _fjsp_mul_v2r8(velec,rinvsq31);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             /* Update vectorial force */
+             fix3             = _fjsp_madd_v2r8(dx31,fscal,fix3);
+             fiy3             = _fjsp_madd_v2r8(dy31,fscal,fiy3);
+             fiz3             = _fjsp_madd_v2r8(dz31,fscal,fiz3);
+             
+             fjx1             = _fjsp_madd_v2r8(dx31,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy31,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz31,fscal,fjz1);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* COULOMB ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq32,rinv32);
+             felec            = _fjsp_mul_v2r8(velec,rinvsq32);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             /* Update vectorial force */
+             fix3             = _fjsp_madd_v2r8(dx32,fscal,fix3);
+             fiy3             = _fjsp_madd_v2r8(dy32,fscal,fiy3);
+             fiz3             = _fjsp_madd_v2r8(dz32,fscal,fiz3);
+             
+             fjx2             = _fjsp_madd_v2r8(dx32,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy32,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz32,fscal,fjz2);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* COULOMB ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq33,rinv33);
+             felec            = _fjsp_mul_v2r8(velec,rinvsq33);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             /* Update vectorial force */
+             fix3             = _fjsp_madd_v2r8(dx33,fscal,fix3);
+             fiy3             = _fjsp_madd_v2r8(dy33,fscal,fiy3);
+             fiz3             = _fjsp_madd_v2r8(dz33,fscal,fiz3);
+             
+             fjx3             = _fjsp_madd_v2r8(dx33,fscal,fjx3);
+             fjy3             = _fjsp_madd_v2r8(dy33,fscal,fjy3);
+             fjz3             = _fjsp_madd_v2r8(dz33,fscal,fjz3);
+             gmx_fjsp_decrement_4rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+             /* Inner loop uses 341 flops */
+         }
+         if(jidx<j_index_end)
+         {
+             jnrA             = jjnr[jidx];
+             j_coord_offsetA  = DIM*jnrA;
+             /* load j atom coordinates */
+             gmx_fjsp_load_4rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
+                                               &jy2,&jz2,&jx3,&jy3,&jz3);
+             /* Calculate displacement vector */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             dx11             = _fjsp_sub_v2r8(ix1,jx1);
+             dy11             = _fjsp_sub_v2r8(iy1,jy1);
+             dz11             = _fjsp_sub_v2r8(iz1,jz1);
+             dx12             = _fjsp_sub_v2r8(ix1,jx2);
+             dy12             = _fjsp_sub_v2r8(iy1,jy2);
+             dz12             = _fjsp_sub_v2r8(iz1,jz2);
+             dx13             = _fjsp_sub_v2r8(ix1,jx3);
+             dy13             = _fjsp_sub_v2r8(iy1,jy3);
+             dz13             = _fjsp_sub_v2r8(iz1,jz3);
+             dx21             = _fjsp_sub_v2r8(ix2,jx1);
+             dy21             = _fjsp_sub_v2r8(iy2,jy1);
+             dz21             = _fjsp_sub_v2r8(iz2,jz1);
+             dx22             = _fjsp_sub_v2r8(ix2,jx2);
+             dy22             = _fjsp_sub_v2r8(iy2,jy2);
+             dz22             = _fjsp_sub_v2r8(iz2,jz2);
+             dx23             = _fjsp_sub_v2r8(ix2,jx3);
+             dy23             = _fjsp_sub_v2r8(iy2,jy3);
+             dz23             = _fjsp_sub_v2r8(iz2,jz3);
+             dx31             = _fjsp_sub_v2r8(ix3,jx1);
+             dy31             = _fjsp_sub_v2r8(iy3,jy1);
+             dz31             = _fjsp_sub_v2r8(iz3,jz1);
+             dx32             = _fjsp_sub_v2r8(ix3,jx2);
+             dy32             = _fjsp_sub_v2r8(iy3,jy2);
+             dz32             = _fjsp_sub_v2r8(iz3,jz2);
+             dx33             = _fjsp_sub_v2r8(ix3,jx3);
+             dy33             = _fjsp_sub_v2r8(iy3,jy3);
+             dz33             = _fjsp_sub_v2r8(iz3,jz3);
+             /* Calculate squared distance and things based on it */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+             rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+             rsq13            = gmx_fjsp_calc_rsq_v2r8(dx13,dy13,dz13);
+             rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+             rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+             rsq23            = gmx_fjsp_calc_rsq_v2r8(dx23,dy23,dz23);
+             rsq31            = gmx_fjsp_calc_rsq_v2r8(dx31,dy31,dz31);
+             rsq32            = gmx_fjsp_calc_rsq_v2r8(dx32,dy32,dz32);
+             rsq33            = gmx_fjsp_calc_rsq_v2r8(dx33,dy33,dz33);
+             rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+             rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+             rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+             rinv13           = gmx_fjsp_invsqrt_v2r8(rsq13);
+             rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+             rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+             rinv23           = gmx_fjsp_invsqrt_v2r8(rsq23);
+             rinv31           = gmx_fjsp_invsqrt_v2r8(rsq31);
+             rinv32           = gmx_fjsp_invsqrt_v2r8(rsq32);
+             rinv33           = gmx_fjsp_invsqrt_v2r8(rsq33);
+             rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+             rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+             rinvsq13         = _fjsp_mul_v2r8(rinv13,rinv13);
+             rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+             rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+             rinvsq23         = _fjsp_mul_v2r8(rinv23,rinv23);
+             rinvsq31         = _fjsp_mul_v2r8(rinv31,rinv31);
+             rinvsq32         = _fjsp_mul_v2r8(rinv32,rinv32);
+             rinvsq33         = _fjsp_mul_v2r8(rinv33,rinv33);
+             fjx0             = _fjsp_setzero_v2r8();
+             fjy0             = _fjsp_setzero_v2r8();
+             fjz0             = _fjsp_setzero_v2r8();
+             fjx1             = _fjsp_setzero_v2r8();
+             fjy1             = _fjsp_setzero_v2r8();
+             fjz1             = _fjsp_setzero_v2r8();
+             fjx2             = _fjsp_setzero_v2r8();
+             fjy2             = _fjsp_setzero_v2r8();
+             fjz2             = _fjsp_setzero_v2r8();
+             fjx3             = _fjsp_setzero_v2r8();
+             fjy3             = _fjsp_setzero_v2r8();
+             fjz3             = _fjsp_setzero_v2r8();
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+             /* Calculate table index by multiplying r with table scale and truncate to integer */
+             rt               = _fjsp_mul_v2r8(r00,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 8;
+             vfconv.i[1]     *= 8;
+             /* CUBIC SPLINE TABLE DISPERSION */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 2 );
+             H                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+             VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+             vvdw6            = _fjsp_mul_v2r8(c6_00,VV);
+             FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+             fvdw6            = _fjsp_mul_v2r8(c6_00,FF);
+             /* CUBIC SPLINE TABLE REPULSION */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 4 );
+             F                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 6 );
+             H                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+             VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+             vvdw12           = _fjsp_mul_v2r8(c12_00,VV);
+             FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+             fvdw12           = _fjsp_mul_v2r8(c12_00,FF);
+             vvdw             = _fjsp_add_v2r8(vvdw12,vvdw6);
+             fvdw             = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_add_v2r8(fvdw6,fvdw12),_fjsp_mul_v2r8(vftabscale,rinv00)));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             vvdw             = _fjsp_unpacklo_v2r8(vvdw,_fjsp_setzero_v2r8());
+             vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+             fscal            = fvdw;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             
+             fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* COULOMB ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq11,rinv11);
+             felec            = _fjsp_mul_v2r8(velec,rinvsq11);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+             
+             fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* COULOMB ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq12,rinv12);
+             felec            = _fjsp_mul_v2r8(velec,rinvsq12);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+             
+             fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* COULOMB ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq13,rinv13);
+             felec            = _fjsp_mul_v2r8(velec,rinvsq13);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx13,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy13,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz13,fscal,fiz1);
+             
+             fjx3             = _fjsp_madd_v2r8(dx13,fscal,fjx3);
+             fjy3             = _fjsp_madd_v2r8(dy13,fscal,fjy3);
+             fjz3             = _fjsp_madd_v2r8(dz13,fscal,fjz3);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* COULOMB ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq21,rinv21);
+             felec            = _fjsp_mul_v2r8(velec,rinvsq21);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+             
+             fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* COULOMB ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq22,rinv22);
+             felec            = _fjsp_mul_v2r8(velec,rinvsq22);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+             
+             fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* COULOMB ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq23,rinv23);
+             felec            = _fjsp_mul_v2r8(velec,rinvsq23);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx23,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy23,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz23,fscal,fiz2);
+             
+             fjx3             = _fjsp_madd_v2r8(dx23,fscal,fjx3);
+             fjy3             = _fjsp_madd_v2r8(dy23,fscal,fjy3);
+             fjz3             = _fjsp_madd_v2r8(dz23,fscal,fjz3);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* COULOMB ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq31,rinv31);
+             felec            = _fjsp_mul_v2r8(velec,rinvsq31);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix3             = _fjsp_madd_v2r8(dx31,fscal,fix3);
+             fiy3             = _fjsp_madd_v2r8(dy31,fscal,fiy3);
+             fiz3             = _fjsp_madd_v2r8(dz31,fscal,fiz3);
+             
+             fjx1             = _fjsp_madd_v2r8(dx31,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy31,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz31,fscal,fjz1);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* COULOMB ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq32,rinv32);
+             felec            = _fjsp_mul_v2r8(velec,rinvsq32);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix3             = _fjsp_madd_v2r8(dx32,fscal,fix3);
+             fiy3             = _fjsp_madd_v2r8(dy32,fscal,fiy3);
+             fiz3             = _fjsp_madd_v2r8(dz32,fscal,fiz3);
+             
+             fjx2             = _fjsp_madd_v2r8(dx32,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy32,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz32,fscal,fjz2);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* COULOMB ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq33,rinv33);
+             felec            = _fjsp_mul_v2r8(velec,rinvsq33);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix3             = _fjsp_madd_v2r8(dx33,fscal,fix3);
+             fiy3             = _fjsp_madd_v2r8(dy33,fscal,fiy3);
+             fiz3             = _fjsp_madd_v2r8(dz33,fscal,fiz3);
+             
+             fjx3             = _fjsp_madd_v2r8(dx33,fscal,fjx3);
+             fjy3             = _fjsp_madd_v2r8(dy33,fscal,fjy3);
+             fjz3             = _fjsp_madd_v2r8(dz33,fscal,fjz3);
+             gmx_fjsp_decrement_4rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+             /* Inner loop uses 341 flops */
+         }
+         /* End of innermost loop */
+         gmx_fjsp_update_iforce_4atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                               f+i_coord_offset,fshift+i_shift_offset);
+         ggid                        = gid[iidx];
+         /* Update potential energies */
+         gmx_fjsp_update_1pot_v2r8(velecsum,kernel_data->energygrp_elec+ggid);
+         gmx_fjsp_update_1pot_v2r8(vvdwsum,kernel_data->energygrp_vdw+ggid);
+         /* Increment number of inner iterations */
+         inneriter                  += j_index_end - j_index_start;
+         /* Outer loop uses 26 flops */
+     }
+     /* Increment number of outer iterations */
+     outeriter        += nri;
+     /* Update outer/inner flops */
+     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_VF,outeriter*26 + inneriter*341);
+ }
+ /*
+  * Gromacs nonbonded kernel:   nb_kernel_ElecCoul_VdwCSTab_GeomW4W4_F_sparc64_hpc_ace_double
+  * Electrostatics interaction: Coulomb
+  * VdW interaction:            CubicSplineTable
+  * Geometry:                   Water4-Water4
+  * Calculate force/pot:        Force
+  */
+ void
+ nb_kernel_ElecCoul_VdwCSTab_GeomW4W4_F_sparc64_hpc_ace_double
+                     (t_nblist * gmx_restrict                nlist,
+                      rvec * gmx_restrict                    xx,
+                      rvec * gmx_restrict                    ff,
+                      t_forcerec * gmx_restrict              fr,
+                      t_mdatoms * gmx_restrict               mdatoms,
+                      nb_kernel_data_t * gmx_restrict        kernel_data,
+                      t_nrnb * gmx_restrict                  nrnb)
+ {
+     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+      * just 0 for non-waters.
+      * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+      * jnr indices corresponding to data put in the two positions in the SIMD register.
+      */
+     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+     int              jnrA,jnrB;
+     int              j_coord_offsetA,j_coord_offsetB;
+     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+     real             rcutoff_scalar;
+     real             *shiftvec,*fshift,*x,*f;
+     _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+     int              vdwioffset0;
+     _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+     int              vdwioffset1;
+     _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+     int              vdwioffset2;
+     _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+     int              vdwioffset3;
+     _fjsp_v2r8       ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+     int              vdwjidx0A,vdwjidx0B;
+     _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+     int              vdwjidx1A,vdwjidx1B;
+     _fjsp_v2r8       jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+     int              vdwjidx2A,vdwjidx2B;
+     _fjsp_v2r8       jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+     int              vdwjidx3A,vdwjidx3B;
+     _fjsp_v2r8       jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
+     _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+     _fjsp_v2r8       dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+     _fjsp_v2r8       dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+     _fjsp_v2r8       dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
+     _fjsp_v2r8       dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+     _fjsp_v2r8       dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+     _fjsp_v2r8       dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
+     _fjsp_v2r8       dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
+     _fjsp_v2r8       dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
+     _fjsp_v2r8       dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
+     _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+     real             *charge;
+     int              nvdwtype;
+     _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+     int              *vdwtype;
+     real             *vdwparam;
+     _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+     _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+     _fjsp_v2r8       rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF,twovfeps;
+     real             *vftab;
+     _fjsp_v2r8       itab_tmp;
+     _fjsp_v2r8       dummy_mask,cutoff_mask;
+     _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+     _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+     union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+     x                = xx[0];
+     f                = ff[0];
+     nri              = nlist->nri;
+     iinr             = nlist->iinr;
+     jindex           = nlist->jindex;
+     jjnr             = nlist->jjnr;
+     shiftidx         = nlist->shift;
+     gid              = nlist->gid;
+     shiftvec         = fr->shift_vec[0];
+     fshift           = fr->fshift[0];
+     facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+     charge           = mdatoms->chargeA;
+     nvdwtype         = fr->ntype;
+     vdwparam         = fr->nbfp;
+     vdwtype          = mdatoms->typeA;
+     vftab            = kernel_data->table_vdw->data;
+     vftabscale       = gmx_fjsp_set1_v2r8(kernel_data->table_vdw->scale);
+     /* Setup water-specific parameters */
+     inr              = nlist->iinr[0];
+     iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+     iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+     iq3              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+3]));
+     vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+     jq1              = gmx_fjsp_set1_v2r8(charge[inr+1]);
+     jq2              = gmx_fjsp_set1_v2r8(charge[inr+2]);
+     jq3              = gmx_fjsp_set1_v2r8(charge[inr+3]);
+     vdwjidx0A        = 2*vdwtype[inr+0];
+     c6_00            = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset0+vdwjidx0A]);
+     c12_00           = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset0+vdwjidx0A+1]);
+     qq11             = _fjsp_mul_v2r8(iq1,jq1);
+     qq12             = _fjsp_mul_v2r8(iq1,jq2);
+     qq13             = _fjsp_mul_v2r8(iq1,jq3);
+     qq21             = _fjsp_mul_v2r8(iq2,jq1);
+     qq22             = _fjsp_mul_v2r8(iq2,jq2);
+     qq23             = _fjsp_mul_v2r8(iq2,jq3);
+     qq31             = _fjsp_mul_v2r8(iq3,jq1);
+     qq32             = _fjsp_mul_v2r8(iq3,jq2);
+     qq33             = _fjsp_mul_v2r8(iq3,jq3);
+     /* Avoid stupid compiler warnings */
+     jnrA = jnrB = 0;
+     j_coord_offsetA = 0;
+     j_coord_offsetB = 0;
+     outeriter        = 0;
+     inneriter        = 0;
+     /* Start outer loop over neighborlists */
+     for(iidx=0; iidx<nri; iidx++)
+     {
+         /* Load shift vector for this list */
+         i_shift_offset   = DIM*shiftidx[iidx];
+         /* Load limits for loop over neighbors */
+         j_index_start    = jindex[iidx];
+         j_index_end      = jindex[iidx+1];
+         /* Get outer coordinate index */
+         inr              = iinr[iidx];
+         i_coord_offset   = DIM*inr;
+         /* Load i particle coords and add shift vector */
+         gmx_fjsp_load_shift_and_4rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                  &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+         fix0             = _fjsp_setzero_v2r8();
+         fiy0             = _fjsp_setzero_v2r8();
+         fiz0             = _fjsp_setzero_v2r8();
+         fix1             = _fjsp_setzero_v2r8();
+         fiy1             = _fjsp_setzero_v2r8();
+         fiz1             = _fjsp_setzero_v2r8();
+         fix2             = _fjsp_setzero_v2r8();
+         fiy2             = _fjsp_setzero_v2r8();
+         fiz2             = _fjsp_setzero_v2r8();
+         fix3             = _fjsp_setzero_v2r8();
+         fiy3             = _fjsp_setzero_v2r8();
+         fiz3             = _fjsp_setzero_v2r8();
+         /* Start inner kernel loop */
+         for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+         {
+             /* Get j neighbor index, and coordinate index */
+             jnrA             = jjnr[jidx];
+             jnrB             = jjnr[jidx+1];
+             j_coord_offsetA  = DIM*jnrA;
+             j_coord_offsetB  = DIM*jnrB;
+             /* load j atom coordinates */
+             gmx_fjsp_load_4rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
+                                               &jy2,&jz2,&jx3,&jy3,&jz3);
+             /* Calculate displacement vector */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             dx11             = _fjsp_sub_v2r8(ix1,jx1);
+             dy11             = _fjsp_sub_v2r8(iy1,jy1);
+             dz11             = _fjsp_sub_v2r8(iz1,jz1);
+             dx12             = _fjsp_sub_v2r8(ix1,jx2);
+             dy12             = _fjsp_sub_v2r8(iy1,jy2);
+             dz12             = _fjsp_sub_v2r8(iz1,jz2);
+             dx13             = _fjsp_sub_v2r8(ix1,jx3);
+             dy13             = _fjsp_sub_v2r8(iy1,jy3);
+             dz13             = _fjsp_sub_v2r8(iz1,jz3);
+             dx21             = _fjsp_sub_v2r8(ix2,jx1);
+             dy21             = _fjsp_sub_v2r8(iy2,jy1);
+             dz21             = _fjsp_sub_v2r8(iz2,jz1);
+             dx22             = _fjsp_sub_v2r8(ix2,jx2);
+             dy22             = _fjsp_sub_v2r8(iy2,jy2);
+             dz22             = _fjsp_sub_v2r8(iz2,jz2);
+             dx23             = _fjsp_sub_v2r8(ix2,jx3);
+             dy23             = _fjsp_sub_v2r8(iy2,jy3);
+             dz23             = _fjsp_sub_v2r8(iz2,jz3);
+             dx31             = _fjsp_sub_v2r8(ix3,jx1);
+             dy31             = _fjsp_sub_v2r8(iy3,jy1);
+             dz31             = _fjsp_sub_v2r8(iz3,jz1);
+             dx32             = _fjsp_sub_v2r8(ix3,jx2);
+             dy32             = _fjsp_sub_v2r8(iy3,jy2);
+             dz32             = _fjsp_sub_v2r8(iz3,jz2);
+             dx33             = _fjsp_sub_v2r8(ix3,jx3);
+             dy33             = _fjsp_sub_v2r8(iy3,jy3);
+             dz33             = _fjsp_sub_v2r8(iz3,jz3);
+             /* Calculate squared distance and things based on it */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+             rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+             rsq13            = gmx_fjsp_calc_rsq_v2r8(dx13,dy13,dz13);
+             rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+             rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+             rsq23            = gmx_fjsp_calc_rsq_v2r8(dx23,dy23,dz23);
+             rsq31            = gmx_fjsp_calc_rsq_v2r8(dx31,dy31,dz31);
+             rsq32            = gmx_fjsp_calc_rsq_v2r8(dx32,dy32,dz32);
+             rsq33            = gmx_fjsp_calc_rsq_v2r8(dx33,dy33,dz33);
+             rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+             rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+             rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+             rinv13           = gmx_fjsp_invsqrt_v2r8(rsq13);
+             rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+             rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+             rinv23           = gmx_fjsp_invsqrt_v2r8(rsq23);
+             rinv31           = gmx_fjsp_invsqrt_v2r8(rsq31);
+             rinv32           = gmx_fjsp_invsqrt_v2r8(rsq32);
+             rinv33           = gmx_fjsp_invsqrt_v2r8(rsq33);
+             rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+             rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+             rinvsq13         = _fjsp_mul_v2r8(rinv13,rinv13);
+             rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+             rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+             rinvsq23         = _fjsp_mul_v2r8(rinv23,rinv23);
+             rinvsq31         = _fjsp_mul_v2r8(rinv31,rinv31);
+             rinvsq32         = _fjsp_mul_v2r8(rinv32,rinv32);
+             rinvsq33         = _fjsp_mul_v2r8(rinv33,rinv33);
+             fjx0             = _fjsp_setzero_v2r8();
+             fjy0             = _fjsp_setzero_v2r8();
+             fjz0             = _fjsp_setzero_v2r8();
+             fjx1             = _fjsp_setzero_v2r8();
+             fjy1             = _fjsp_setzero_v2r8();
+             fjz1             = _fjsp_setzero_v2r8();
+             fjx2             = _fjsp_setzero_v2r8();
+             fjy2             = _fjsp_setzero_v2r8();
+             fjz2             = _fjsp_setzero_v2r8();
+             fjx3             = _fjsp_setzero_v2r8();
+             fjy3             = _fjsp_setzero_v2r8();
+             fjz3             = _fjsp_setzero_v2r8();
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+             /* Calculate table index by multiplying r with table scale and truncate to integer */
+             rt               = _fjsp_mul_v2r8(r00,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 8;
+             vfconv.i[1]     *= 8;
+             /* CUBIC SPLINE TABLE DISPERSION */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 2 );
+             H                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 2 );
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+             FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+             fvdw6            = _fjsp_mul_v2r8(c6_00,FF);
+             /* CUBIC SPLINE TABLE REPULSION */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 4 );
+             F                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 4 );
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 6 );
+             H                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 6 );
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+             FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+             fvdw12           = _fjsp_mul_v2r8(c12_00,FF);
+             fvdw             = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_add_v2r8(fvdw6,fvdw12),_fjsp_mul_v2r8(vftabscale,rinv00)));
+             fscal            = fvdw;
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             
+             fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* COULOMB ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq11,rinv11);
+             felec            = _fjsp_mul_v2r8(velec,rinvsq11);
+             fscal            = felec;
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+             
+             fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* COULOMB ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq12,rinv12);
+             felec            = _fjsp_mul_v2r8(velec,rinvsq12);
+             fscal            = felec;
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+             
+             fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* COULOMB ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq13,rinv13);
+             felec            = _fjsp_mul_v2r8(velec,rinvsq13);
+             fscal            = felec;
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx13,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy13,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz13,fscal,fiz1);
+             
+             fjx3             = _fjsp_madd_v2r8(dx13,fscal,fjx3);
+             fjy3             = _fjsp_madd_v2r8(dy13,fscal,fjy3);
+             fjz3             = _fjsp_madd_v2r8(dz13,fscal,fjz3);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* COULOMB ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq21,rinv21);
+             felec            = _fjsp_mul_v2r8(velec,rinvsq21);
+             fscal            = felec;
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+             
+             fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* COULOMB ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq22,rinv22);
+             felec            = _fjsp_mul_v2r8(velec,rinvsq22);
+             fscal            = felec;
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+             
+             fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* COULOMB ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq23,rinv23);
+             felec            = _fjsp_mul_v2r8(velec,rinvsq23);
+             fscal            = felec;
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx23,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy23,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz23,fscal,fiz2);
+             
+             fjx3             = _fjsp_madd_v2r8(dx23,fscal,fjx3);
+             fjy3             = _fjsp_madd_v2r8(dy23,fscal,fjy3);
+             fjz3             = _fjsp_madd_v2r8(dz23,fscal,fjz3);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* COULOMB ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq31,rinv31);
+             felec            = _fjsp_mul_v2r8(velec,rinvsq31);
+             fscal            = felec;
+             /* Update vectorial force */
+             fix3             = _fjsp_madd_v2r8(dx31,fscal,fix3);
+             fiy3             = _fjsp_madd_v2r8(dy31,fscal,fiy3);
+             fiz3             = _fjsp_madd_v2r8(dz31,fscal,fiz3);
+             
+             fjx1             = _fjsp_madd_v2r8(dx31,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy31,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz31,fscal,fjz1);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* COULOMB ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq32,rinv32);
+             felec            = _fjsp_mul_v2r8(velec,rinvsq32);
+             fscal            = felec;
+             /* Update vectorial force */
+             fix3             = _fjsp_madd_v2r8(dx32,fscal,fix3);
+             fiy3             = _fjsp_madd_v2r8(dy32,fscal,fiy3);
+             fiz3             = _fjsp_madd_v2r8(dz32,fscal,fiz3);
+             
+             fjx2             = _fjsp_madd_v2r8(dx32,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy32,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz32,fscal,fjz2);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* COULOMB ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq33,rinv33);
+             felec            = _fjsp_mul_v2r8(velec,rinvsq33);
+             fscal            = felec;
+             /* Update vectorial force */
+             fix3             = _fjsp_madd_v2r8(dx33,fscal,fix3);
+             fiy3             = _fjsp_madd_v2r8(dy33,fscal,fiy3);
+             fiz3             = _fjsp_madd_v2r8(dz33,fscal,fiz3);
+             
+             fjx3             = _fjsp_madd_v2r8(dx33,fscal,fjx3);
+             fjy3             = _fjsp_madd_v2r8(dy33,fscal,fjy3);
+             fjz3             = _fjsp_madd_v2r8(dz33,fscal,fjz3);
+             gmx_fjsp_decrement_4rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+             /* Inner loop uses 324 flops */
+         }
+         if(jidx<j_index_end)
+         {
+             jnrA             = jjnr[jidx];
+             j_coord_offsetA  = DIM*jnrA;
+             /* load j atom coordinates */
+             gmx_fjsp_load_4rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
+                                               &jy2,&jz2,&jx3,&jy3,&jz3);
+             /* Calculate displacement vector */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             dx11             = _fjsp_sub_v2r8(ix1,jx1);
+             dy11             = _fjsp_sub_v2r8(iy1,jy1);
+             dz11             = _fjsp_sub_v2r8(iz1,jz1);
+             dx12             = _fjsp_sub_v2r8(ix1,jx2);
+             dy12             = _fjsp_sub_v2r8(iy1,jy2);
+             dz12             = _fjsp_sub_v2r8(iz1,jz2);
+             dx13             = _fjsp_sub_v2r8(ix1,jx3);
+             dy13             = _fjsp_sub_v2r8(iy1,jy3);
+             dz13             = _fjsp_sub_v2r8(iz1,jz3);
+             dx21             = _fjsp_sub_v2r8(ix2,jx1);
+             dy21             = _fjsp_sub_v2r8(iy2,jy1);
+             dz21             = _fjsp_sub_v2r8(iz2,jz1);
+             dx22             = _fjsp_sub_v2r8(ix2,jx2);
+             dy22             = _fjsp_sub_v2r8(iy2,jy2);
+             dz22             = _fjsp_sub_v2r8(iz2,jz2);
+             dx23             = _fjsp_sub_v2r8(ix2,jx3);
+             dy23             = _fjsp_sub_v2r8(iy2,jy3);
+             dz23             = _fjsp_sub_v2r8(iz2,jz3);
+             dx31             = _fjsp_sub_v2r8(ix3,jx1);
+             dy31             = _fjsp_sub_v2r8(iy3,jy1);
+             dz31             = _fjsp_sub_v2r8(iz3,jz1);
+             dx32             = _fjsp_sub_v2r8(ix3,jx2);
+             dy32             = _fjsp_sub_v2r8(iy3,jy2);
+             dz32             = _fjsp_sub_v2r8(iz3,jz2);
+             dx33             = _fjsp_sub_v2r8(ix3,jx3);
+             dy33             = _fjsp_sub_v2r8(iy3,jy3);
+             dz33             = _fjsp_sub_v2r8(iz3,jz3);
+             /* Calculate squared distance and things based on it */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+             rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+             rsq13            = gmx_fjsp_calc_rsq_v2r8(dx13,dy13,dz13);
+             rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+             rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+             rsq23            = gmx_fjsp_calc_rsq_v2r8(dx23,dy23,dz23);
+             rsq31            = gmx_fjsp_calc_rsq_v2r8(dx31,dy31,dz31);
+             rsq32            = gmx_fjsp_calc_rsq_v2r8(dx32,dy32,dz32);
+             rsq33            = gmx_fjsp_calc_rsq_v2r8(dx33,dy33,dz33);
+             rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+             rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+             rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+             rinv13           = gmx_fjsp_invsqrt_v2r8(rsq13);
+             rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+             rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+             rinv23           = gmx_fjsp_invsqrt_v2r8(rsq23);
+             rinv31           = gmx_fjsp_invsqrt_v2r8(rsq31);
+             rinv32           = gmx_fjsp_invsqrt_v2r8(rsq32);
+             rinv33           = gmx_fjsp_invsqrt_v2r8(rsq33);
+             rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+             rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+             rinvsq13         = _fjsp_mul_v2r8(rinv13,rinv13);
+             rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+             rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+             rinvsq23         = _fjsp_mul_v2r8(rinv23,rinv23);
+             rinvsq31         = _fjsp_mul_v2r8(rinv31,rinv31);
+             rinvsq32         = _fjsp_mul_v2r8(rinv32,rinv32);
+             rinvsq33         = _fjsp_mul_v2r8(rinv33,rinv33);
+             fjx0             = _fjsp_setzero_v2r8();
+             fjy0             = _fjsp_setzero_v2r8();
+             fjz0             = _fjsp_setzero_v2r8();
+             fjx1             = _fjsp_setzero_v2r8();
+             fjy1             = _fjsp_setzero_v2r8();
+             fjz1             = _fjsp_setzero_v2r8();
+             fjx2             = _fjsp_setzero_v2r8();
+             fjy2             = _fjsp_setzero_v2r8();
+             fjz2             = _fjsp_setzero_v2r8();
+             fjx3             = _fjsp_setzero_v2r8();
+             fjy3             = _fjsp_setzero_v2r8();
+             fjz3             = _fjsp_setzero_v2r8();
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+             /* Calculate table index by multiplying r with table scale and truncate to integer */
+             rt               = _fjsp_mul_v2r8(r00,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 8;
+             vfconv.i[1]     *= 8;
+             /* CUBIC SPLINE TABLE DISPERSION */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 2 );
+             H                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+             FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+             fvdw6            = _fjsp_mul_v2r8(c6_00,FF);
+             /* CUBIC SPLINE TABLE REPULSION */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 4 );
+             F                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 6 );
+             H                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+             FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+             fvdw12           = _fjsp_mul_v2r8(c12_00,FF);
+             fvdw             = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_add_v2r8(fvdw6,fvdw12),_fjsp_mul_v2r8(vftabscale,rinv00)));
+             fscal            = fvdw;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             
+             fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* COULOMB ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq11,rinv11);
+             felec            = _fjsp_mul_v2r8(velec,rinvsq11);
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+             
+             fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* COULOMB ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq12,rinv12);
+             felec            = _fjsp_mul_v2r8(velec,rinvsq12);
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+             
+             fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* COULOMB ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq13,rinv13);
+             felec            = _fjsp_mul_v2r8(velec,rinvsq13);
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx13,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy13,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz13,fscal,fiz1);
+             
+             fjx3             = _fjsp_madd_v2r8(dx13,fscal,fjx3);
+             fjy3             = _fjsp_madd_v2r8(dy13,fscal,fjy3);
+             fjz3             = _fjsp_madd_v2r8(dz13,fscal,fjz3);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* COULOMB ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq21,rinv21);
+             felec            = _fjsp_mul_v2r8(velec,rinvsq21);
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+             
+             fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* COULOMB ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq22,rinv22);
+             felec            = _fjsp_mul_v2r8(velec,rinvsq22);
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+             
+             fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* COULOMB ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq23,rinv23);
+             felec            = _fjsp_mul_v2r8(velec,rinvsq23);
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx23,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy23,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz23,fscal,fiz2);
+             
+             fjx3             = _fjsp_madd_v2r8(dx23,fscal,fjx3);
+             fjy3             = _fjsp_madd_v2r8(dy23,fscal,fjy3);
+             fjz3             = _fjsp_madd_v2r8(dz23,fscal,fjz3);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* COULOMB ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq31,rinv31);
+             felec            = _fjsp_mul_v2r8(velec,rinvsq31);
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix3             = _fjsp_madd_v2r8(dx31,fscal,fix3);
+             fiy3             = _fjsp_madd_v2r8(dy31,fscal,fiy3);
+             fiz3             = _fjsp_madd_v2r8(dz31,fscal,fiz3);
+             
+             fjx1             = _fjsp_madd_v2r8(dx31,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy31,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz31,fscal,fjz1);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* COULOMB ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq32,rinv32);
+             felec            = _fjsp_mul_v2r8(velec,rinvsq32);
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix3             = _fjsp_madd_v2r8(dx32,fscal,fix3);
+             fiy3             = _fjsp_madd_v2r8(dy32,fscal,fiy3);
+             fiz3             = _fjsp_madd_v2r8(dz32,fscal,fiz3);
+             
+             fjx2             = _fjsp_madd_v2r8(dx32,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy32,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz32,fscal,fjz2);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* COULOMB ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq33,rinv33);
+             felec            = _fjsp_mul_v2r8(velec,rinvsq33);
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix3             = _fjsp_madd_v2r8(dx33,fscal,fix3);
+             fiy3             = _fjsp_madd_v2r8(dy33,fscal,fiy3);
+             fiz3             = _fjsp_madd_v2r8(dz33,fscal,fiz3);
+             
+             fjx3             = _fjsp_madd_v2r8(dx33,fscal,fjx3);
+             fjy3             = _fjsp_madd_v2r8(dy33,fscal,fjy3);
+             fjz3             = _fjsp_madd_v2r8(dz33,fscal,fjz3);
+             gmx_fjsp_decrement_4rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+             /* Inner loop uses 324 flops */
+         }
+         /* End of innermost loop */
+         gmx_fjsp_update_iforce_4atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                               f+i_coord_offset,fshift+i_shift_offset);
+         /* Increment number of inner iterations */
+         inneriter                  += j_index_end - j_index_start;
+         /* Outer loop uses 24 flops */
+     }
+     /* Increment number of outer iterations */
+     outeriter        += nri;
+     /* Update outer/inner flops */
+     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_F,outeriter*24 + inneriter*324);
+ }
index 0000000000000000000000000000000000000000,c5950bceb85fc4ecd7281e7396f0e726029eae04..c5950bceb85fc4ecd7281e7396f0e726029eae04
mode 000000,100644..100644
--- /dev/null
@@@ -1,0 -1,545 +1,545 @@@
+ /*
+  * This file is part of the GROMACS molecular simulation package.
+  *
+  * Copyright (c) 2012, by the GROMACS development team, led by
+  * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+  * others, as listed in the AUTHORS file in the top-level source
+  * directory and at http://www.gromacs.org.
+  *
+  * GROMACS is free software; you can redistribute it and/or
+  * modify it under the terms of the GNU Lesser General Public License
+  * as published by the Free Software Foundation; either version 2.1
+  * of the License, or (at your option) any later version.
+  *
+  * GROMACS is distributed in the hope that it will be useful,
+  * but WITHOUT ANY WARRANTY; without even the implied warranty of
+  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  * Lesser General Public License for more details.
+  *
+  * You should have received a copy of the GNU Lesser General Public
+  * License along with GROMACS; if not, see
+  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+  *
+  * If you want to redistribute modifications to GROMACS, please
+  * consider that scientific software is very special. Version
+  * control is crucial - bugs must be traceable. We will be happy to
+  * consider code for inclusion in the official distribution, but
+  * derived work must not be called official GROMACS. Details are found
+  * in the README & COPYING files - if they are missing, get the
+  * official version at http://www.gromacs.org.
+  *
+  * To help us fund GROMACS development, we humbly ask that you cite
+  * the research papers on the package. Check out http://www.gromacs.org.
+  */
+ /*
+  * Note: this file was generated by the GROMACS sparc64_hpc_ace_double kernel generator.
+  */
+ #ifdef HAVE_CONFIG_H
+ #include <config.h>
+ #endif
+ #include <math.h>
+ #include "../nb_kernel.h"
+ #include "types/simple.h"
+ #include "vec.h"
+ #include "nrnb.h"
+ #include "kernelutil_sparc64_hpc_ace_double.h"
+ /*
+  * Gromacs nonbonded kernel:   nb_kernel_ElecCoul_VdwLJ_GeomP1P1_VF_sparc64_hpc_ace_double
+  * Electrostatics interaction: Coulomb
+  * VdW interaction:            LennardJones
+  * Geometry:                   Particle-Particle
+  * Calculate force/pot:        PotentialAndForce
+  */
+ void
+ nb_kernel_ElecCoul_VdwLJ_GeomP1P1_VF_sparc64_hpc_ace_double
+                     (t_nblist * gmx_restrict                nlist,
+                      rvec * gmx_restrict                    xx,
+                      rvec * gmx_restrict                    ff,
+                      t_forcerec * gmx_restrict              fr,
+                      t_mdatoms * gmx_restrict               mdatoms,
+                      nb_kernel_data_t * gmx_restrict        kernel_data,
+                      t_nrnb * gmx_restrict                  nrnb)
+ {
+     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+      * just 0 for non-waters.
+      * Suffixes A,B refer to j loop unrolling done with double precision SIMD, i.e. the two different
+      * jnr indices corresponding to data put in the two positions in the SIMD register.
+      */
+     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+     int              jnrA,jnrB;
+     int              j_coord_offsetA,j_coord_offsetB;
+     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+     real             rcutoff_scalar;
+     real             *shiftvec,*fshift,*x,*f;
+     _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+     int              vdwioffset0;
+     _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+     int              vdwjidx0A,vdwjidx0B;
+     _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+     _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+     _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+     real             *charge;
+     int              nvdwtype;
+     _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+     int              *vdwtype;
+     real             *vdwparam;
+     _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+     _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+     _fjsp_v2r8       itab_tmp;
+     _fjsp_v2r8       dummy_mask,cutoff_mask;
+     _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+     _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+     union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+     x                = xx[0];
+     f                = ff[0];
+     nri              = nlist->nri;
+     iinr             = nlist->iinr;
+     jindex           = nlist->jindex;
+     jjnr             = nlist->jjnr;
+     shiftidx         = nlist->shift;
+     gid              = nlist->gid;
+     shiftvec         = fr->shift_vec[0];
+     fshift           = fr->fshift[0];
+     facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+     charge           = mdatoms->chargeA;
+     nvdwtype         = fr->ntype;
+     vdwparam         = fr->nbfp;
+     vdwtype          = mdatoms->typeA;
+     /* Zero-initialize the j indices and coordinate offsets to silence compiler warnings about possibly uninitialized variables */
+     jnrA = jnrB = 0;
+     j_coord_offsetA = 0;
+     j_coord_offsetB = 0;
+     outeriter        = 0;
+     inneriter        = 0;
+     /* Start outer loop over neighborlists */
+     for(iidx=0; iidx<nri; iidx++)
+     {
+         /* Load shift vector for this list */
+         i_shift_offset   = DIM*shiftidx[iidx];
+         /* Load limits for loop over neighbors */
+         j_index_start    = jindex[iidx];
+         j_index_end      = jindex[iidx+1];
+         /* Get outer coordinate index */
+         inr              = iinr[iidx];
+         i_coord_offset   = DIM*inr;
+         /* Load i particle coords and add shift vector */
+         gmx_fjsp_load_shift_and_1rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
+         fix0             = _fjsp_setzero_v2r8();
+         fiy0             = _fjsp_setzero_v2r8();
+         fiz0             = _fjsp_setzero_v2r8();
+         /* Load parameters for i particles */
+         iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_load1_v2r8(charge+inr+0));
+         vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+         /* Reset potential sums */
+         velecsum         = _fjsp_setzero_v2r8();
+         vvdwsum          = _fjsp_setzero_v2r8();
+         /* Start inner kernel loop */
+         for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+         {
+             /* Get j neighbor index, and coordinate index */
+             jnrA             = jjnr[jidx];
+             jnrB             = jjnr[jidx+1];
+             j_coord_offsetA  = DIM*jnrA;
+             j_coord_offsetB  = DIM*jnrB;
+             /* load j atom coordinates */
+             gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                               &jx0,&jy0,&jz0);
+             /* Calculate displacement vector */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             /* Calculate squared distance and things based on it */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+             rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+             /* Load parameters for j particles */
+             jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+             vdwjidx0A        = 2*vdwtype[jnrA+0];
+             vdwjidx0B        = 2*vdwtype[jnrB+0];
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* Compute parameters for interactions between i and j atoms */
+             qq00             = _fjsp_mul_v2r8(iq0,jq0);
+             gmx_fjsp_load_2pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,
+                                          vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
+             /* COULOMB ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq00,rinv00);
+             felec            = _fjsp_mul_v2r8(velec,rinvsq00);
+             /* LENNARD-JONES DISPERSION/REPULSION */
+             rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+             vvdw6            = _fjsp_mul_v2r8(c6_00,rinvsix);
+             vvdw12           = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
+             vvdw             = _fjsp_msub_v2r8( vvdw12,one_twelfth, _fjsp_mul_v2r8(vvdw6,one_sixth) );
+             fvdw             = _fjsp_mul_v2r8(_fjsp_sub_v2r8(vvdw12,vvdw6),rinvsq00);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+             fscal            = _fjsp_add_v2r8(felec,fvdw);
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             
+             gmx_fjsp_decrement_fma_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fscal,dx00,dy00,dz00);
+             /* Inner loop uses 43 flops */
+         }
+         if(jidx<j_index_end)
+         {
+             jnrA             = jjnr[jidx];
+             j_coord_offsetA  = DIM*jnrA;
+             /* load j atom coordinates */
+             gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                               &jx0,&jy0,&jz0);
+             /* Calculate displacement vector */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             /* Calculate squared distance and things based on it */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+             rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+             /* Load parameters for j particles */
+             jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
+             vdwjidx0A        = 2*vdwtype[jnrA+0];
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* Compute parameters for interactions between i and j atoms */
+             qq00             = _fjsp_mul_v2r8(iq0,jq0);
+             gmx_fjsp_load_1pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
+             /* COULOMB ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq00,rinv00);
+             felec            = _fjsp_mul_v2r8(velec,rinvsq00);
+             /* LENNARD-JONES DISPERSION/REPULSION */
+             rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+             vvdw6            = _fjsp_mul_v2r8(c6_00,rinvsix);
+             vvdw12           = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
+             vvdw             = _fjsp_msub_v2r8( vvdw12,one_twelfth, _fjsp_mul_v2r8(vvdw6,one_sixth) );
+             fvdw             = _fjsp_mul_v2r8(_fjsp_sub_v2r8(vvdw12,vvdw6),rinvsq00);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             vvdw             = _fjsp_unpacklo_v2r8(vvdw,_fjsp_setzero_v2r8());
+             vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+             fscal            = _fjsp_add_v2r8(felec,fvdw);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             
+             gmx_fjsp_decrement_fma_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fscal,dx00,dy00,dz00);
+             /* Inner loop uses 43 flops */
+         }
+         /* End of innermost loop */
+         gmx_fjsp_update_iforce_1atom_swizzle_v2r8(fix0,fiy0,fiz0,
+                                               f+i_coord_offset,fshift+i_shift_offset);
+         ggid                        = gid[iidx];
+         /* Update potential energies */
+         gmx_fjsp_update_1pot_v2r8(velecsum,kernel_data->energygrp_elec+ggid);
+         gmx_fjsp_update_1pot_v2r8(vvdwsum,kernel_data->energygrp_vdw+ggid);
+         /* Increment number of inner iterations */
+         inneriter                  += j_index_end - j_index_start;
+         /* Outer loop uses 9 flops */
+     }
+     /* Increment number of outer iterations */
+     outeriter        += nri;
+     /* Update outer/inner flops */
+     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_VF,outeriter*9 + inneriter*43);
+ }
+ /*
+  * Gromacs nonbonded kernel:   nb_kernel_ElecCoul_VdwLJ_GeomP1P1_F_sparc64_hpc_ace_double
+  * Electrostatics interaction: Coulomb
+  * VdW interaction:            LennardJones
+  * Geometry:                   Particle-Particle
+  * Calculate force/pot:        Force
+  *
+  * Force-only kernel: for every entry of the neighbor list it accumulates the
+  * Coulomb plus Lennard-Jones force between one i atom and each of its j atoms.
+  * The j atoms are processed two at a time (A and B); an odd trailing j atom is
+  * handled by a separate single-atom epilogue. No potential sums are formed.
+  *
+  * nlist        neighbor list; nri, iinr, jindex, jjnr, shift and gid are read.
+  * xx           coordinates, read through a flat real pointer (index DIM*atom).
+  * ff           forces, updated in place for both the i and the j atoms.
+  * fr           supplies shift_vec, fshift (updated), epsfac, ntype and nbfp.
+  * mdatoms      supplies chargeA and typeA.
+  * kernel_data  not referenced by this force-only kernel.
+  * nrnb         counter handed to inc_nrnb() with the flop estimate.
+  */
+ void
+ nb_kernel_ElecCoul_VdwLJ_GeomP1P1_F_sparc64_hpc_ace_double
+                     (t_nblist * gmx_restrict                nlist,
+                      rvec * gmx_restrict                    xx,
+                      rvec * gmx_restrict                    ff,
+                      t_forcerec * gmx_restrict              fr,
+                      t_mdatoms * gmx_restrict               mdatoms,
+                      nb_kernel_data_t * gmx_restrict        kernel_data,
+                      t_nrnb * gmx_restrict                  nrnb)
+ {
+     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+      * just 0 for non-waters.
+      * Suffixes A,B refer to the j loop unrolling done with double precision SIMD, i.e. the two
+      * different jnr indices (jnrA, jnrB) that are handled together in one inner-loop iteration.
+      *
+      * Many of the locals declared below (e.g. tx/ty/tz, the rcutoff and krf variables, the
+      * masks, one/two and the conversion unions) are never referenced in this kernel;
+      * presumably they are boilerplate shared by all generated kernel variants.
+      */
+     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+     int              jnrA,jnrB;
+     int              j_coord_offsetA,j_coord_offsetB;
+     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+     real             rcutoff_scalar;
+     real             *shiftvec,*fshift,*x,*f;
+     _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+     int              vdwioffset0;
+     _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+     int              vdwjidx0A,vdwjidx0B;
+     _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+     _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+     _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+     real             *charge;
+     int              nvdwtype;
+     _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+     int              *vdwtype;
+     real             *vdwparam;
+     _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+     _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+     _fjsp_v2r8       itab_tmp;
+     _fjsp_v2r8       dummy_mask,cutoff_mask;
+     _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+     _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+     union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+     /* View the rvec arrays as flat real arrays; atoms are addressed as DIM*index below */
+     x                = xx[0];
+     f                = ff[0];
+     /* Unpack the neighbor list */
+     nri              = nlist->nri;
+     iinr             = nlist->iinr;
+     jindex           = nlist->jindex;
+     jjnr             = nlist->jjnr;
+     shiftidx         = nlist->shift;
+     gid              = nlist->gid;
+     shiftvec         = fr->shift_vec[0];
+     fshift           = fr->fshift[0];
+     /* epsfac is folded into the i charge once per i atom (see iq0 below) */
+     facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+     charge           = mdatoms->chargeA;
+     nvdwtype         = fr->ntype;
+     vdwparam         = fr->nbfp;
+     vdwtype          = mdatoms->typeA;
+     /* Avoid stupid compiler warnings */
+     jnrA = jnrB = 0;
+     j_coord_offsetA = 0;
+     j_coord_offsetB = 0;
+     outeriter        = 0;
+     inneriter        = 0;
+     /* Start outer loop over neighborlists */
+     for(iidx=0; iidx<nri; iidx++)
+     {
+         /* Load shift vector for this list */
+         i_shift_offset   = DIM*shiftidx[iidx];
+         /* Load limits for loop over neighbors */
+         j_index_start    = jindex[iidx];
+         j_index_end      = jindex[iidx+1];
+         /* Get outer coordinate index */
+         inr              = iinr[iidx];
+         i_coord_offset   = DIM*inr;
+         /* Load i particle coords and add shift vector */
+         gmx_fjsp_load_shift_and_1rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
+         /* The i force is accumulated here and written back once after the j loops */
+         fix0             = _fjsp_setzero_v2r8();
+         fiy0             = _fjsp_setzero_v2r8();
+         fiz0             = _fjsp_setzero_v2r8();
+         /* Load parameters for i particles */
+         iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_load1_v2r8(charge+inr+0));
+         /* Offset of this i type's row in vdwparam; the factor 2 matches the two values
+          * (c6, c12) fetched per type pair by the load_*pair helpers below.
+          */
+         vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+         /* Start inner kernel loop */
+         /* Only full pairs of j entries are handled here; a leftover odd entry is
+          * processed by the if-block after this loop.
+          */
+         for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+         {
+             /* Get j neighbor index, and coordinate index */
+             jnrA             = jjnr[jidx];
+             jnrB             = jjnr[jidx+1];
+             j_coord_offsetA  = DIM*jnrA;
+             j_coord_offsetB  = DIM*jnrB;
+             /* load j atom coordinates */
+             gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                               &jx0,&jy0,&jz0);
+             /* Calculate displacement vector */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             /* Calculate squared distance and things based on it */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+             rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+             /* Load parameters for j particles */
+             jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+             vdwjidx0A        = 2*vdwtype[jnrA+0];
+             vdwjidx0B        = 2*vdwtype[jnrB+0];
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* Compute parameters for interactions between i and j atoms */
+             qq00             = _fjsp_mul_v2r8(iq0,jq0);
+             gmx_fjsp_load_2pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,
+                                          vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
+             /* COULOMB ELECTROSTATICS */
+             /* velec is only an intermediate here: felec = qq*rinv*rinvsq */
+             velec            = _fjsp_mul_v2r8(qq00,rinv00);
+             felec            = _fjsp_mul_v2r8(velec,rinvsq00);
+             /* LENNARD-JONES DISPERSION/REPULSION */
+             rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+             /* Presumably (c12*rinvsix - c6)*rinvsix*rinvsq, assuming msub(a,b,c) = a*b-c -- TODO confirm */
+             fvdw             = _fjsp_mul_v2r8(_fjsp_msub_v2r8(c12_00,rinvsix,c6_00),_fjsp_mul_v2r8(rinvsix,rinvsq00));
+             fscal            = _fjsp_add_v2r8(felec,fvdw);
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             
+             /* Apply the opposite contribution (fscal times displacement) to both j atoms */
+             gmx_fjsp_decrement_fma_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fscal,dx00,dy00,dz00);
+             /* Inner loop uses 37 flops */
+         }
+         /* Epilogue: one j entry is left over when the number of j entries is odd */
+         if(jidx<j_index_end)
+         {
+             jnrA             = jjnr[jidx];
+             j_coord_offsetA  = DIM*jnrA;
+             /* load j atom coordinates */
+             gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                               &jx0,&jy0,&jz0);
+             /* Calculate displacement vector */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             /* Calculate squared distance and things based on it */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+             rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+             /* Load parameters for j particles */
+             /* Only one charge is loaded; the other element comes from the zero vector */
+             jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
+             vdwjidx0A        = 2*vdwtype[jnrA+0];
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* Compute parameters for interactions between i and j atoms */
+             qq00             = _fjsp_mul_v2r8(iq0,jq0);
+             gmx_fjsp_load_1pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
+             /* COULOMB ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq00,rinv00);
+             felec            = _fjsp_mul_v2r8(velec,rinvsq00);
+             /* LENNARD-JONES DISPERSION/REPULSION */
+             rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+             fvdw             = _fjsp_mul_v2r8(_fjsp_msub_v2r8(c12_00,rinvsix,c6_00),_fjsp_mul_v2r8(rinvsix,rinvsq00));
+             fscal            = _fjsp_add_v2r8(felec,fvdw);
+             /* Presumably clears the element that has no j atom so that it adds nothing
+              * to the i force -- TODO confirm against the intrinsic's definition.
+              */
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             
+             gmx_fjsp_decrement_fma_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fscal,dx00,dy00,dz00);
+             /* Inner loop uses 37 flops */
+         }
+         /* End of innermost loop */
+         /* Write the accumulated i force to f and to the shift-force array */
+         gmx_fjsp_update_iforce_1atom_swizzle_v2r8(fix0,fiy0,fiz0,
+                                               f+i_coord_offset,fshift+i_shift_offset);
+         /* Increment number of inner iterations */
+         /* Counted per j entry, not per unrolled loop trip */
+         inneriter                  += j_index_end - j_index_start;
+         /* Outer loop uses 7 flops */
+     }
+     /* Increment number of outer iterations */
+     outeriter        += nri;
+     /* Update outer/inner flops */
+     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_F,outeriter*7 + inneriter*37);
+ }
index 0000000000000000000000000000000000000000,83f4c9b4898712e6c4adb918793ade8b517d41b3..83f4c9b4898712e6c4adb918793ade8b517d41b3
mode 000000,100644..100644
--- /dev/null
@@@ -1,0 -1,855 +1,855 @@@
+ /*
+  * This file is part of the GROMACS molecular simulation package.
+  *
+  * Copyright (c) 2012, by the GROMACS development team, led by
+  * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+  * others, as listed in the AUTHORS file in the top-level source
+  * directory and at http://www.gromacs.org.
+  *
+  * GROMACS is free software; you can redistribute it and/or
+  * modify it under the terms of the GNU Lesser General Public License
+  * as published by the Free Software Foundation; either version 2.1
+  * of the License, or (at your option) any later version.
+  *
+  * GROMACS is distributed in the hope that it will be useful,
+  * but WITHOUT ANY WARRANTY; without even the implied warranty of
+  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  * Lesser General Public License for more details.
+  *
+  * You should have received a copy of the GNU Lesser General Public
+  * License along with GROMACS; if not, see
+  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+  *
+  * If you want to redistribute modifications to GROMACS, please
+  * consider that scientific software is very special. Version
+  * control is crucial - bugs must be traceable. We will be happy to
+  * consider code for inclusion in the official distribution, but
+  * derived work must not be called official GROMACS. Details are found
+  * in the README & COPYING files - if they are missing, get the
+  * official version at http://www.gromacs.org.
+  *
+  * To help us fund GROMACS development, we humbly ask that you cite
+  * the research papers on the package. Check out http://www.gromacs.org.
+  */
+ /*
+  * Note: this file was generated by the GROMACS sparc64_hpc_ace_double kernel generator.
+  */
+ #ifdef HAVE_CONFIG_H
+ #include <config.h>
+ #endif
+ #include <math.h>
+ #include "../nb_kernel.h"
+ #include "types/simple.h"
+ #include "vec.h"
+ #include "nrnb.h"
+ #include "kernelutil_sparc64_hpc_ace_double.h"
+ /*
+  * Gromacs nonbonded kernel:   nb_kernel_ElecCoul_VdwLJ_GeomW3P1_VF_sparc64_hpc_ace_double
+  * Electrostatics interaction: Coulomb
+  * VdW interaction:            LennardJones
+  * Geometry:                   Water3-Particle
+  * Calculate force/pot:        PotentialAndForce
+  *
+  * Potential-and-force kernel for a three-atom i molecule against single j atoms.
+  * i atom 0 interacts through Coulomb and Lennard-Jones; i atoms 1 and 2 through
+  * Coulomb only. The j atoms are processed two at a time (A and B); an odd
+  * trailing j atom is handled by a separate single-atom epilogue.
+  *
+  * nlist        neighbor list; nri, iinr, jindex, jjnr, shift and gid are read.
+  * xx           coordinates, read through a flat real pointer (index DIM*atom).
+  * ff           forces, updated in place for the i and the j atoms.
+  * fr           supplies shift_vec, fshift (updated), epsfac, ntype and nbfp.
+  * mdatoms      supplies chargeA and typeA.
+  * kernel_data  energygrp_elec and energygrp_vdw receive the potential sums,
+  *              indexed by the energy group id of each list entry.
+  * nrnb         counter handed to inc_nrnb() with the flop estimate.
+  */
+ void
+ nb_kernel_ElecCoul_VdwLJ_GeomW3P1_VF_sparc64_hpc_ace_double
+                     (t_nblist * gmx_restrict                nlist,
+                      rvec * gmx_restrict                    xx,
+                      rvec * gmx_restrict                    ff,
+                      t_forcerec * gmx_restrict              fr,
+                      t_mdatoms * gmx_restrict               mdatoms,
+                      nb_kernel_data_t * gmx_restrict        kernel_data,
+                      t_nrnb * gmx_restrict                  nrnb)
+ {
+     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+      * just 0 for non-waters.
+      * Suffixes A,B refer to the j loop unrolling done with double precision SIMD, i.e. the two
+      * different jnr indices (jnrA, jnrB) that are handled together in one inner-loop iteration.
+      *
+      * Several locals declared below (e.g. vdwioffset1/2, tx/ty/tz, the rcutoff and krf
+      * variables, the masks, one/two and the conversion unions) are never referenced in this
+      * kernel; presumably they are boilerplate shared by all generated kernel variants.
+      */
+     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+     int              jnrA,jnrB;
+     int              j_coord_offsetA,j_coord_offsetB;
+     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+     real             rcutoff_scalar;
+     real             *shiftvec,*fshift,*x,*f;
+     _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+     int              vdwioffset0;
+     _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+     int              vdwioffset1;
+     _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+     int              vdwioffset2;
+     _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+     int              vdwjidx0A,vdwjidx0B;
+     _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+     _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+     _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+     _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+     _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+     real             *charge;
+     int              nvdwtype;
+     _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+     int              *vdwtype;
+     real             *vdwparam;
+     _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+     _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+     _fjsp_v2r8       itab_tmp;
+     _fjsp_v2r8       dummy_mask,cutoff_mask;
+     _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+     _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+     union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+     /* View the rvec arrays as flat real arrays; atoms are addressed as DIM*index below */
+     x                = xx[0];
+     f                = ff[0];
+     /* Unpack the neighbor list */
+     nri              = nlist->nri;
+     iinr             = nlist->iinr;
+     jindex           = nlist->jindex;
+     jjnr             = nlist->jjnr;
+     shiftidx         = nlist->shift;
+     gid              = nlist->gid;
+     shiftvec         = fr->shift_vec[0];
+     fshift           = fr->fshift[0];
+     facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+     charge           = mdatoms->chargeA;
+     nvdwtype         = fr->ntype;
+     vdwparam         = fr->nbfp;
+     vdwtype          = mdatoms->typeA;
+     /* Setup water-specific parameters */
+     /* The charges and the vdW row offset are taken once, from the first list entry, and
+      * reused for every i molecule; this presumably relies on all i molecules in the list
+      * having identical parameters -- TODO confirm.
+      * NOTE(review): iinr[0] is read before the nri loop, i.e. also when nri is 0; confirm
+      * that callers never invoke this kernel with an empty list.
+      */
+     inr              = nlist->iinr[0];
+     iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+0]));
+     iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+     iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+     /* Only i atom 0 gets a vdW offset; atoms 1 and 2 have no Lennard-Jones term below */
+     vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+     /* Avoid stupid compiler warnings */
+     jnrA = jnrB = 0;
+     j_coord_offsetA = 0;
+     j_coord_offsetB = 0;
+     outeriter        = 0;
+     inneriter        = 0;
+     /* Start outer loop over neighborlists */
+     for(iidx=0; iidx<nri; iidx++)
+     {
+         /* Load shift vector for this list */
+         i_shift_offset   = DIM*shiftidx[iidx];
+         /* Load limits for loop over neighbors */
+         j_index_start    = jindex[iidx];
+         j_index_end      = jindex[iidx+1];
+         /* Get outer coordinate index */
+         inr              = iinr[iidx];
+         i_coord_offset   = DIM*inr;
+         /* Load i particle coords and add shift vector */
+         gmx_fjsp_load_shift_and_3rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                  &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+         /* The i forces are accumulated here and written back once after the j loops */
+         fix0             = _fjsp_setzero_v2r8();
+         fiy0             = _fjsp_setzero_v2r8();
+         fiz0             = _fjsp_setzero_v2r8();
+         fix1             = _fjsp_setzero_v2r8();
+         fiy1             = _fjsp_setzero_v2r8();
+         fiz1             = _fjsp_setzero_v2r8();
+         fix2             = _fjsp_setzero_v2r8();
+         fiy2             = _fjsp_setzero_v2r8();
+         fiz2             = _fjsp_setzero_v2r8();
+         /* Reset potential sums */
+         velecsum         = _fjsp_setzero_v2r8();
+         vvdwsum          = _fjsp_setzero_v2r8();
+         /* Start inner kernel loop */
+         /* Only full pairs of j entries are handled here; a leftover odd entry is
+          * processed by the if-block after this loop.
+          */
+         for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+         {
+             /* Get j neighbor index, and coordinate index */
+             jnrA             = jjnr[jidx];
+             jnrB             = jjnr[jidx+1];
+             j_coord_offsetA  = DIM*jnrA;
+             j_coord_offsetB  = DIM*jnrB;
+             /* load j atom coordinates */
+             gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                               &jx0,&jy0,&jz0);
+             /* Calculate displacement vector */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             dx10             = _fjsp_sub_v2r8(ix1,jx0);
+             dy10             = _fjsp_sub_v2r8(iy1,jy0);
+             dz10             = _fjsp_sub_v2r8(iz1,jz0);
+             dx20             = _fjsp_sub_v2r8(ix2,jx0);
+             dy20             = _fjsp_sub_v2r8(iy2,jy0);
+             dz20             = _fjsp_sub_v2r8(iz2,jz0);
+             /* Calculate squared distance and things based on it */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+             rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+             rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+             rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+             rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+             rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+             rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+             rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+             /* Load parameters for j particles */
+             jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+             vdwjidx0A        = 2*vdwtype[jnrA+0];
+             vdwjidx0B        = 2*vdwtype[jnrB+0];
+             /* The j force is summed over all three i atoms and applied to f once, below */
+             fjx0             = _fjsp_setzero_v2r8();
+             fjy0             = _fjsp_setzero_v2r8();
+             fjz0             = _fjsp_setzero_v2r8();
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* i atom 0 - j atom: Coulomb and Lennard-Jones */
+             /* Compute parameters for interactions between i and j atoms */
+             qq00             = _fjsp_mul_v2r8(iq0,jq0);
+             gmx_fjsp_load_2pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,
+                                          vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
+             /* COULOMB ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq00,rinv00);
+             felec            = _fjsp_mul_v2r8(velec,rinvsq00);
+             /* LENNARD-JONES DISPERSION/REPULSION */
+             rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+             vvdw6            = _fjsp_mul_v2r8(c6_00,rinvsix);
+             vvdw12           = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
+             /* Presumably vvdw12/12 - vvdw6/6, assuming msub(a,b,c) = a*b-c -- TODO confirm */
+             vvdw             = _fjsp_msub_v2r8( vvdw12,one_twelfth, _fjsp_mul_v2r8(vvdw6,one_sixth) );
+             fvdw             = _fjsp_mul_v2r8(_fjsp_sub_v2r8(vvdw12,vvdw6),rinvsq00);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+             fscal            = _fjsp_add_v2r8(felec,fvdw);
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             
+             fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* i atom 1 - j atom: Coulomb only */
+             /* Compute parameters for interactions between i and j atoms */
+             qq10             = _fjsp_mul_v2r8(iq1,jq0);
+             /* COULOMB ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq10,rinv10);
+             felec            = _fjsp_mul_v2r8(velec,rinvsq10);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+             
+             fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* i atom 2 - j atom: Coulomb only */
+             /* Compute parameters for interactions between i and j atoms */
+             qq20             = _fjsp_mul_v2r8(iq2,jq0);
+             /* COULOMB ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq20,rinv20);
+             felec            = _fjsp_mul_v2r8(velec,rinvsq20);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+             
+             fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+             /* Apply the summed j force to both j atoms in f */
+             gmx_fjsp_decrement_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0);
+             /* Inner loop uses 108 flops */
+         }
+         /* Epilogue: one j entry is left over when the number of j entries is odd */
+         if(jidx<j_index_end)
+         {
+             jnrA             = jjnr[jidx];
+             j_coord_offsetA  = DIM*jnrA;
+             /* load j atom coordinates */
+             gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                               &jx0,&jy0,&jz0);
+             /* Calculate displacement vector */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             dx10             = _fjsp_sub_v2r8(ix1,jx0);
+             dy10             = _fjsp_sub_v2r8(iy1,jy0);
+             dz10             = _fjsp_sub_v2r8(iz1,jz0);
+             dx20             = _fjsp_sub_v2r8(ix2,jx0);
+             dy20             = _fjsp_sub_v2r8(iy2,jy0);
+             dz20             = _fjsp_sub_v2r8(iz2,jz0);
+             /* Calculate squared distance and things based on it */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+             rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+             rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+             rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+             rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+             rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+             rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+             rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+             /* Load parameters for j particles */
+             /* Only one charge is loaded; the other element comes from the zero vector */
+             jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
+             vdwjidx0A        = 2*vdwtype[jnrA+0];
+             fjx0             = _fjsp_setzero_v2r8();
+             fjy0             = _fjsp_setzero_v2r8();
+             fjz0             = _fjsp_setzero_v2r8();
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* i atom 0 - j atom: Coulomb and Lennard-Jones */
+             /* Compute parameters for interactions between i and j atoms */
+             qq00             = _fjsp_mul_v2r8(iq0,jq0);
+             gmx_fjsp_load_1pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
+             /* COULOMB ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq00,rinv00);
+             felec            = _fjsp_mul_v2r8(velec,rinvsq00);
+             /* LENNARD-JONES DISPERSION/REPULSION */
+             rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+             vvdw6            = _fjsp_mul_v2r8(c6_00,rinvsix);
+             vvdw12           = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
+             vvdw             = _fjsp_msub_v2r8( vvdw12,one_twelfth, _fjsp_mul_v2r8(vvdw6,one_sixth) );
+             fvdw             = _fjsp_mul_v2r8(_fjsp_sub_v2r8(vvdw12,vvdw6),rinvsq00);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             /* The unpacklo-with-zero calls in this epilogue presumably clear the element that
+              * has no j atom, so that it adds nothing to the sums -- TODO confirm against the
+              * intrinsic's definition.
+              */
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             vvdw             = _fjsp_unpacklo_v2r8(vvdw,_fjsp_setzero_v2r8());
+             vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+             fscal            = _fjsp_add_v2r8(felec,fvdw);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             
+             fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* i atom 1 - j atom: Coulomb only */
+             /* Compute parameters for interactions between i and j atoms */
+             qq10             = _fjsp_mul_v2r8(iq1,jq0);
+             /* COULOMB ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq10,rinv10);
+             felec            = _fjsp_mul_v2r8(velec,rinvsq10);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+             
+             fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* i atom 2 - j atom: Coulomb only */
+             /* Compute parameters for interactions between i and j atoms */
+             qq20             = _fjsp_mul_v2r8(iq2,jq0);
+             /* COULOMB ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq20,rinv20);
+             felec            = _fjsp_mul_v2r8(velec,rinvsq20);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+             
+             fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+             /* Apply the summed j force to the single j atom in f */
+             gmx_fjsp_decrement_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0);
+             /* Inner loop uses 108 flops */
+         }
+         /* End of innermost loop */
+         /* Write the accumulated forces of the three i atoms to f and to the shift-force array */
+         gmx_fjsp_update_iforce_3atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                               f+i_coord_offset,fshift+i_shift_offset);
+         /* Energy group index of this list entry selects the potential accumulators */
+         ggid                        = gid[iidx];
+         /* Update potential energies */
+         gmx_fjsp_update_1pot_v2r8(velecsum,kernel_data->energygrp_elec+ggid);
+         gmx_fjsp_update_1pot_v2r8(vvdwsum,kernel_data->energygrp_vdw+ggid);
+         /* Increment number of inner iterations */
+         /* Counted per j entry, not per unrolled loop trip */
+         inneriter                  += j_index_end - j_index_start;
+         /* Outer loop uses 20 flops */
+     }
+     /* Increment number of outer iterations */
+     outeriter        += nri;
+     /* Update outer/inner flops */
+     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3_VF,outeriter*20 + inneriter*108);
+ }
+ /*
+  * Gromacs nonbonded kernel:   nb_kernel_ElecCoul_VdwLJ_GeomW3P1_F_sparc64_hpc_ace_double
+  * Electrostatics interaction: Coulomb (velec = qq*rinv; no cutoff, shift or table appears in this kernel)
+  * VdW interaction:            LennardJones (evaluated for i atom 0 only; i atoms 1 and 2 get Coulomb only)
+  * Geometry:                   Water3-Particle (three i atoms per list entry against single j atoms)
+  * Calculate force/pot:        Force (the force helpers are handed ff and fr->fshift; no energies are accumulated)
+  */
+ void
+ nb_kernel_ElecCoul_VdwLJ_GeomW3P1_F_sparc64_hpc_ace_double
+                     (t_nblist * gmx_restrict                nlist,
+                      rvec * gmx_restrict                    xx,
+                      rvec * gmx_restrict                    ff,
+                      t_forcerec * gmx_restrict              fr,
+                      t_mdatoms * gmx_restrict               mdatoms,
+                      nb_kernel_data_t * gmx_restrict        kernel_data,
+                      t_nrnb * gmx_restrict                  nrnb)
+ {
+     /* Suffixes 0,1,2 name the three i atoms handled per list entry; the j side only uses suffix 0 here,
+      * and two-digit suffixes (00,10,20) name an i-j atom pair.
+      * Suffixes A,B refer to the two jnr indices (jnrA,jnrB) that one pass of the unrolled inner loop
+      * processes together in _fjsp_v2r8 values. NOTE(review): many locals declared below (e.g. rcutoff_scalar, velecsum, vfconv) are never referenced in this function.
+      */
+     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+     int              jnrA,jnrB;
+     int              j_coord_offsetA,j_coord_offsetB;
+     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+     real             rcutoff_scalar;
+     real             *shiftvec,*fshift,*x,*f;
+     _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+     int              vdwioffset0;
+     _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+     int              vdwioffset1;
+     _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+     int              vdwioffset2;
+     _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+     int              vdwjidx0A,vdwjidx0B;
+     _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+     _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+     _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+     _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+     _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+     real             *charge;
+     int              nvdwtype;
+     _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+     int              *vdwtype;
+     real             *vdwparam;
+     _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+     _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+     _fjsp_v2r8       itab_tmp;
+     _fjsp_v2r8       dummy_mask,cutoff_mask;
+     _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+     _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+     union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+     x                = xx[0]; /* coordinates, addressed below as x + DIM*atom_index */
+     f                = ff[0]; /* forces, addressed below as f + DIM*atom_index */
+     nri              = nlist->nri;
+     iinr             = nlist->iinr;
+     jindex           = nlist->jindex;
+     jjnr             = nlist->jjnr;
+     shiftidx         = nlist->shift;
+     gid              = nlist->gid;
+     shiftvec         = fr->shift_vec[0];
+     fshift           = fr->fshift[0];
+     facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+     charge           = mdatoms->chargeA;
+     nvdwtype         = fr->ntype;
+     vdwparam         = fr->nbfp;
+     vdwtype          = mdatoms->typeA;
+     /* Setup water-specific parameters: charges (pre-multiplied by facel) and the vdW row offset are read once from the first i entry and reused for every list entry. NOTE(review): iinr[0] is read even when nri is 0 - confirm callers never pass an empty list */
+     inr              = nlist->iinr[0];
+     iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+0]));
+     iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+     iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+     vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+     /* Avoid stupid compiler warnings: these are otherwise first assigned inside the j loops below */
+     jnrA = jnrB = 0;
+     j_coord_offsetA = 0;
+     j_coord_offsetB = 0;
+     outeriter        = 0;
+     inneriter        = 0;
+     /* Start outer loop over neighborlists (one pass per i entry, nri in total) */
+     for(iidx=0; iidx<nri; iidx++)
+     {
+         /* Offset of the shift vector for this list (DIM reals per shift index); also used for fshift below */
+         i_shift_offset   = DIM*shiftidx[iidx];
+         /* Load limits for loop over neighbors: this list's j entries are jjnr[j_index_start .. j_index_end-1] */
+         j_index_start    = jindex[iidx];
+         j_index_end      = jindex[iidx+1];
+         /* Get outer coordinate index (first of the three consecutive i atoms) */
+         inr              = iinr[iidx];
+         i_coord_offset   = DIM*inr;
+         /* Load i particle coords and add shift vector, for all three i atoms */
+         gmx_fjsp_load_shift_and_3rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                  &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+         fix0             = _fjsp_setzero_v2r8(); /* i force accumulators restart at zero for each list entry */
+         fiy0             = _fjsp_setzero_v2r8();
+         fiz0             = _fjsp_setzero_v2r8();
+         fix1             = _fjsp_setzero_v2r8();
+         fiy1             = _fjsp_setzero_v2r8();
+         fiz1             = _fjsp_setzero_v2r8();
+         fix2             = _fjsp_setzero_v2r8();
+         fiy2             = _fjsp_setzero_v2r8();
+         fiz2             = _fjsp_setzero_v2r8();
+         /* Start inner kernel loop: two j entries (A and B) per pass; an odd leftover entry is handled after the loop */
+         for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+         {
+             /* Get j neighbor index, and coordinate index, for both entries of the pair */
+             jnrA             = jjnr[jidx];
+             jnrB             = jjnr[jidx+1];
+             j_coord_offsetA  = DIM*jnrA;
+             j_coord_offsetB  = DIM*jnrB;
+             /* Load j atom coordinates for entries A and B */
+             gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                               &jx0,&jy0,&jz0);
+             /* Calculate displacement vector (i minus j) for each of the three i atoms */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             dx10             = _fjsp_sub_v2r8(ix1,jx0);
+             dy10             = _fjsp_sub_v2r8(iy1,jy0);
+             dz10             = _fjsp_sub_v2r8(iz1,jz0);
+             dx20             = _fjsp_sub_v2r8(ix2,jx0);
+             dy20             = _fjsp_sub_v2r8(iy2,jy0);
+             dz20             = _fjsp_sub_v2r8(iz2,jz0);
+             /* Calculate squared distance and things based on it: rinv via gmx_fjsp_invsqrt_v2r8, rinvsq = rinv*rinv */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+             rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+             rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+             rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+             rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+             rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+             rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+             rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+             /* Load parameters for j particles: charges of A and B, and 2*vdwtype as the column offset into vdwparam */
+             jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+             vdwjidx0A        = 2*vdwtype[jnrA+0];
+             vdwjidx0B        = 2*vdwtype[jnrB+0];
+             fjx0             = _fjsp_setzero_v2r8(); /* j force accumulator, summed over all three i atoms below */
+             fjy0             = _fjsp_setzero_v2r8();
+             fjz0             = _fjsp_setzero_v2r8();
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* Compute parameters for interactions between i atom 0 and the j atoms: charge product and c6/c12 pair */
+             qq00             = _fjsp_mul_v2r8(iq0,jq0);
+             gmx_fjsp_load_2pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,
+                                          vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
+             /* COULOMB ELECTROSTATICS: velec = qq*rinv, felec = velec*rinvsq */
+             velec            = _fjsp_mul_v2r8(qq00,rinv00);
+             felec            = _fjsp_mul_v2r8(velec,rinvsq00);
+             /* LENNARD-JONES DISPERSION/REPULSION: presumably fvdw = (c12*rinvsix - c6)*rinvsix*rinvsq, assuming _fjsp_msub_v2r8(a,b,c) is a*b-c - TODO confirm */
+             rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+             fvdw             = _fjsp_mul_v2r8(_fjsp_msub_v2r8(c12_00,rinvsix,c6_00),_fjsp_mul_v2r8(rinvsix,rinvsq00));
+             fscal            = _fjsp_add_v2r8(felec,fvdw);
+             /* Update vectorial force: the same d*fscal term goes into both the i and the j accumulators (presumably madd(a,b,c) is a*b+c - confirm) */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             
+             fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* Compute parameters for interactions between i atom 1 and the j atoms (charge product only, no LJ) */
+             qq10             = _fjsp_mul_v2r8(iq1,jq0);
+             /* COULOMB ELECTROSTATICS: velec = qq*rinv, felec = velec*rinvsq */
+             velec            = _fjsp_mul_v2r8(qq10,rinv10);
+             felec            = _fjsp_mul_v2r8(velec,rinvsq10);
+             fscal            = felec;
+             /* Update vectorial force for i atom 1 and the shared j accumulator */
+             fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+             
+             fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* Compute parameters for interactions between i atom 2 and the j atoms (charge product only, no LJ) */
+             qq20             = _fjsp_mul_v2r8(iq2,jq0);
+             /* COULOMB ELECTROSTATICS: velec = qq*rinv, felec = velec*rinvsq */
+             velec            = _fjsp_mul_v2r8(qq20,rinv20);
+             felec            = _fjsp_mul_v2r8(velec,rinvsq20);
+             fscal            = felec;
+             /* Update vectorial force for i atom 2 and the shared j accumulator */
+             fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+             
+             fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+             gmx_fjsp_decrement_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0);
+             /* Inner loop uses 100 flops */
+         }
+         if(jidx<j_index_end) /* true only when one j entry is left over after the pairwise loop */
+         {
+             jnrA             = jjnr[jidx];
+             j_coord_offsetA  = DIM*jnrA;
+             /* Load j atom coordinates for the single remaining entry (A only) */
+             gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                               &jx0,&jy0,&jz0);
+             /* Calculate displacement vector (i minus j) for each of the three i atoms */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             dx10             = _fjsp_sub_v2r8(ix1,jx0);
+             dy10             = _fjsp_sub_v2r8(iy1,jy0);
+             dz10             = _fjsp_sub_v2r8(iz1,jz0);
+             dx20             = _fjsp_sub_v2r8(ix2,jx0);
+             dy20             = _fjsp_sub_v2r8(iy2,jy0);
+             dz20             = _fjsp_sub_v2r8(iz2,jz0);
+             /* Calculate squared distance and things based on it: rinv via gmx_fjsp_invsqrt_v2r8, rinvsq = rinv*rinv */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+             rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+             rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+             rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+             rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+             rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+             rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+             rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+             /* Load parameters for j particles: only entry A exists; the charge load is combined with a zero vector */
+             jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
+             vdwjidx0A        = 2*vdwtype[jnrA+0];
+             fjx0             = _fjsp_setzero_v2r8(); /* j force accumulator, summed over all three i atoms below */
+             fjy0             = _fjsp_setzero_v2r8();
+             fjz0             = _fjsp_setzero_v2r8();
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* Compute parameters for interactions between i atom 0 and the j atom: charge product and c6/c12 pair */
+             qq00             = _fjsp_mul_v2r8(iq0,jq0);
+             gmx_fjsp_load_1pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
+             /* COULOMB ELECTROSTATICS: velec = qq*rinv, felec = velec*rinvsq */
+             velec            = _fjsp_mul_v2r8(qq00,rinv00);
+             felec            = _fjsp_mul_v2r8(velec,rinvsq00);
+             /* LENNARD-JONES DISPERSION/REPULSION: presumably fvdw = (c12*rinvsix - c6)*rinvsix*rinvsq, assuming _fjsp_msub_v2r8(a,b,c) is a*b-c - TODO confirm */
+             rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+             fvdw             = _fjsp_mul_v2r8(_fjsp_msub_v2r8(c12_00,rinvsix,c6_00),_fjsp_mul_v2r8(rinvsix,rinvsq00));
+             fscal            = _fjsp_add_v2r8(felec,fvdw);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force; fscal was just combined with a zero vector, presumably to blank the lane that has no j entry - confirm against _fjsp_unpacklo_v2r8 */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             
+             fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* Compute parameters for interactions between i atom 1 and the j atom (charge product only, no LJ) */
+             qq10             = _fjsp_mul_v2r8(iq1,jq0);
+             /* COULOMB ELECTROSTATICS: velec = qq*rinv, felec = velec*rinvsq */
+             velec            = _fjsp_mul_v2r8(qq10,rinv10);
+             felec            = _fjsp_mul_v2r8(velec,rinvsq10);
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force for i atom 1 and the shared j accumulator (fscal combined with zero as above) */
+             fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+             
+             fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* Compute parameters for interactions between i atom 2 and the j atom (charge product only, no LJ) */
+             qq20             = _fjsp_mul_v2r8(iq2,jq0);
+             /* COULOMB ELECTROSTATICS: velec = qq*rinv, felec = velec*rinvsq */
+             velec            = _fjsp_mul_v2r8(qq20,rinv20);
+             felec            = _fjsp_mul_v2r8(velec,rinvsq20);
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force for i atom 2 and the shared j accumulator (fscal combined with zero as above) */
+             fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+             
+             fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+             gmx_fjsp_decrement_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0);
+             /* Inner loop uses 100 flops */
+         }
+         /* End of innermost loop: hand the nine i force accumulators to the i-force helper together with the f and fshift slots of this list */
+         gmx_fjsp_update_iforce_3atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                               f+i_coord_offset,fshift+i_shift_offset);
+         /* Increment number of inner iterations (counted per j entry, not per unrolled pass) */
+         inneriter                  += j_index_end - j_index_start;
+         /* Outer loop uses 18 flops */
+     }
+     /* Increment number of outer iterations */
+     outeriter        += nri;
+     /* Update outer/inner flops: 18 per list entry plus 100 per j entry, matching the counts noted in the loops above */
+     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3_F,outeriter*18 + inneriter*100);
+ }
index 0000000000000000000000000000000000000000,56f53a17f914e1af4ba6044e67e6b406f5da6b1f..56f53a17f914e1af4ba6044e67e6b406f5da6b1f
mode 000000,100644..100644
--- /dev/null
@@@ -1,0 -1,1537 +1,1537 @@@
+ /*
+  * This file is part of the GROMACS molecular simulation package.
+  *
+  * Copyright (c) 2012, by the GROMACS development team, led by
+  * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+  * others, as listed in the AUTHORS file in the top-level source
+  * directory and at http://www.gromacs.org.
+  *
+  * GROMACS is free software; you can redistribute it and/or
+  * modify it under the terms of the GNU Lesser General Public License
+  * as published by the Free Software Foundation; either version 2.1
+  * of the License, or (at your option) any later version.
+  *
+  * GROMACS is distributed in the hope that it will be useful,
+  * but WITHOUT ANY WARRANTY; without even the implied warranty of
+  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  * Lesser General Public License for more details.
+  *
+  * You should have received a copy of the GNU Lesser General Public
+  * License along with GROMACS; if not, see
+  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+  *
+  * If you want to redistribute modifications to GROMACS, please
+  * consider that scientific software is very special. Version
+  * control is crucial - bugs must be traceable. We will be happy to
+  * consider code for inclusion in the official distribution, but
+  * derived work must not be called official GROMACS. Details are found
+  * in the README & COPYING files - if they are missing, get the
+  * official version at http://www.gromacs.org.
+  *
+  * To help us fund GROMACS development, we humbly ask that you cite
+  * the research papers on the package. Check out http://www.gromacs.org.
+  */
+ /*
+  * Note: this file was generated by the GROMACS sparc64_hpc_ace_double kernel generator.
+  */
+ #ifdef HAVE_CONFIG_H
+ #include <config.h>
+ #endif
+ #include <math.h>
+ #include "../nb_kernel.h"
+ #include "types/simple.h"
+ #include "vec.h"
+ #include "nrnb.h"
+ #include "kernelutil_sparc64_hpc_ace_double.h"
+ /*
+  * Gromacs nonbonded kernel:   nb_kernel_ElecCoul_VdwLJ_GeomW3W3_VF_sparc64_hpc_ace_double
+  * Electrostatics interaction: Coulomb
+  * VdW interaction:            LennardJones
+  * Geometry:                   Water3-Water3
+  * Calculate force/pot:        PotentialAndForce
+  */
+ void
+ nb_kernel_ElecCoul_VdwLJ_GeomW3W3_VF_sparc64_hpc_ace_double
+                     (t_nblist * gmx_restrict                nlist,
+                      rvec * gmx_restrict                    xx,
+                      rvec * gmx_restrict                    ff,
+                      t_forcerec * gmx_restrict              fr,
+                      t_mdatoms * gmx_restrict               mdatoms,
+                      nb_kernel_data_t * gmx_restrict        kernel_data,
+                      t_nrnb * gmx_restrict                  nrnb)
+ {
+     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+      * just 0 for non-waters.
+      * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+      * jnr indices corresponding to data put in the four positions in the SIMD register.
+      */
+     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+     int              jnrA,jnrB;
+     int              j_coord_offsetA,j_coord_offsetB;
+     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+     real             rcutoff_scalar;
+     real             *shiftvec,*fshift,*x,*f;
+     _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+     int              vdwioffset0;
+     _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+     int              vdwioffset1;
+     _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+     int              vdwioffset2;
+     _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+     int              vdwjidx0A,vdwjidx0B;
+     _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+     int              vdwjidx1A,vdwjidx1B;
+     _fjsp_v2r8       jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+     int              vdwjidx2A,vdwjidx2B;
+     _fjsp_v2r8       jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+     _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+     _fjsp_v2r8       dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
+     _fjsp_v2r8       dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
+     _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+     _fjsp_v2r8       dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+     _fjsp_v2r8       dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+     _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+     _fjsp_v2r8       dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+     _fjsp_v2r8       dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+     _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+     real             *charge;
+     int              nvdwtype;
+     _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+     int              *vdwtype;
+     real             *vdwparam;
+     _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+     _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+     _fjsp_v2r8       itab_tmp;
+     _fjsp_v2r8       dummy_mask,cutoff_mask;
+     _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+     _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+     union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+     x                = xx[0];
+     f                = ff[0];
+     nri              = nlist->nri;
+     iinr             = nlist->iinr;
+     jindex           = nlist->jindex;
+     jjnr             = nlist->jjnr;
+     shiftidx         = nlist->shift;
+     gid              = nlist->gid;
+     shiftvec         = fr->shift_vec[0];
+     fshift           = fr->fshift[0];
+     facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+     charge           = mdatoms->chargeA;
+     nvdwtype         = fr->ntype;
+     vdwparam         = fr->nbfp;
+     vdwtype          = mdatoms->typeA;
+     /* Setup water-specific parameters */
+     inr              = nlist->iinr[0];
+     iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+0]));
+     iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+     iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+     vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+     jq0              = gmx_fjsp_set1_v2r8(charge[inr+0]);
+     jq1              = gmx_fjsp_set1_v2r8(charge[inr+1]);
+     jq2              = gmx_fjsp_set1_v2r8(charge[inr+2]);
+     vdwjidx0A        = 2*vdwtype[inr+0];
+     qq00             = _fjsp_mul_v2r8(iq0,jq0);
+     c6_00            = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset0+vdwjidx0A]);
+     c12_00           = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset0+vdwjidx0A+1]);
+     qq01             = _fjsp_mul_v2r8(iq0,jq1);
+     qq02             = _fjsp_mul_v2r8(iq0,jq2);
+     qq10             = _fjsp_mul_v2r8(iq1,jq0);
+     qq11             = _fjsp_mul_v2r8(iq1,jq1);
+     qq12             = _fjsp_mul_v2r8(iq1,jq2);
+     qq20             = _fjsp_mul_v2r8(iq2,jq0);
+     qq21             = _fjsp_mul_v2r8(iq2,jq1);
+     qq22             = _fjsp_mul_v2r8(iq2,jq2);
+     /* Avoid stupid compiler warnings */
+     jnrA = jnrB = 0;
+     j_coord_offsetA = 0;
+     j_coord_offsetB = 0;
+     outeriter        = 0;
+     inneriter        = 0;
+     /* Start outer loop over neighborlists */
+     for(iidx=0; iidx<nri; iidx++)
+     {
+         /* Load shift vector for this list */
+         i_shift_offset   = DIM*shiftidx[iidx];
+         /* Load limits for loop over neighbors */
+         j_index_start    = jindex[iidx];
+         j_index_end      = jindex[iidx+1];
+         /* Get outer coordinate index */
+         inr              = iinr[iidx];
+         i_coord_offset   = DIM*inr;
+         /* Load i particle coords and add shift vector */
+         gmx_fjsp_load_shift_and_3rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                  &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+         fix0             = _fjsp_setzero_v2r8();
+         fiy0             = _fjsp_setzero_v2r8();
+         fiz0             = _fjsp_setzero_v2r8();
+         fix1             = _fjsp_setzero_v2r8();
+         fiy1             = _fjsp_setzero_v2r8();
+         fiz1             = _fjsp_setzero_v2r8();
+         fix2             = _fjsp_setzero_v2r8();
+         fiy2             = _fjsp_setzero_v2r8();
+         fiz2             = _fjsp_setzero_v2r8();
+         /* Reset potential sums */
+         velecsum         = _fjsp_setzero_v2r8();
+         vvdwsum          = _fjsp_setzero_v2r8();
+         /* Start inner kernel loop */
+         for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+         {
+             /* Get j neighbor index, and coordinate index */
+             jnrA             = jjnr[jidx];
+             jnrB             = jjnr[jidx+1];
+             j_coord_offsetA  = DIM*jnrA;
+             j_coord_offsetB  = DIM*jnrB;
+             /* load j atom coordinates */
+             gmx_fjsp_load_3rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+             /* Calculate displacement vector */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             dx01             = _fjsp_sub_v2r8(ix0,jx1);
+             dy01             = _fjsp_sub_v2r8(iy0,jy1);
+             dz01             = _fjsp_sub_v2r8(iz0,jz1);
+             dx02             = _fjsp_sub_v2r8(ix0,jx2);
+             dy02             = _fjsp_sub_v2r8(iy0,jy2);
+             dz02             = _fjsp_sub_v2r8(iz0,jz2);
+             dx10             = _fjsp_sub_v2r8(ix1,jx0);
+             dy10             = _fjsp_sub_v2r8(iy1,jy0);
+             dz10             = _fjsp_sub_v2r8(iz1,jz0);
+             dx11             = _fjsp_sub_v2r8(ix1,jx1);
+             dy11             = _fjsp_sub_v2r8(iy1,jy1);
+             dz11             = _fjsp_sub_v2r8(iz1,jz1);
+             dx12             = _fjsp_sub_v2r8(ix1,jx2);
+             dy12             = _fjsp_sub_v2r8(iy1,jy2);
+             dz12             = _fjsp_sub_v2r8(iz1,jz2);
+             dx20             = _fjsp_sub_v2r8(ix2,jx0);
+             dy20             = _fjsp_sub_v2r8(iy2,jy0);
+             dz20             = _fjsp_sub_v2r8(iz2,jz0);
+             dx21             = _fjsp_sub_v2r8(ix2,jx1);
+             dy21             = _fjsp_sub_v2r8(iy2,jy1);
+             dz21             = _fjsp_sub_v2r8(iz2,jz1);
+             dx22             = _fjsp_sub_v2r8(ix2,jx2);
+             dy22             = _fjsp_sub_v2r8(iy2,jy2);
+             dz22             = _fjsp_sub_v2r8(iz2,jz2);
+             /* Calculate squared distance and things based on it */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rsq01            = gmx_fjsp_calc_rsq_v2r8(dx01,dy01,dz01);
+             rsq02            = gmx_fjsp_calc_rsq_v2r8(dx02,dy02,dz02);
+             rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+             rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+             rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+             rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+             rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+             rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+             rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+             rinv01           = gmx_fjsp_invsqrt_v2r8(rsq01);
+             rinv02           = gmx_fjsp_invsqrt_v2r8(rsq02);
+             rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+             rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+             rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+             rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+             rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+             rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+             rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+             rinvsq01         = _fjsp_mul_v2r8(rinv01,rinv01);
+             rinvsq02         = _fjsp_mul_v2r8(rinv02,rinv02);
+             rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+             rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+             rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+             rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+             rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+             rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+             fjx0             = _fjsp_setzero_v2r8();
+             fjy0             = _fjsp_setzero_v2r8();
+             fjz0             = _fjsp_setzero_v2r8();
+             fjx1             = _fjsp_setzero_v2r8();
+             fjy1             = _fjsp_setzero_v2r8();
+             fjz1             = _fjsp_setzero_v2r8();
+             fjx2             = _fjsp_setzero_v2r8();
+             fjy2             = _fjsp_setzero_v2r8();
+             fjz2             = _fjsp_setzero_v2r8();
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* COULOMB ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq00,rinv00);
+             felec            = _fjsp_mul_v2r8(velec,rinvsq00);
+             /* LENNARD-JONES DISPERSION/REPULSION */
+             rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+             vvdw6            = _fjsp_mul_v2r8(c6_00,rinvsix);
+             vvdw12           = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
+             vvdw             = _fjsp_msub_v2r8( vvdw12,one_twelfth, _fjsp_mul_v2r8(vvdw6,one_sixth) );
+             fvdw             = _fjsp_mul_v2r8(_fjsp_sub_v2r8(vvdw12,vvdw6),rinvsq00);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+             fscal            = _fjsp_add_v2r8(felec,fvdw);
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             
+             fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* COULOMB ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq01,rinv01);
+             felec            = _fjsp_mul_v2r8(velec,rinvsq01);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx01,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy01,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz01,fscal,fiz0);
+             
+             fjx1             = _fjsp_madd_v2r8(dx01,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy01,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz01,fscal,fjz1);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* COULOMB ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq02,rinv02);
+             felec            = _fjsp_mul_v2r8(velec,rinvsq02);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx02,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy02,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz02,fscal,fiz0);
+             
+             fjx2             = _fjsp_madd_v2r8(dx02,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy02,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz02,fscal,fjz2);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* COULOMB ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq10,rinv10);
+             felec            = _fjsp_mul_v2r8(velec,rinvsq10);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+             
+             fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* COULOMB ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq11,rinv11);
+             felec            = _fjsp_mul_v2r8(velec,rinvsq11);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+             
+             fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* COULOMB ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq12,rinv12);
+             felec            = _fjsp_mul_v2r8(velec,rinvsq12);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+             
+             fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* COULOMB ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq20,rinv20);
+             felec            = _fjsp_mul_v2r8(velec,rinvsq20);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+             
+             fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* COULOMB ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq21,rinv21);
+             felec            = _fjsp_mul_v2r8(velec,rinvsq21);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+             
+             fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* COULOMB ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq22,rinv22);
+             felec            = _fjsp_mul_v2r8(velec,rinvsq22);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+             
+             fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+             gmx_fjsp_decrement_3rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+             /* Inner loop uses 291 flops */
+         }
+         if(jidx<j_index_end)
+         {
+             jnrA             = jjnr[jidx];
+             j_coord_offsetA  = DIM*jnrA;
+             /* load j atom coordinates */
+             gmx_fjsp_load_3rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+             /* Calculate displacement vector */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             dx01             = _fjsp_sub_v2r8(ix0,jx1);
+             dy01             = _fjsp_sub_v2r8(iy0,jy1);
+             dz01             = _fjsp_sub_v2r8(iz0,jz1);
+             dx02             = _fjsp_sub_v2r8(ix0,jx2);
+             dy02             = _fjsp_sub_v2r8(iy0,jy2);
+             dz02             = _fjsp_sub_v2r8(iz0,jz2);
+             dx10             = _fjsp_sub_v2r8(ix1,jx0);
+             dy10             = _fjsp_sub_v2r8(iy1,jy0);
+             dz10             = _fjsp_sub_v2r8(iz1,jz0);
+             dx11             = _fjsp_sub_v2r8(ix1,jx1);
+             dy11             = _fjsp_sub_v2r8(iy1,jy1);
+             dz11             = _fjsp_sub_v2r8(iz1,jz1);
+             dx12             = _fjsp_sub_v2r8(ix1,jx2);
+             dy12             = _fjsp_sub_v2r8(iy1,jy2);
+             dz12             = _fjsp_sub_v2r8(iz1,jz2);
+             dx20             = _fjsp_sub_v2r8(ix2,jx0);
+             dy20             = _fjsp_sub_v2r8(iy2,jy0);
+             dz20             = _fjsp_sub_v2r8(iz2,jz0);
+             dx21             = _fjsp_sub_v2r8(ix2,jx1);
+             dy21             = _fjsp_sub_v2r8(iy2,jy1);
+             dz21             = _fjsp_sub_v2r8(iz2,jz1);
+             dx22             = _fjsp_sub_v2r8(ix2,jx2);
+             dy22             = _fjsp_sub_v2r8(iy2,jy2);
+             dz22             = _fjsp_sub_v2r8(iz2,jz2);
+             /* Calculate squared distance and things based on it */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rsq01            = gmx_fjsp_calc_rsq_v2r8(dx01,dy01,dz01);
+             rsq02            = gmx_fjsp_calc_rsq_v2r8(dx02,dy02,dz02);
+             rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+             rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+             rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+             rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+             rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+             rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+             rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+             rinv01           = gmx_fjsp_invsqrt_v2r8(rsq01);
+             rinv02           = gmx_fjsp_invsqrt_v2r8(rsq02);
+             rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+             rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+             rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+             rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+             rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+             rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+             rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+             rinvsq01         = _fjsp_mul_v2r8(rinv01,rinv01);
+             rinvsq02         = _fjsp_mul_v2r8(rinv02,rinv02);
+             rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+             rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+             rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+             rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+             rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+             rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+             fjx0             = _fjsp_setzero_v2r8();
+             fjy0             = _fjsp_setzero_v2r8();
+             fjz0             = _fjsp_setzero_v2r8();
+             fjx1             = _fjsp_setzero_v2r8();
+             fjy1             = _fjsp_setzero_v2r8();
+             fjz1             = _fjsp_setzero_v2r8();
+             fjx2             = _fjsp_setzero_v2r8();
+             fjy2             = _fjsp_setzero_v2r8();
+             fjz2             = _fjsp_setzero_v2r8();
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* COULOMB ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq00,rinv00);
+             felec            = _fjsp_mul_v2r8(velec,rinvsq00);
+             /* LENNARD-JONES DISPERSION/REPULSION */
+             rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+             vvdw6            = _fjsp_mul_v2r8(c6_00,rinvsix);
+             vvdw12           = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
+             vvdw             = _fjsp_msub_v2r8( vvdw12,one_twelfth, _fjsp_mul_v2r8(vvdw6,one_sixth) );
+             fvdw             = _fjsp_mul_v2r8(_fjsp_sub_v2r8(vvdw12,vvdw6),rinvsq00);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             vvdw             = _fjsp_unpacklo_v2r8(vvdw,_fjsp_setzero_v2r8());
+             vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+             fscal            = _fjsp_add_v2r8(felec,fvdw);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             
+             fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* COULOMB ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq01,rinv01);
+             felec            = _fjsp_mul_v2r8(velec,rinvsq01);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx01,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy01,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz01,fscal,fiz0);
+             
+             fjx1             = _fjsp_madd_v2r8(dx01,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy01,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz01,fscal,fjz1);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* COULOMB ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq02,rinv02);
+             felec            = _fjsp_mul_v2r8(velec,rinvsq02);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx02,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy02,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz02,fscal,fiz0);
+             
+             fjx2             = _fjsp_madd_v2r8(dx02,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy02,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz02,fscal,fjz2);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* COULOMB ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq10,rinv10);
+             felec            = _fjsp_mul_v2r8(velec,rinvsq10);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+             
+             fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* COULOMB ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq11,rinv11);
+             felec            = _fjsp_mul_v2r8(velec,rinvsq11);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+             
+             fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* COULOMB ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq12,rinv12);
+             felec            = _fjsp_mul_v2r8(velec,rinvsq12);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+             
+             fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* COULOMB ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq20,rinv20);
+             felec            = _fjsp_mul_v2r8(velec,rinvsq20);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+             
+             fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* COULOMB ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq21,rinv21);
+             felec            = _fjsp_mul_v2r8(velec,rinvsq21);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+             
+             fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* COULOMB ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq22,rinv22);
+             felec            = _fjsp_mul_v2r8(velec,rinvsq22);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+             
+             fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+             gmx_fjsp_decrement_3rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+             /* Inner loop uses 291 flops */
+         }
+         /* End of innermost loop */
+         gmx_fjsp_update_iforce_3atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                               f+i_coord_offset,fshift+i_shift_offset);
+         ggid                        = gid[iidx];
+         /* Update potential energies */
+         gmx_fjsp_update_1pot_v2r8(velecsum,kernel_data->energygrp_elec+ggid);
+         gmx_fjsp_update_1pot_v2r8(vvdwsum,kernel_data->energygrp_vdw+ggid);
+         /* Increment number of inner iterations */
+         inneriter                  += j_index_end - j_index_start;
+         /* Outer loop uses 20 flops */
+     }
+     /* Increment number of outer iterations */
+     outeriter        += nri;
+     /* Update outer/inner flops */
+     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_VF,outeriter*20 + inneriter*291);
+ }
+ /*
+  * Gromacs nonbonded kernel:   nb_kernel_ElecCoul_VdwLJ_GeomW3W3_F_sparc64_hpc_ace_double
+  * Electrostatics interaction: Coulomb
+  * VdW interaction:            LennardJones
+  * Geometry:                   Water3-Water3
+  * Calculate force/pot:        Force
+  */
+ void
+ nb_kernel_ElecCoul_VdwLJ_GeomW3W3_F_sparc64_hpc_ace_double
+                     (t_nblist * gmx_restrict                nlist,
+                      rvec * gmx_restrict                    xx,
+                      rvec * gmx_restrict                    ff,
+                      t_forcerec * gmx_restrict              fr,
+                      t_mdatoms * gmx_restrict               mdatoms,
+                      nb_kernel_data_t * gmx_restrict        kernel_data,
+                      t_nrnb * gmx_restrict                  nrnb)
+ {
+     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+      * just 0 for non-waters.
+      * Suffixes A,B refer to j loop unrolling done with double precision SIMD, i.e. the two different
+      * jnr indices corresponding to data put in the two positions in the SIMD register.
+      */
+     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+     int              jnrA,jnrB;
+     int              j_coord_offsetA,j_coord_offsetB;
+     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+     real             rcutoff_scalar;
+     real             *shiftvec,*fshift,*x,*f;
+     _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+     int              vdwioffset0;
+     _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+     int              vdwioffset1;
+     _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+     int              vdwioffset2;
+     _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+     int              vdwjidx0A,vdwjidx0B;
+     _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+     int              vdwjidx1A,vdwjidx1B;
+     _fjsp_v2r8       jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+     int              vdwjidx2A,vdwjidx2B;
+     _fjsp_v2r8       jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+     _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+     _fjsp_v2r8       dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
+     _fjsp_v2r8       dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
+     _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+     _fjsp_v2r8       dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+     _fjsp_v2r8       dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+     _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+     _fjsp_v2r8       dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+     _fjsp_v2r8       dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+     _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+     real             *charge;
+     int              nvdwtype;
+     _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+     int              *vdwtype;
+     real             *vdwparam;
+     _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+     _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+     _fjsp_v2r8       itab_tmp;
+     _fjsp_v2r8       dummy_mask,cutoff_mask;
+     _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+     _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+     union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+     x                = xx[0];
+     f                = ff[0];
+     nri              = nlist->nri;
+     iinr             = nlist->iinr;
+     jindex           = nlist->jindex;
+     jjnr             = nlist->jjnr;
+     shiftidx         = nlist->shift;
+     gid              = nlist->gid;
+     shiftvec         = fr->shift_vec[0];
+     fshift           = fr->fshift[0];
+     facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+     charge           = mdatoms->chargeA;
+     nvdwtype         = fr->ntype;
+     vdwparam         = fr->nbfp;
+     vdwtype          = mdatoms->typeA;
+     /* Setup water-specific parameters */
+     inr              = nlist->iinr[0];
+     iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+0]));
+     iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+     iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+     vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+     jq0              = gmx_fjsp_set1_v2r8(charge[inr+0]);
+     jq1              = gmx_fjsp_set1_v2r8(charge[inr+1]);
+     jq2              = gmx_fjsp_set1_v2r8(charge[inr+2]);
+     vdwjidx0A        = 2*vdwtype[inr+0];
+     qq00             = _fjsp_mul_v2r8(iq0,jq0);
+     c6_00            = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset0+vdwjidx0A]);
+     c12_00           = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset0+vdwjidx0A+1]);
+     qq01             = _fjsp_mul_v2r8(iq0,jq1);
+     qq02             = _fjsp_mul_v2r8(iq0,jq2);
+     qq10             = _fjsp_mul_v2r8(iq1,jq0);
+     qq11             = _fjsp_mul_v2r8(iq1,jq1);
+     qq12             = _fjsp_mul_v2r8(iq1,jq2);
+     qq20             = _fjsp_mul_v2r8(iq2,jq0);
+     qq21             = _fjsp_mul_v2r8(iq2,jq1);
+     qq22             = _fjsp_mul_v2r8(iq2,jq2);
+     /* Avoid stupid compiler warnings */
+     jnrA = jnrB = 0;
+     j_coord_offsetA = 0;
+     j_coord_offsetB = 0;
+     outeriter        = 0;
+     inneriter        = 0;
+     /* Start outer loop over neighborlists */
+     for(iidx=0; iidx<nri; iidx++)
+     {
+         /* Load shift vector for this list */
+         i_shift_offset   = DIM*shiftidx[iidx];
+         /* Load limits for loop over neighbors */
+         j_index_start    = jindex[iidx];
+         j_index_end      = jindex[iidx+1];
+         /* Get outer coordinate index */
+         inr              = iinr[iidx];
+         i_coord_offset   = DIM*inr;
+         /* Load i particle coords and add shift vector */
+         gmx_fjsp_load_shift_and_3rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                  &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+         fix0             = _fjsp_setzero_v2r8();
+         fiy0             = _fjsp_setzero_v2r8();
+         fiz0             = _fjsp_setzero_v2r8();
+         fix1             = _fjsp_setzero_v2r8();
+         fiy1             = _fjsp_setzero_v2r8();
+         fiz1             = _fjsp_setzero_v2r8();
+         fix2             = _fjsp_setzero_v2r8();
+         fiy2             = _fjsp_setzero_v2r8();
+         fiz2             = _fjsp_setzero_v2r8();
+         /* Start inner kernel loop */
+         for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+         {
+             /* Get j neighbor index, and coordinate index */
+             jnrA             = jjnr[jidx];
+             jnrB             = jjnr[jidx+1];
+             j_coord_offsetA  = DIM*jnrA;
+             j_coord_offsetB  = DIM*jnrB;
+             /* load j atom coordinates */
+             gmx_fjsp_load_3rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+             /* Calculate displacement vector */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             dx01             = _fjsp_sub_v2r8(ix0,jx1);
+             dy01             = _fjsp_sub_v2r8(iy0,jy1);
+             dz01             = _fjsp_sub_v2r8(iz0,jz1);
+             dx02             = _fjsp_sub_v2r8(ix0,jx2);
+             dy02             = _fjsp_sub_v2r8(iy0,jy2);
+             dz02             = _fjsp_sub_v2r8(iz0,jz2);
+             dx10             = _fjsp_sub_v2r8(ix1,jx0);
+             dy10             = _fjsp_sub_v2r8(iy1,jy0);
+             dz10             = _fjsp_sub_v2r8(iz1,jz0);
+             dx11             = _fjsp_sub_v2r8(ix1,jx1);
+             dy11             = _fjsp_sub_v2r8(iy1,jy1);
+             dz11             = _fjsp_sub_v2r8(iz1,jz1);
+             dx12             = _fjsp_sub_v2r8(ix1,jx2);
+             dy12             = _fjsp_sub_v2r8(iy1,jy2);
+             dz12             = _fjsp_sub_v2r8(iz1,jz2);
+             dx20             = _fjsp_sub_v2r8(ix2,jx0);
+             dy20             = _fjsp_sub_v2r8(iy2,jy0);
+             dz20             = _fjsp_sub_v2r8(iz2,jz0);
+             dx21             = _fjsp_sub_v2r8(ix2,jx1);
+             dy21             = _fjsp_sub_v2r8(iy2,jy1);
+             dz21             = _fjsp_sub_v2r8(iz2,jz1);
+             dx22             = _fjsp_sub_v2r8(ix2,jx2);
+             dy22             = _fjsp_sub_v2r8(iy2,jy2);
+             dz22             = _fjsp_sub_v2r8(iz2,jz2);
+             /* Calculate squared distance and things based on it */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rsq01            = gmx_fjsp_calc_rsq_v2r8(dx01,dy01,dz01);
+             rsq02            = gmx_fjsp_calc_rsq_v2r8(dx02,dy02,dz02);
+             rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+             rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+             rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+             rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+             rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+             rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+             rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+             rinv01           = gmx_fjsp_invsqrt_v2r8(rsq01);
+             rinv02           = gmx_fjsp_invsqrt_v2r8(rsq02);
+             rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+             rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+             rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+             rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+             rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+             rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+             rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+             rinvsq01         = _fjsp_mul_v2r8(rinv01,rinv01);
+             rinvsq02         = _fjsp_mul_v2r8(rinv02,rinv02);
+             rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+             rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+             rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+             rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+             rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+             rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+             fjx0             = _fjsp_setzero_v2r8();
+             fjy0             = _fjsp_setzero_v2r8();
+             fjz0             = _fjsp_setzero_v2r8();
+             fjx1             = _fjsp_setzero_v2r8();
+             fjy1             = _fjsp_setzero_v2r8();
+             fjz1             = _fjsp_setzero_v2r8();
+             fjx2             = _fjsp_setzero_v2r8();
+             fjy2             = _fjsp_setzero_v2r8();
+             fjz2             = _fjsp_setzero_v2r8();
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* COULOMB ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq00,rinv00);
+             felec            = _fjsp_mul_v2r8(velec,rinvsq00);
+             /* LENNARD-JONES DISPERSION/REPULSION */
+             rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+             fvdw             = _fjsp_mul_v2r8(_fjsp_msub_v2r8(c12_00,rinvsix,c6_00),_fjsp_mul_v2r8(rinvsix,rinvsq00));
+             fscal            = _fjsp_add_v2r8(felec,fvdw);
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             
+             fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* COULOMB ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq01,rinv01);
+             felec            = _fjsp_mul_v2r8(velec,rinvsq01);
+             fscal            = felec;
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx01,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy01,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz01,fscal,fiz0);
+             
+             fjx1             = _fjsp_madd_v2r8(dx01,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy01,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz01,fscal,fjz1);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* COULOMB ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq02,rinv02);
+             felec            = _fjsp_mul_v2r8(velec,rinvsq02);
+             fscal            = felec;
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx02,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy02,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz02,fscal,fiz0);
+             
+             fjx2             = _fjsp_madd_v2r8(dx02,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy02,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz02,fscal,fjz2);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* COULOMB ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq10,rinv10);
+             felec            = _fjsp_mul_v2r8(velec,rinvsq10);
+             fscal            = felec;
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+             
+             fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* COULOMB ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq11,rinv11);
+             felec            = _fjsp_mul_v2r8(velec,rinvsq11);
+             fscal            = felec;
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+             
+             fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* COULOMB ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq12,rinv12);
+             felec            = _fjsp_mul_v2r8(velec,rinvsq12);
+             fscal            = felec;
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+             
+             fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* COULOMB ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq20,rinv20);
+             felec            = _fjsp_mul_v2r8(velec,rinvsq20);
+             fscal            = felec;
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+             
+             fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* COULOMB ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq21,rinv21);
+             felec            = _fjsp_mul_v2r8(velec,rinvsq21);
+             fscal            = felec;
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+             
+             fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* COULOMB ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq22,rinv22);
+             felec            = _fjsp_mul_v2r8(velec,rinvsq22);
+             fscal            = felec;
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+             
+             fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+             gmx_fjsp_decrement_3rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+             /* Inner loop uses 277 flops */
+         }
+         if(jidx<j_index_end)
+         {
+             jnrA             = jjnr[jidx];
+             j_coord_offsetA  = DIM*jnrA;
+             /* load j atom coordinates */
+             gmx_fjsp_load_3rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+             /* Calculate displacement vector */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             dx01             = _fjsp_sub_v2r8(ix0,jx1);
+             dy01             = _fjsp_sub_v2r8(iy0,jy1);
+             dz01             = _fjsp_sub_v2r8(iz0,jz1);
+             dx02             = _fjsp_sub_v2r8(ix0,jx2);
+             dy02             = _fjsp_sub_v2r8(iy0,jy2);
+             dz02             = _fjsp_sub_v2r8(iz0,jz2);
+             dx10             = _fjsp_sub_v2r8(ix1,jx0);
+             dy10             = _fjsp_sub_v2r8(iy1,jy0);
+             dz10             = _fjsp_sub_v2r8(iz1,jz0);
+             dx11             = _fjsp_sub_v2r8(ix1,jx1);
+             dy11             = _fjsp_sub_v2r8(iy1,jy1);
+             dz11             = _fjsp_sub_v2r8(iz1,jz1);
+             dx12             = _fjsp_sub_v2r8(ix1,jx2);
+             dy12             = _fjsp_sub_v2r8(iy1,jy2);
+             dz12             = _fjsp_sub_v2r8(iz1,jz2);
+             dx20             = _fjsp_sub_v2r8(ix2,jx0);
+             dy20             = _fjsp_sub_v2r8(iy2,jy0);
+             dz20             = _fjsp_sub_v2r8(iz2,jz0);
+             dx21             = _fjsp_sub_v2r8(ix2,jx1);
+             dy21             = _fjsp_sub_v2r8(iy2,jy1);
+             dz21             = _fjsp_sub_v2r8(iz2,jz1);
+             dx22             = _fjsp_sub_v2r8(ix2,jx2);
+             dy22             = _fjsp_sub_v2r8(iy2,jy2);
+             dz22             = _fjsp_sub_v2r8(iz2,jz2);
+             /* Calculate squared distance and things based on it */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rsq01            = gmx_fjsp_calc_rsq_v2r8(dx01,dy01,dz01);
+             rsq02            = gmx_fjsp_calc_rsq_v2r8(dx02,dy02,dz02);
+             rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+             rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+             rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+             rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+             rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+             rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+             rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+             rinv01           = gmx_fjsp_invsqrt_v2r8(rsq01);
+             rinv02           = gmx_fjsp_invsqrt_v2r8(rsq02);
+             rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+             rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+             rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+             rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+             rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+             rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+             rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+             rinvsq01         = _fjsp_mul_v2r8(rinv01,rinv01);
+             rinvsq02         = _fjsp_mul_v2r8(rinv02,rinv02);
+             rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+             rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+             rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+             rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+             rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+             rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+             fjx0             = _fjsp_setzero_v2r8();
+             fjy0             = _fjsp_setzero_v2r8();
+             fjz0             = _fjsp_setzero_v2r8();
+             fjx1             = _fjsp_setzero_v2r8();
+             fjy1             = _fjsp_setzero_v2r8();
+             fjz1             = _fjsp_setzero_v2r8();
+             fjx2             = _fjsp_setzero_v2r8();
+             fjy2             = _fjsp_setzero_v2r8();
+             fjz2             = _fjsp_setzero_v2r8();
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* COULOMB ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq00,rinv00);
+             felec            = _fjsp_mul_v2r8(velec,rinvsq00);
+             /* LENNARD-JONES DISPERSION/REPULSION */
+             rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+             fvdw             = _fjsp_mul_v2r8(_fjsp_msub_v2r8(c12_00,rinvsix,c6_00),_fjsp_mul_v2r8(rinvsix,rinvsq00));
+             fscal            = _fjsp_add_v2r8(felec,fvdw);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             
+             fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* COULOMB ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq01,rinv01);
+             felec            = _fjsp_mul_v2r8(velec,rinvsq01);
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx01,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy01,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz01,fscal,fiz0);
+             
+             fjx1             = _fjsp_madd_v2r8(dx01,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy01,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz01,fscal,fjz1);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* COULOMB ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq02,rinv02);
+             felec            = _fjsp_mul_v2r8(velec,rinvsq02);
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx02,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy02,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz02,fscal,fiz0);
+             
+             fjx2             = _fjsp_madd_v2r8(dx02,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy02,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz02,fscal,fjz2);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* COULOMB ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq10,rinv10);
+             felec            = _fjsp_mul_v2r8(velec,rinvsq10);
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+             
+             fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* COULOMB ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq11,rinv11);
+             felec            = _fjsp_mul_v2r8(velec,rinvsq11);
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+             
+             fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* COULOMB ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq12,rinv12);
+             felec            = _fjsp_mul_v2r8(velec,rinvsq12);
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+             
+             fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* COULOMB ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq20,rinv20);
+             felec            = _fjsp_mul_v2r8(velec,rinvsq20);
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+             
+             fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* COULOMB ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq21,rinv21);
+             felec            = _fjsp_mul_v2r8(velec,rinvsq21);
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+             
+             fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* COULOMB ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq22,rinv22);
+             felec            = _fjsp_mul_v2r8(velec,rinvsq22);
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+             
+             fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+             gmx_fjsp_decrement_3rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+             /* Inner loop uses 277 flops */
+         }
+         /* End of innermost loop */
+         gmx_fjsp_update_iforce_3atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                               f+i_coord_offset,fshift+i_shift_offset);
+         /* Increment number of inner iterations */
+         inneriter                  += j_index_end - j_index_start;
+         /* Outer loop uses 18 flops */
+     }
+     /* Increment number of outer iterations */
+     outeriter        += nri;
+     /* Update outer/inner flops */
+     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_F,outeriter*18 + inneriter*277);
+ }
index 0000000000000000000000000000000000000000,a52b05d0ec9205880502c449127367a79106f692..a52b05d0ec9205880502c449127367a79106f692
mode 000000,100644..100644
--- /dev/null
@@@ -1,0 -1,963 +1,963 @@@
+ /*
+  * This file is part of the GROMACS molecular simulation package.
+  *
+  * Copyright (c) 2012, by the GROMACS development team, led by
+  * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+  * others, as listed in the AUTHORS file in the top-level source
+  * directory and at http://www.gromacs.org.
+  *
+  * GROMACS is free software; you can redistribute it and/or
+  * modify it under the terms of the GNU Lesser General Public License
+  * as published by the Free Software Foundation; either version 2.1
+  * of the License, or (at your option) any later version.
+  *
+  * GROMACS is distributed in the hope that it will be useful,
+  * but WITHOUT ANY WARRANTY; without even the implied warranty of
+  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  * Lesser General Public License for more details.
+  *
+  * You should have received a copy of the GNU Lesser General Public
+  * License along with GROMACS; if not, see
+  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+  *
+  * If you want to redistribute modifications to GROMACS, please
+  * consider that scientific software is very special. Version
+  * control is crucial - bugs must be traceable. We will be happy to
+  * consider code for inclusion in the official distribution, but
+  * derived work must not be called official GROMACS. Details are found
+  * in the README & COPYING files - if they are missing, get the
+  * official version at http://www.gromacs.org.
+  *
+  * To help us fund GROMACS development, we humbly ask that you cite
+  * the research papers on the package. Check out http://www.gromacs.org.
+  */
+ /*
+  * Note: this file was generated by the GROMACS sparc64_hpc_ace_double kernel generator.
+  */
+ #ifdef HAVE_CONFIG_H
+ #include <config.h>
+ #endif
+ #include <math.h>
+ #include "../nb_kernel.h"
+ #include "types/simple.h"
+ #include "vec.h"
+ #include "nrnb.h"
+ #include "kernelutil_sparc64_hpc_ace_double.h"
+ /*
+  * Gromacs nonbonded kernel:   nb_kernel_ElecCoul_VdwLJ_GeomW4P1_VF_sparc64_hpc_ace_double
+  * Electrostatics interaction: Coulomb (velec = qq*rinv, evaluated for i atoms 1-3 against the j atom)
+  * VdW interaction:            LennardJones (evaluated for i atom 0 against the j atom)
+  * Geometry:                   Water4-Particle (four i atoms per list entry, single-atom j entries)
+  * Calculate force/pot:        PotentialAndForce (forces via ff and fr->fshift, energies via kernel_data->energygrp_*)
+  */
+ void
+ nb_kernel_ElecCoul_VdwLJ_GeomW4P1_VF_sparc64_hpc_ace_double
+                     (t_nblist * gmx_restrict                nlist,
+                      rvec * gmx_restrict                    xx,
+                      rvec * gmx_restrict                    ff,
+                      t_forcerec * gmx_restrict              fr,
+                      t_mdatoms * gmx_restrict               mdatoms,
+                      nb_kernel_data_t * gmx_restrict        kernel_data,
+                      t_nrnb * gmx_restrict                  nrnb)
+ {
+     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+      * just 0 for non-waters.
+      * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+      * jnr indices corresponding to data put in the two positions in the SIMD register.
+      */
+     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+     int              jnrA,jnrB;
+     int              j_coord_offsetA,j_coord_offsetB;
+     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+     real             rcutoff_scalar;
+     real             *shiftvec,*fshift,*x,*f;
+     _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall; /* only fscal is referenced below */
+     int              vdwioffset0;
+     _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+     int              vdwioffset1;
+     _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+     int              vdwioffset2;
+     _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+     int              vdwioffset3;
+     _fjsp_v2r8       ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+     int              vdwjidx0A,vdwjidx0B;
+     _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+     _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+     _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+     _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+     _fjsp_v2r8       dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30;
+     _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+     real             *charge;
+     int              nvdwtype;
+     _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+     int              *vdwtype;
+     real             *vdwparam;
+     _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+     _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+     _fjsp_v2r8       itab_tmp;
+     _fjsp_v2r8       dummy_mask,cutoff_mask;
+     _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+     _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+     union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv; /* not referenced in this kernel */
+     x                = xx[0]; /* flat real* view of the coordinates */
+     f                = ff[0]; /* flat real* view of the forces */
+     nri              = nlist->nri;
+     iinr             = nlist->iinr;
+     jindex           = nlist->jindex;
+     jjnr             = nlist->jjnr;
+     shiftidx         = nlist->shift;
+     gid              = nlist->gid;
+     shiftvec         = fr->shift_vec[0];
+     fshift           = fr->fshift[0];
+     facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+     charge           = mdatoms->chargeA;
+     nvdwtype         = fr->ntype;
+     vdwparam         = fr->nbfp;
+     vdwtype          = mdatoms->typeA;
+     /* Setup water-specific parameters from the first i entry; they are reused unchanged for every i entry below */
+     inr              = nlist->iinr[0]; /* NOTE(review): read even when nri is 0 - confirm callers never pass an empty list */
+     iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+     iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+     iq3              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+3]));
+     vdwioffset0      = 2*nvdwtype*vdwtype[inr+0]; /* offset into vdwparam for i atom 0 */
+     /* Initialize to silence possibly-uninitialized compiler warnings */
+     jnrA = jnrB = 0;
+     j_coord_offsetA = 0;
+     j_coord_offsetB = 0;
+     outeriter        = 0;
+     inneriter        = 0;
+     /* Start outer loop over neighborlists */
+     for(iidx=0; iidx<nri; iidx++)
+     {
+         /* Load shift vector for this list */
+         i_shift_offset   = DIM*shiftidx[iidx];
+         /* Load limits for loop over neighbors */
+         j_index_start    = jindex[iidx];
+         j_index_end      = jindex[iidx+1];
+         /* Get outer coordinate index */
+         inr              = iinr[iidx];
+         i_coord_offset   = DIM*inr;
+         /* Load i particle coords and add shift vector */
+         gmx_fjsp_load_shift_and_4rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                  &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+         fix0             = _fjsp_setzero_v2r8();
+         fiy0             = _fjsp_setzero_v2r8();
+         fiz0             = _fjsp_setzero_v2r8();
+         fix1             = _fjsp_setzero_v2r8();
+         fiy1             = _fjsp_setzero_v2r8();
+         fiz1             = _fjsp_setzero_v2r8();
+         fix2             = _fjsp_setzero_v2r8();
+         fiy2             = _fjsp_setzero_v2r8();
+         fiz2             = _fjsp_setzero_v2r8();
+         fix3             = _fjsp_setzero_v2r8();
+         fiy3             = _fjsp_setzero_v2r8();
+         fiz3             = _fjsp_setzero_v2r8();
+         /* Reset potential sums */
+         velecsum         = _fjsp_setzero_v2r8();
+         vvdwsum          = _fjsp_setzero_v2r8();
+         /* Start inner kernel loop: two j atoms (A and B) are processed per iteration */
+         for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+         {
+             /* Get j neighbor index, and coordinate index */
+             jnrA             = jjnr[jidx];
+             jnrB             = jjnr[jidx+1];
+             j_coord_offsetA  = DIM*jnrA;
+             j_coord_offsetB  = DIM*jnrB;
+             /* load j atom coordinates */
+             gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                               &jx0,&jy0,&jz0);
+             /* Calculate displacement vector */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             dx10             = _fjsp_sub_v2r8(ix1,jx0);
+             dy10             = _fjsp_sub_v2r8(iy1,jy0);
+             dz10             = _fjsp_sub_v2r8(iz1,jz0);
+             dx20             = _fjsp_sub_v2r8(ix2,jx0);
+             dy20             = _fjsp_sub_v2r8(iy2,jy0);
+             dz20             = _fjsp_sub_v2r8(iz2,jz0);
+             dx30             = _fjsp_sub_v2r8(ix3,jx0);
+             dy30             = _fjsp_sub_v2r8(iy3,jy0);
+             dz30             = _fjsp_sub_v2r8(iz3,jz0);
+             /* Calculate squared distance and things based on it (rinv for the Coulomb pairs, rinvsq for all pairs) */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+             rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+             rsq30            = gmx_fjsp_calc_rsq_v2r8(dx30,dy30,dz30);
+             rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+             rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+             rinv30           = gmx_fjsp_invsqrt_v2r8(rsq30);
+             rinvsq00         = gmx_fjsp_inv_v2r8(rsq00);
+             rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+             rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+             rinvsq30         = _fjsp_mul_v2r8(rinv30,rinv30);
+             /* Load parameters for j particles and reset the j force accumulators */
+             jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+             vdwjidx0A        = 2*vdwtype[jnrA+0];
+             vdwjidx0B        = 2*vdwtype[jnrB+0];
+             fjx0             = _fjsp_setzero_v2r8();
+             fjy0             = _fjsp_setzero_v2r8();
+             fjz0             = _fjsp_setzero_v2r8();
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* Compute parameters for interactions between i atom 0 and the j atoms (Lennard-Jones only) */
+             gmx_fjsp_load_2pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,
+                                          vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
+             /* LENNARD-JONES DISPERSION/REPULSION */
+             rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+             vvdw6            = _fjsp_mul_v2r8(c6_00,rinvsix);
+             vvdw12           = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
+             vvdw             = _fjsp_msub_v2r8( vvdw12,one_twelfth, _fjsp_mul_v2r8(vvdw6,one_sixth) );
+             fvdw             = _fjsp_mul_v2r8(_fjsp_sub_v2r8(vvdw12,vvdw6),rinvsq00);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+             fscal            = fvdw;
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             
+             fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* Compute parameters for interactions between i atom 1 and the j atoms (Coulomb only) */
+             qq10             = _fjsp_mul_v2r8(iq1,jq0);
+             /* COULOMB ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq10,rinv10);
+             felec            = _fjsp_mul_v2r8(velec,rinvsq10);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+             
+             fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* Compute parameters for interactions between i atom 2 and the j atoms (Coulomb only) */
+             qq20             = _fjsp_mul_v2r8(iq2,jq0);
+             /* COULOMB ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq20,rinv20);
+             felec            = _fjsp_mul_v2r8(velec,rinvsq20);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+             
+             fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* Compute parameters for interactions between i atom 3 and the j atoms (Coulomb only) */
+             qq30             = _fjsp_mul_v2r8(iq3,jq0);
+             /* COULOMB ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq30,rinv30);
+             felec            = _fjsp_mul_v2r8(velec,rinvsq30);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             /* Update vectorial force */
+             fix3             = _fjsp_madd_v2r8(dx30,fscal,fix3);
+             fiy3             = _fjsp_madd_v2r8(dy30,fscal,fiy3);
+             fiz3             = _fjsp_madd_v2r8(dz30,fscal,fiz3);
+             
+             fjx0             = _fjsp_madd_v2r8(dx30,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy30,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz30,fscal,fjz0);
+             gmx_fjsp_decrement_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0);
+             /* Inner loop uses 131 flops */
+         }
+         if(jidx<j_index_end) /* one j atom is left over when the j count is odd */
+         {
+             jnrA             = jjnr[jidx];
+             j_coord_offsetA  = DIM*jnrA;
+             /* load j atom coordinates (single j atom; there is no B entry in this pass) */
+             gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                               &jx0,&jy0,&jz0);
+             /* Calculate displacement vector */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             dx10             = _fjsp_sub_v2r8(ix1,jx0);
+             dy10             = _fjsp_sub_v2r8(iy1,jy0);
+             dz10             = _fjsp_sub_v2r8(iz1,jz0);
+             dx20             = _fjsp_sub_v2r8(ix2,jx0);
+             dy20             = _fjsp_sub_v2r8(iy2,jy0);
+             dz20             = _fjsp_sub_v2r8(iz2,jz0);
+             dx30             = _fjsp_sub_v2r8(ix3,jx0);
+             dy30             = _fjsp_sub_v2r8(iy3,jy0);
+             dz30             = _fjsp_sub_v2r8(iz3,jz0);
+             /* Calculate squared distance and things based on it (rinv for the Coulomb pairs, rinvsq for all pairs) */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+             rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+             rsq30            = gmx_fjsp_calc_rsq_v2r8(dx30,dy30,dz30);
+             rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+             rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+             rinv30           = gmx_fjsp_invsqrt_v2r8(rsq30);
+             rinvsq00         = gmx_fjsp_inv_v2r8(rsq00);
+             rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+             rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+             rinvsq30         = _fjsp_mul_v2r8(rinv30,rinv30);
+             /* Load parameters for j particles and reset the j force accumulators */
+             jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
+             vdwjidx0A        = 2*vdwtype[jnrA+0];
+             fjx0             = _fjsp_setzero_v2r8();
+             fjy0             = _fjsp_setzero_v2r8();
+             fjz0             = _fjsp_setzero_v2r8();
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* Compute parameters for interactions between i atom 0 and the j atom (Lennard-Jones only) */
+             gmx_fjsp_load_1pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
+             /* LENNARD-JONES DISPERSION/REPULSION */
+             rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+             vvdw6            = _fjsp_mul_v2r8(c6_00,rinvsix);
+             vvdw12           = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
+             vvdw             = _fjsp_msub_v2r8( vvdw12,one_twelfth, _fjsp_mul_v2r8(vvdw6,one_sixth) );
+             fvdw             = _fjsp_mul_v2r8(_fjsp_sub_v2r8(vvdw12,vvdw6),rinvsq00);
+             /* Update potential sum for this i atom (unpacklo with zero presumably clears the lane without a j atom - TODO confirm). */
+             vvdw             = _fjsp_unpacklo_v2r8(vvdw,_fjsp_setzero_v2r8());
+             vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+             fscal            = fvdw;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             
+             fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* Compute parameters for interactions between i atom 1 and the j atom (Coulomb only) */
+             qq10             = _fjsp_mul_v2r8(iq1,jq0);
+             /* COULOMB ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq10,rinv10);
+             felec            = _fjsp_mul_v2r8(velec,rinvsq10);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+             
+             fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* Compute parameters for interactions between i atom 2 and the j atom (Coulomb only) */
+             qq20             = _fjsp_mul_v2r8(iq2,jq0);
+             /* COULOMB ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq20,rinv20);
+             felec            = _fjsp_mul_v2r8(velec,rinvsq20);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+             
+             fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* Compute parameters for interactions between i atom 3 and the j atom (Coulomb only) */
+             qq30             = _fjsp_mul_v2r8(iq3,jq0);
+             /* COULOMB ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq30,rinv30);
+             felec            = _fjsp_mul_v2r8(velec,rinvsq30);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix3             = _fjsp_madd_v2r8(dx30,fscal,fix3);
+             fiy3             = _fjsp_madd_v2r8(dy30,fscal,fiy3);
+             fiz3             = _fjsp_madd_v2r8(dz30,fscal,fiz3);
+             
+             fjx0             = _fjsp_madd_v2r8(dx30,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy30,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz30,fscal,fjz0);
+             gmx_fjsp_decrement_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0);
+             /* Inner loop uses 131 flops */
+         }
+         /* End of innermost loop */
+         gmx_fjsp_update_iforce_4atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                               f+i_coord_offset,fshift+i_shift_offset);
+         ggid                        = gid[iidx];
+         /* Update potential energies */
+         gmx_fjsp_update_1pot_v2r8(velecsum,kernel_data->energygrp_elec+ggid);
+         gmx_fjsp_update_1pot_v2r8(vvdwsum,kernel_data->energygrp_vdw+ggid);
+         /* Increment number of inner iterations (one per j atom) */
+         inneriter                  += j_index_end - j_index_start;
+         /* Outer loop uses 26 flops */
+     }
+     /* Increment number of outer iterations */
+     outeriter        += nri;
+     /* Update outer/inner flops */
+     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_VF,outeriter*26 + inneriter*131);
+ }
+ /*
+  * Gromacs nonbonded kernel:   nb_kernel_ElecCoul_VdwLJ_GeomW4P1_F_sparc64_hpc_ace_double
+  * Electrostatics interaction: Coulomb
+  * VdW interaction:            LennardJones
+  * Geometry:                   Water4-Particle
+  * Calculate force/pot:        Force
+  */
+ void
+ nb_kernel_ElecCoul_VdwLJ_GeomW4P1_F_sparc64_hpc_ace_double
+                     (t_nblist * gmx_restrict                nlist,
+                      rvec * gmx_restrict                    xx,
+                      rvec * gmx_restrict                    ff,
+                      t_forcerec * gmx_restrict              fr,
+                      t_mdatoms * gmx_restrict               mdatoms,
+                      nb_kernel_data_t * gmx_restrict        kernel_data,
+                      t_nrnb * gmx_restrict                  nrnb)
+ {
+     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+      * just 0 for non-waters.
+      * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+      * jnr indices corresponding to data put in the two positions in the SIMD register.
+      */
+     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+     int              jnrA,jnrB;
+     int              j_coord_offsetA,j_coord_offsetB;
+     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+     real             rcutoff_scalar;
+     real             *shiftvec,*fshift,*x,*f;
+     _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+     int              vdwioffset0;
+     _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+     int              vdwioffset1;
+     _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+     int              vdwioffset2;
+     _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+     int              vdwioffset3;
+     _fjsp_v2r8       ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+     int              vdwjidx0A,vdwjidx0B;
+     _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+     _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+     _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+     _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+     _fjsp_v2r8       dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30;
+     _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+     real             *charge;
+     int              nvdwtype;
+     _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+     int              *vdwtype;
+     real             *vdwparam;
+     _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+     _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+     _fjsp_v2r8       itab_tmp;
+     _fjsp_v2r8       dummy_mask,cutoff_mask;
+     _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+     _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+     union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+     x                = xx[0];
+     f                = ff[0];
+     nri              = nlist->nri;
+     iinr             = nlist->iinr;
+     jindex           = nlist->jindex;
+     jjnr             = nlist->jjnr;
+     shiftidx         = nlist->shift;
+     gid              = nlist->gid;
+     shiftvec         = fr->shift_vec[0];
+     fshift           = fr->fshift[0];
+     facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+     charge           = mdatoms->chargeA;
+     nvdwtype         = fr->ntype;
+     vdwparam         = fr->nbfp;
+     vdwtype          = mdatoms->typeA;
+     /* Setup water-specific parameters */
+     inr              = nlist->iinr[0];
+     iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+     iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+     iq3              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+3]));
+     vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+     /* Avoid stupid compiler warnings */
+     jnrA = jnrB = 0;
+     j_coord_offsetA = 0;
+     j_coord_offsetB = 0;
+     outeriter        = 0;
+     inneriter        = 0;
+     /* Start outer loop over neighborlists */
+     for(iidx=0; iidx<nri; iidx++)
+     {
+         /* Load shift vector for this list */
+         i_shift_offset   = DIM*shiftidx[iidx];
+         /* Load limits for loop over neighbors */
+         j_index_start    = jindex[iidx];
+         j_index_end      = jindex[iidx+1];
+         /* Get outer coordinate index */
+         inr              = iinr[iidx];
+         i_coord_offset   = DIM*inr;
+         /* Load i particle coords and add shift vector */
+         gmx_fjsp_load_shift_and_4rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                  &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+         fix0             = _fjsp_setzero_v2r8();
+         fiy0             = _fjsp_setzero_v2r8();
+         fiz0             = _fjsp_setzero_v2r8();
+         fix1             = _fjsp_setzero_v2r8();
+         fiy1             = _fjsp_setzero_v2r8();
+         fiz1             = _fjsp_setzero_v2r8();
+         fix2             = _fjsp_setzero_v2r8();
+         fiy2             = _fjsp_setzero_v2r8();
+         fiz2             = _fjsp_setzero_v2r8();
+         fix3             = _fjsp_setzero_v2r8();
+         fiy3             = _fjsp_setzero_v2r8();
+         fiz3             = _fjsp_setzero_v2r8();
+         /* Start inner kernel loop */
+         for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+         {
+             /* Get j neighbor index, and coordinate index */
+             jnrA             = jjnr[jidx];
+             jnrB             = jjnr[jidx+1];
+             j_coord_offsetA  = DIM*jnrA;
+             j_coord_offsetB  = DIM*jnrB;
+             /* load j atom coordinates */
+             gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                               &jx0,&jy0,&jz0);
+             /* Calculate displacement vector */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             dx10             = _fjsp_sub_v2r8(ix1,jx0);
+             dy10             = _fjsp_sub_v2r8(iy1,jy0);
+             dz10             = _fjsp_sub_v2r8(iz1,jz0);
+             dx20             = _fjsp_sub_v2r8(ix2,jx0);
+             dy20             = _fjsp_sub_v2r8(iy2,jy0);
+             dz20             = _fjsp_sub_v2r8(iz2,jz0);
+             dx30             = _fjsp_sub_v2r8(ix3,jx0);
+             dy30             = _fjsp_sub_v2r8(iy3,jy0);
+             dz30             = _fjsp_sub_v2r8(iz3,jz0);
+             /* Calculate squared distance and things based on it */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+             rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+             rsq30            = gmx_fjsp_calc_rsq_v2r8(dx30,dy30,dz30);
+             rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+             rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+             rinv30           = gmx_fjsp_invsqrt_v2r8(rsq30);
+             rinvsq00         = gmx_fjsp_inv_v2r8(rsq00);
+             rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+             rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+             rinvsq30         = _fjsp_mul_v2r8(rinv30,rinv30);
+             /* Load parameters for j particles */
+             jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+             vdwjidx0A        = 2*vdwtype[jnrA+0];
+             vdwjidx0B        = 2*vdwtype[jnrB+0];
+             fjx0             = _fjsp_setzero_v2r8();
+             fjy0             = _fjsp_setzero_v2r8();
+             fjz0             = _fjsp_setzero_v2r8();
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* Compute parameters for interactions between i and j atoms */
+             gmx_fjsp_load_2pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,
+                                          vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
+             /* LENNARD-JONES DISPERSION/REPULSION */
+             rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+             fvdw             = _fjsp_mul_v2r8(_fjsp_msub_v2r8(c12_00,rinvsix,c6_00),_fjsp_mul_v2r8(rinvsix,rinvsq00));
+             fscal            = fvdw;
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             
+             fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* Compute parameters for interactions between i and j atoms */
+             qq10             = _fjsp_mul_v2r8(iq1,jq0);
+             /* COULOMB ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq10,rinv10);
+             felec            = _fjsp_mul_v2r8(velec,rinvsq10);
+             fscal            = felec;
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+             
+             fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* Compute parameters for interactions between i and j atoms */
+             qq20             = _fjsp_mul_v2r8(iq2,jq0);
+             /* COULOMB ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq20,rinv20);
+             felec            = _fjsp_mul_v2r8(velec,rinvsq20);
+             fscal            = felec;
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+             
+             fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* Compute parameters for interactions between i and j atoms */
+             qq30             = _fjsp_mul_v2r8(iq3,jq0);
+             /* COULOMB ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq30,rinv30);
+             felec            = _fjsp_mul_v2r8(velec,rinvsq30);
+             fscal            = felec;
+             /* Update vectorial force */
+             fix3             = _fjsp_madd_v2r8(dx30,fscal,fix3);
+             fiy3             = _fjsp_madd_v2r8(dy30,fscal,fiy3);
+             fiz3             = _fjsp_madd_v2r8(dz30,fscal,fiz3);
+             
+             fjx0             = _fjsp_madd_v2r8(dx30,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy30,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz30,fscal,fjz0);
+             gmx_fjsp_decrement_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0);
+             /* Inner loop uses 123 flops */
+         }
+         if(jidx<j_index_end)
+         {
+             jnrA             = jjnr[jidx];
+             j_coord_offsetA  = DIM*jnrA;
+             /* load j atom coordinates */
+             gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                               &jx0,&jy0,&jz0);
+             /* Calculate displacement vector */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             dx10             = _fjsp_sub_v2r8(ix1,jx0);
+             dy10             = _fjsp_sub_v2r8(iy1,jy0);
+             dz10             = _fjsp_sub_v2r8(iz1,jz0);
+             dx20             = _fjsp_sub_v2r8(ix2,jx0);
+             dy20             = _fjsp_sub_v2r8(iy2,jy0);
+             dz20             = _fjsp_sub_v2r8(iz2,jz0);
+             dx30             = _fjsp_sub_v2r8(ix3,jx0);
+             dy30             = _fjsp_sub_v2r8(iy3,jy0);
+             dz30             = _fjsp_sub_v2r8(iz3,jz0);
+             /* Calculate squared distance and things based on it */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+             rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+             rsq30            = gmx_fjsp_calc_rsq_v2r8(dx30,dy30,dz30);
+             rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+             rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+             rinv30           = gmx_fjsp_invsqrt_v2r8(rsq30);
+             rinvsq00         = gmx_fjsp_inv_v2r8(rsq00);
+             rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+             rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+             rinvsq30         = _fjsp_mul_v2r8(rinv30,rinv30);
+             /* Load parameters for j particles */
+             jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
+             vdwjidx0A        = 2*vdwtype[jnrA+0];
+             fjx0             = _fjsp_setzero_v2r8();
+             fjy0             = _fjsp_setzero_v2r8();
+             fjz0             = _fjsp_setzero_v2r8();
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* Compute parameters for interactions between i and j atoms */
+             gmx_fjsp_load_1pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
+             /* LENNARD-JONES DISPERSION/REPULSION */
+             rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+             fvdw             = _fjsp_mul_v2r8(_fjsp_msub_v2r8(c12_00,rinvsix,c6_00),_fjsp_mul_v2r8(rinvsix,rinvsq00));
+             fscal            = fvdw;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             
+             fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* Compute parameters for interactions between i and j atoms */
+             qq10             = _fjsp_mul_v2r8(iq1,jq0);
+             /* COULOMB ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq10,rinv10);
+             felec            = _fjsp_mul_v2r8(velec,rinvsq10);
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+             
+             fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* Compute parameters for interactions between i and j atoms */
+             qq20             = _fjsp_mul_v2r8(iq2,jq0);
+             /* COULOMB ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq20,rinv20);
+             felec            = _fjsp_mul_v2r8(velec,rinvsq20);
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+             
+             fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* Compute parameters for interactions between i and j atoms */
+             qq30             = _fjsp_mul_v2r8(iq3,jq0);
+             /* COULOMB ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq30,rinv30);
+             felec            = _fjsp_mul_v2r8(velec,rinvsq30);
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix3             = _fjsp_madd_v2r8(dx30,fscal,fix3);
+             fiy3             = _fjsp_madd_v2r8(dy30,fscal,fiy3);
+             fiz3             = _fjsp_madd_v2r8(dz30,fscal,fiz3);
+             
+             fjx0             = _fjsp_madd_v2r8(dx30,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy30,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz30,fscal,fjz0);
+             gmx_fjsp_decrement_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0);
+             /* Inner loop uses 123 flops */
+         }
+         /* End of innermost loop */
+         gmx_fjsp_update_iforce_4atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                               f+i_coord_offset,fshift+i_shift_offset);
+         /* Increment number of inner iterations */
+         inneriter                  += j_index_end - j_index_start;
+         /* Outer loop uses 24 flops */
+     }
+     /* Increment number of outer iterations */
+     outeriter        += nri;
+     /* Update outer/inner flops */
+     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_F,outeriter*24 + inneriter*123);
+ }
index 0000000000000000000000000000000000000000,44e358095ce86564ab2875914e99c0c22ba89bc2..44e358095ce86564ab2875914e99c0c22ba89bc2
mode 000000,100644..100644
--- /dev/null
@@@ -1,0 -1,1657 +1,1657 @@@
+ /*
+  * This file is part of the GROMACS molecular simulation package.
+  *
+  * Copyright (c) 2012, by the GROMACS development team, led by
+  * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+  * others, as listed in the AUTHORS file in the top-level source
+  * directory and at http://www.gromacs.org.
+  *
+  * GROMACS is free software; you can redistribute it and/or
+  * modify it under the terms of the GNU Lesser General Public License
+  * as published by the Free Software Foundation; either version 2.1
+  * of the License, or (at your option) any later version.
+  *
+  * GROMACS is distributed in the hope that it will be useful,
+  * but WITHOUT ANY WARRANTY; without even the implied warranty of
+  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  * Lesser General Public License for more details.
+  *
+  * You should have received a copy of the GNU Lesser General Public
+  * License along with GROMACS; if not, see
+  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+  *
+  * If you want to redistribute modifications to GROMACS, please
+  * consider that scientific software is very special. Version
+  * control is crucial - bugs must be traceable. We will be happy to
+  * consider code for inclusion in the official distribution, but
+  * derived work must not be called official GROMACS. Details are found
+  * in the README & COPYING files - if they are missing, get the
+  * official version at http://www.gromacs.org.
+  *
+  * To help us fund GROMACS development, we humbly ask that you cite
+  * the research papers on the package. Check out http://www.gromacs.org.
+  */
+ /*
+  * Note: this file was generated by the GROMACS sparc64_hpc_ace_double kernel generator.
+  */
+ #ifdef HAVE_CONFIG_H
+ #include <config.h>
+ #endif
+ #include <math.h>
+ #include "../nb_kernel.h"
+ #include "types/simple.h"
+ #include "vec.h"
+ #include "nrnb.h"
+ #include "kernelutil_sparc64_hpc_ace_double.h"
+ /*
+  * Gromacs nonbonded kernel:   nb_kernel_ElecCoul_VdwLJ_GeomW4W4_VF_sparc64_hpc_ace_double
+  * Electrostatics interaction: Coulomb
+  * VdW interaction:            LennardJones
+  * Geometry:                   Water4-Water4
+  * Calculate force/pot:        PotentialAndForce
+  */
+ void
+ nb_kernel_ElecCoul_VdwLJ_GeomW4W4_VF_sparc64_hpc_ace_double
+                     (t_nblist * gmx_restrict                nlist,
+                      rvec * gmx_restrict                    xx,
+                      rvec * gmx_restrict                    ff,
+                      t_forcerec * gmx_restrict              fr,
+                      t_mdatoms * gmx_restrict               mdatoms,
+                      nb_kernel_data_t * gmx_restrict        kernel_data,
+                      t_nrnb * gmx_restrict                  nrnb)
+ {
+     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+      * just 0 for non-waters.
+      * Suffixes A,B refer to j loop unrolling done with double precision SIMD, i.e. the two different
+      * jnr indices corresponding to data put in the two positions in the SIMD register.
+      */
+     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+     int              jnrA,jnrB;
+     int              j_coord_offsetA,j_coord_offsetB;
+     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+     real             rcutoff_scalar;
+     real             *shiftvec,*fshift,*x,*f;
+     _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+     int              vdwioffset0;
+     _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+     int              vdwioffset1;
+     _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+     int              vdwioffset2;
+     _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+     int              vdwioffset3;
+     _fjsp_v2r8       ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+     int              vdwjidx0A,vdwjidx0B;
+     _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+     int              vdwjidx1A,vdwjidx1B;
+     _fjsp_v2r8       jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+     int              vdwjidx2A,vdwjidx2B;
+     _fjsp_v2r8       jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+     int              vdwjidx3A,vdwjidx3B;
+     _fjsp_v2r8       jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
+     _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+     _fjsp_v2r8       dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+     _fjsp_v2r8       dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+     _fjsp_v2r8       dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
+     _fjsp_v2r8       dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+     _fjsp_v2r8       dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+     _fjsp_v2r8       dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
+     _fjsp_v2r8       dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
+     _fjsp_v2r8       dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
+     _fjsp_v2r8       dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
+     _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+     real             *charge;
+     int              nvdwtype;
+     _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+     int              *vdwtype;
+     real             *vdwparam;
+     _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+     _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+     _fjsp_v2r8       itab_tmp;
+     _fjsp_v2r8       dummy_mask,cutoff_mask;
+     _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+     _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+     union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+     x                = xx[0];
+     f                = ff[0];
+     nri              = nlist->nri;
+     iinr             = nlist->iinr;
+     jindex           = nlist->jindex;
+     jjnr             = nlist->jjnr;
+     shiftidx         = nlist->shift;
+     gid              = nlist->gid;
+     shiftvec         = fr->shift_vec[0];
+     fshift           = fr->fshift[0];
+     facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+     charge           = mdatoms->chargeA;
+     nvdwtype         = fr->ntype;
+     vdwparam         = fr->nbfp;
+     vdwtype          = mdatoms->typeA;
+     /* Setup water-specific parameters */
+     inr              = nlist->iinr[0];
+     iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+     iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+     iq3              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+3]));
+     vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+     jq1              = gmx_fjsp_set1_v2r8(charge[inr+1]);
+     jq2              = gmx_fjsp_set1_v2r8(charge[inr+2]);
+     jq3              = gmx_fjsp_set1_v2r8(charge[inr+3]);
+     vdwjidx0A        = 2*vdwtype[inr+0];
+     c6_00            = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset0+vdwjidx0A]);
+     c12_00           = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset0+vdwjidx0A+1]);
+     qq11             = _fjsp_mul_v2r8(iq1,jq1);
+     qq12             = _fjsp_mul_v2r8(iq1,jq2);
+     qq13             = _fjsp_mul_v2r8(iq1,jq3);
+     qq21             = _fjsp_mul_v2r8(iq2,jq1);
+     qq22             = _fjsp_mul_v2r8(iq2,jq2);
+     qq23             = _fjsp_mul_v2r8(iq2,jq3);
+     qq31             = _fjsp_mul_v2r8(iq3,jq1);
+     qq32             = _fjsp_mul_v2r8(iq3,jq2);
+     qq33             = _fjsp_mul_v2r8(iq3,jq3);
+     /* Avoid stupid compiler warnings */
+     jnrA = jnrB = 0;
+     j_coord_offsetA = 0;
+     j_coord_offsetB = 0;
+     outeriter        = 0;
+     inneriter        = 0;
+     /* Start outer loop over neighborlists */
+     for(iidx=0; iidx<nri; iidx++)
+     {
+         /* Load shift vector for this list */
+         i_shift_offset   = DIM*shiftidx[iidx];
+         /* Load limits for loop over neighbors */
+         j_index_start    = jindex[iidx];
+         j_index_end      = jindex[iidx+1];
+         /* Get outer coordinate index */
+         inr              = iinr[iidx];
+         i_coord_offset   = DIM*inr;
+         /* Load i particle coords and add shift vector */
+         gmx_fjsp_load_shift_and_4rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                  &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+         fix0             = _fjsp_setzero_v2r8();
+         fiy0             = _fjsp_setzero_v2r8();
+         fiz0             = _fjsp_setzero_v2r8();
+         fix1             = _fjsp_setzero_v2r8();
+         fiy1             = _fjsp_setzero_v2r8();
+         fiz1             = _fjsp_setzero_v2r8();
+         fix2             = _fjsp_setzero_v2r8();
+         fiy2             = _fjsp_setzero_v2r8();
+         fiz2             = _fjsp_setzero_v2r8();
+         fix3             = _fjsp_setzero_v2r8();
+         fiy3             = _fjsp_setzero_v2r8();
+         fiz3             = _fjsp_setzero_v2r8();
+         /* Reset potential sums */
+         velecsum         = _fjsp_setzero_v2r8();
+         vvdwsum          = _fjsp_setzero_v2r8();
+         /* Start inner kernel loop */
+         for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+         {
+             /* Get j neighbor index, and coordinate index */
+             jnrA             = jjnr[jidx];
+             jnrB             = jjnr[jidx+1];
+             j_coord_offsetA  = DIM*jnrA;
+             j_coord_offsetB  = DIM*jnrB;
+             /* load j atom coordinates */
+             gmx_fjsp_load_4rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
+                                               &jy2,&jz2,&jx3,&jy3,&jz3);
+             /* Calculate displacement vector */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             dx11             = _fjsp_sub_v2r8(ix1,jx1);
+             dy11             = _fjsp_sub_v2r8(iy1,jy1);
+             dz11             = _fjsp_sub_v2r8(iz1,jz1);
+             dx12             = _fjsp_sub_v2r8(ix1,jx2);
+             dy12             = _fjsp_sub_v2r8(iy1,jy2);
+             dz12             = _fjsp_sub_v2r8(iz1,jz2);
+             dx13             = _fjsp_sub_v2r8(ix1,jx3);
+             dy13             = _fjsp_sub_v2r8(iy1,jy3);
+             dz13             = _fjsp_sub_v2r8(iz1,jz3);
+             dx21             = _fjsp_sub_v2r8(ix2,jx1);
+             dy21             = _fjsp_sub_v2r8(iy2,jy1);
+             dz21             = _fjsp_sub_v2r8(iz2,jz1);
+             dx22             = _fjsp_sub_v2r8(ix2,jx2);
+             dy22             = _fjsp_sub_v2r8(iy2,jy2);
+             dz22             = _fjsp_sub_v2r8(iz2,jz2);
+             dx23             = _fjsp_sub_v2r8(ix2,jx3);
+             dy23             = _fjsp_sub_v2r8(iy2,jy3);
+             dz23             = _fjsp_sub_v2r8(iz2,jz3);
+             dx31             = _fjsp_sub_v2r8(ix3,jx1);
+             dy31             = _fjsp_sub_v2r8(iy3,jy1);
+             dz31             = _fjsp_sub_v2r8(iz3,jz1);
+             dx32             = _fjsp_sub_v2r8(ix3,jx2);
+             dy32             = _fjsp_sub_v2r8(iy3,jy2);
+             dz32             = _fjsp_sub_v2r8(iz3,jz2);
+             dx33             = _fjsp_sub_v2r8(ix3,jx3);
+             dy33             = _fjsp_sub_v2r8(iy3,jy3);
+             dz33             = _fjsp_sub_v2r8(iz3,jz3);
+             /* Calculate squared distance and things based on it */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+             rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+             rsq13            = gmx_fjsp_calc_rsq_v2r8(dx13,dy13,dz13);
+             rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+             rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+             rsq23            = gmx_fjsp_calc_rsq_v2r8(dx23,dy23,dz23);
+             rsq31            = gmx_fjsp_calc_rsq_v2r8(dx31,dy31,dz31);
+             rsq32            = gmx_fjsp_calc_rsq_v2r8(dx32,dy32,dz32);
+             rsq33            = gmx_fjsp_calc_rsq_v2r8(dx33,dy33,dz33);
+             rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+             rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+             rinv13           = gmx_fjsp_invsqrt_v2r8(rsq13);
+             rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+             rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+             rinv23           = gmx_fjsp_invsqrt_v2r8(rsq23);
+             rinv31           = gmx_fjsp_invsqrt_v2r8(rsq31);
+             rinv32           = gmx_fjsp_invsqrt_v2r8(rsq32);
+             rinv33           = gmx_fjsp_invsqrt_v2r8(rsq33);
+             rinvsq00         = gmx_fjsp_inv_v2r8(rsq00);
+             rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+             rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+             rinvsq13         = _fjsp_mul_v2r8(rinv13,rinv13);
+             rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+             rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+             rinvsq23         = _fjsp_mul_v2r8(rinv23,rinv23);
+             rinvsq31         = _fjsp_mul_v2r8(rinv31,rinv31);
+             rinvsq32         = _fjsp_mul_v2r8(rinv32,rinv32);
+             rinvsq33         = _fjsp_mul_v2r8(rinv33,rinv33);
+             fjx0             = _fjsp_setzero_v2r8();
+             fjy0             = _fjsp_setzero_v2r8();
+             fjz0             = _fjsp_setzero_v2r8();
+             fjx1             = _fjsp_setzero_v2r8();
+             fjy1             = _fjsp_setzero_v2r8();
+             fjz1             = _fjsp_setzero_v2r8();
+             fjx2             = _fjsp_setzero_v2r8();
+             fjy2             = _fjsp_setzero_v2r8();
+             fjz2             = _fjsp_setzero_v2r8();
+             fjx3             = _fjsp_setzero_v2r8();
+             fjy3             = _fjsp_setzero_v2r8();
+             fjz3             = _fjsp_setzero_v2r8();
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* LENNARD-JONES DISPERSION/REPULSION */
+             rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+             vvdw6            = _fjsp_mul_v2r8(c6_00,rinvsix);
+             vvdw12           = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
+             vvdw             = _fjsp_msub_v2r8( vvdw12,one_twelfth, _fjsp_mul_v2r8(vvdw6,one_sixth) );
+             fvdw             = _fjsp_mul_v2r8(_fjsp_sub_v2r8(vvdw12,vvdw6),rinvsq00);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+             fscal            = fvdw;
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             
+             fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* COULOMB ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq11,rinv11);
+             felec            = _fjsp_mul_v2r8(velec,rinvsq11);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+             
+             fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* COULOMB ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq12,rinv12);
+             felec            = _fjsp_mul_v2r8(velec,rinvsq12);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+             
+             fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* COULOMB ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq13,rinv13);
+             felec            = _fjsp_mul_v2r8(velec,rinvsq13);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx13,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy13,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz13,fscal,fiz1);
+             
+             fjx3             = _fjsp_madd_v2r8(dx13,fscal,fjx3);
+             fjy3             = _fjsp_madd_v2r8(dy13,fscal,fjy3);
+             fjz3             = _fjsp_madd_v2r8(dz13,fscal,fjz3);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* COULOMB ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq21,rinv21);
+             felec            = _fjsp_mul_v2r8(velec,rinvsq21);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+             
+             fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* COULOMB ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq22,rinv22);
+             felec            = _fjsp_mul_v2r8(velec,rinvsq22);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+             
+             fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* COULOMB ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq23,rinv23);
+             felec            = _fjsp_mul_v2r8(velec,rinvsq23);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx23,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy23,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz23,fscal,fiz2);
+             
+             fjx3             = _fjsp_madd_v2r8(dx23,fscal,fjx3);
+             fjy3             = _fjsp_madd_v2r8(dy23,fscal,fjy3);
+             fjz3             = _fjsp_madd_v2r8(dz23,fscal,fjz3);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* COULOMB ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq31,rinv31);
+             felec            = _fjsp_mul_v2r8(velec,rinvsq31);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             /* Update vectorial force */
+             fix3             = _fjsp_madd_v2r8(dx31,fscal,fix3);
+             fiy3             = _fjsp_madd_v2r8(dy31,fscal,fiy3);
+             fiz3             = _fjsp_madd_v2r8(dz31,fscal,fiz3);
+             
+             fjx1             = _fjsp_madd_v2r8(dx31,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy31,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz31,fscal,fjz1);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* COULOMB ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq32,rinv32);
+             felec            = _fjsp_mul_v2r8(velec,rinvsq32);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             /* Update vectorial force */
+             fix3             = _fjsp_madd_v2r8(dx32,fscal,fix3);
+             fiy3             = _fjsp_madd_v2r8(dy32,fscal,fiy3);
+             fiz3             = _fjsp_madd_v2r8(dz32,fscal,fiz3);
+             
+             fjx2             = _fjsp_madd_v2r8(dx32,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy32,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz32,fscal,fjz2);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* COULOMB ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq33,rinv33);
+             felec            = _fjsp_mul_v2r8(velec,rinvsq33);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             /* Update vectorial force */
+             fix3             = _fjsp_madd_v2r8(dx33,fscal,fix3);
+             fiy3             = _fjsp_madd_v2r8(dy33,fscal,fiy3);
+             fiz3             = _fjsp_madd_v2r8(dz33,fscal,fiz3);
+             
+             fjx3             = _fjsp_madd_v2r8(dx33,fscal,fjx3);
+             fjy3             = _fjsp_madd_v2r8(dy33,fscal,fjy3);
+             fjz3             = _fjsp_madd_v2r8(dz33,fscal,fjz3);
+             gmx_fjsp_decrement_4rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+             /* Inner loop uses 317 flops */
+         }
+         if(jidx<j_index_end)
+         {
+             jnrA             = jjnr[jidx];
+             j_coord_offsetA  = DIM*jnrA;
+             /* load j atom coordinates */
+             gmx_fjsp_load_4rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
+                                               &jy2,&jz2,&jx3,&jy3,&jz3);
+             /* Calculate displacement vector */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             dx11             = _fjsp_sub_v2r8(ix1,jx1);
+             dy11             = _fjsp_sub_v2r8(iy1,jy1);
+             dz11             = _fjsp_sub_v2r8(iz1,jz1);
+             dx12             = _fjsp_sub_v2r8(ix1,jx2);
+             dy12             = _fjsp_sub_v2r8(iy1,jy2);
+             dz12             = _fjsp_sub_v2r8(iz1,jz2);
+             dx13             = _fjsp_sub_v2r8(ix1,jx3);
+             dy13             = _fjsp_sub_v2r8(iy1,jy3);
+             dz13             = _fjsp_sub_v2r8(iz1,jz3);
+             dx21             = _fjsp_sub_v2r8(ix2,jx1);
+             dy21             = _fjsp_sub_v2r8(iy2,jy1);
+             dz21             = _fjsp_sub_v2r8(iz2,jz1);
+             dx22             = _fjsp_sub_v2r8(ix2,jx2);
+             dy22             = _fjsp_sub_v2r8(iy2,jy2);
+             dz22             = _fjsp_sub_v2r8(iz2,jz2);
+             dx23             = _fjsp_sub_v2r8(ix2,jx3);
+             dy23             = _fjsp_sub_v2r8(iy2,jy3);
+             dz23             = _fjsp_sub_v2r8(iz2,jz3);
+             dx31             = _fjsp_sub_v2r8(ix3,jx1);
+             dy31             = _fjsp_sub_v2r8(iy3,jy1);
+             dz31             = _fjsp_sub_v2r8(iz3,jz1);
+             dx32             = _fjsp_sub_v2r8(ix3,jx2);
+             dy32             = _fjsp_sub_v2r8(iy3,jy2);
+             dz32             = _fjsp_sub_v2r8(iz3,jz2);
+             dx33             = _fjsp_sub_v2r8(ix3,jx3);
+             dy33             = _fjsp_sub_v2r8(iy3,jy3);
+             dz33             = _fjsp_sub_v2r8(iz3,jz3);
+             /* Calculate squared distance and things based on it */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+             rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+             rsq13            = gmx_fjsp_calc_rsq_v2r8(dx13,dy13,dz13);
+             rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+             rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+             rsq23            = gmx_fjsp_calc_rsq_v2r8(dx23,dy23,dz23);
+             rsq31            = gmx_fjsp_calc_rsq_v2r8(dx31,dy31,dz31);
+             rsq32            = gmx_fjsp_calc_rsq_v2r8(dx32,dy32,dz32);
+             rsq33            = gmx_fjsp_calc_rsq_v2r8(dx33,dy33,dz33);
+             rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+             rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+             rinv13           = gmx_fjsp_invsqrt_v2r8(rsq13);
+             rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+             rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+             rinv23           = gmx_fjsp_invsqrt_v2r8(rsq23);
+             rinv31           = gmx_fjsp_invsqrt_v2r8(rsq31);
+             rinv32           = gmx_fjsp_invsqrt_v2r8(rsq32);
+             rinv33           = gmx_fjsp_invsqrt_v2r8(rsq33);
+             rinvsq00         = gmx_fjsp_inv_v2r8(rsq00);
+             rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+             rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+             rinvsq13         = _fjsp_mul_v2r8(rinv13,rinv13);
+             rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+             rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+             rinvsq23         = _fjsp_mul_v2r8(rinv23,rinv23);
+             rinvsq31         = _fjsp_mul_v2r8(rinv31,rinv31);
+             rinvsq32         = _fjsp_mul_v2r8(rinv32,rinv32);
+             rinvsq33         = _fjsp_mul_v2r8(rinv33,rinv33);
+             fjx0             = _fjsp_setzero_v2r8();
+             fjy0             = _fjsp_setzero_v2r8();
+             fjz0             = _fjsp_setzero_v2r8();
+             fjx1             = _fjsp_setzero_v2r8();
+             fjy1             = _fjsp_setzero_v2r8();
+             fjz1             = _fjsp_setzero_v2r8();
+             fjx2             = _fjsp_setzero_v2r8();
+             fjy2             = _fjsp_setzero_v2r8();
+             fjz2             = _fjsp_setzero_v2r8();
+             fjx3             = _fjsp_setzero_v2r8();
+             fjy3             = _fjsp_setzero_v2r8();
+             fjz3             = _fjsp_setzero_v2r8();
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* LENNARD-JONES DISPERSION/REPULSION */
+             rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+             vvdw6            = _fjsp_mul_v2r8(c6_00,rinvsix);
+             vvdw12           = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
+             vvdw             = _fjsp_msub_v2r8( vvdw12,one_twelfth, _fjsp_mul_v2r8(vvdw6,one_sixth) );
+             fvdw             = _fjsp_mul_v2r8(_fjsp_sub_v2r8(vvdw12,vvdw6),rinvsq00);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             vvdw             = _fjsp_unpacklo_v2r8(vvdw,_fjsp_setzero_v2r8());
+             vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+             fscal            = fvdw;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             
+             fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* COULOMB ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq11,rinv11);
+             felec            = _fjsp_mul_v2r8(velec,rinvsq11);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+             
+             fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* COULOMB ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq12,rinv12);
+             felec            = _fjsp_mul_v2r8(velec,rinvsq12);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+             
+             fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* COULOMB ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq13,rinv13);
+             felec            = _fjsp_mul_v2r8(velec,rinvsq13);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx13,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy13,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz13,fscal,fiz1);
+             
+             fjx3             = _fjsp_madd_v2r8(dx13,fscal,fjx3);
+             fjy3             = _fjsp_madd_v2r8(dy13,fscal,fjy3);
+             fjz3             = _fjsp_madd_v2r8(dz13,fscal,fjz3);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* COULOMB ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq21,rinv21);
+             felec            = _fjsp_mul_v2r8(velec,rinvsq21);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+             
+             fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* COULOMB ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq22,rinv22);
+             felec            = _fjsp_mul_v2r8(velec,rinvsq22);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+             
+             fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* COULOMB ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq23,rinv23);
+             felec            = _fjsp_mul_v2r8(velec,rinvsq23);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx23,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy23,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz23,fscal,fiz2);
+             
+             fjx3             = _fjsp_madd_v2r8(dx23,fscal,fjx3);
+             fjy3             = _fjsp_madd_v2r8(dy23,fscal,fjy3);
+             fjz3             = _fjsp_madd_v2r8(dz23,fscal,fjz3);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* COULOMB ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq31,rinv31);
+             felec            = _fjsp_mul_v2r8(velec,rinvsq31);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix3             = _fjsp_madd_v2r8(dx31,fscal,fix3);
+             fiy3             = _fjsp_madd_v2r8(dy31,fscal,fiy3);
+             fiz3             = _fjsp_madd_v2r8(dz31,fscal,fiz3);
+             
+             fjx1             = _fjsp_madd_v2r8(dx31,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy31,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz31,fscal,fjz1);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* COULOMB ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq32,rinv32);
+             felec            = _fjsp_mul_v2r8(velec,rinvsq32);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix3             = _fjsp_madd_v2r8(dx32,fscal,fix3);
+             fiy3             = _fjsp_madd_v2r8(dy32,fscal,fiy3);
+             fiz3             = _fjsp_madd_v2r8(dz32,fscal,fiz3);
+             
+             fjx2             = _fjsp_madd_v2r8(dx32,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy32,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz32,fscal,fjz2);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* COULOMB ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq33,rinv33);
+             felec            = _fjsp_mul_v2r8(velec,rinvsq33);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix3             = _fjsp_madd_v2r8(dx33,fscal,fix3);
+             fiy3             = _fjsp_madd_v2r8(dy33,fscal,fiy3);
+             fiz3             = _fjsp_madd_v2r8(dz33,fscal,fiz3);
+             
+             fjx3             = _fjsp_madd_v2r8(dx33,fscal,fjx3);
+             fjy3             = _fjsp_madd_v2r8(dy33,fscal,fjy3);
+             fjz3             = _fjsp_madd_v2r8(dz33,fscal,fjz3);
+             gmx_fjsp_decrement_4rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+             /* Inner loop uses 317 flops */
+         }
+         /* End of innermost loop */
+         gmx_fjsp_update_iforce_4atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                               f+i_coord_offset,fshift+i_shift_offset);
+         ggid                        = gid[iidx];
+         /* Update potential energies */
+         gmx_fjsp_update_1pot_v2r8(velecsum,kernel_data->energygrp_elec+ggid);
+         gmx_fjsp_update_1pot_v2r8(vvdwsum,kernel_data->energygrp_vdw+ggid);
+         /* Increment number of inner iterations */
+         inneriter                  += j_index_end - j_index_start;
+         /* Outer loop uses 26 flops */
+     }
+     /* Increment number of outer iterations */
+     outeriter        += nri;
+     /* Update outer/inner flops */
+     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_VF,outeriter*26 + inneriter*317);
+ }
+ /*
+  * Gromacs nonbonded kernel:   nb_kernel_ElecCoul_VdwLJ_GeomW4W4_F_sparc64_hpc_ace_double
+  * Electrostatics interaction: Coulomb
+  * VdW interaction:            LennardJones
+  * Geometry:                   Water4-Water4
+  * Calculate force/pot:        Force
+  */
+ void
+ nb_kernel_ElecCoul_VdwLJ_GeomW4W4_F_sparc64_hpc_ace_double
+                     (t_nblist * gmx_restrict                nlist,
+                      rvec * gmx_restrict                    xx,
+                      rvec * gmx_restrict                    ff,
+                      t_forcerec * gmx_restrict              fr,
+                      t_mdatoms * gmx_restrict               mdatoms,
+                      nb_kernel_data_t * gmx_restrict        kernel_data,
+                      t_nrnb * gmx_restrict                  nrnb)
+ {
+     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+      * just 0 for non-waters.
+      * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+      * jnr indices corresponding to data put in the two positions in the SIMD register.
+      */
+     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+     int              jnrA,jnrB;
+     int              j_coord_offsetA,j_coord_offsetB;
+     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+     real             rcutoff_scalar;
+     real             *shiftvec,*fshift,*x,*f;
+     _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+     int              vdwioffset0;
+     _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+     int              vdwioffset1;
+     _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+     int              vdwioffset2;
+     _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+     int              vdwioffset3;
+     _fjsp_v2r8       ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+     int              vdwjidx0A,vdwjidx0B;
+     _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+     int              vdwjidx1A,vdwjidx1B;
+     _fjsp_v2r8       jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+     int              vdwjidx2A,vdwjidx2B;
+     _fjsp_v2r8       jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+     int              vdwjidx3A,vdwjidx3B;
+     _fjsp_v2r8       jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
+     _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+     _fjsp_v2r8       dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+     _fjsp_v2r8       dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+     _fjsp_v2r8       dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
+     _fjsp_v2r8       dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+     _fjsp_v2r8       dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+     _fjsp_v2r8       dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
+     _fjsp_v2r8       dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
+     _fjsp_v2r8       dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
+     _fjsp_v2r8       dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
+     _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+     real             *charge;
+     int              nvdwtype;
+     _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+     int              *vdwtype;
+     real             *vdwparam;
+     _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+     _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+     _fjsp_v2r8       itab_tmp;
+     _fjsp_v2r8       dummy_mask,cutoff_mask;
+     _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+     _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+     union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+     x                = xx[0];
+     f                = ff[0];
+     nri              = nlist->nri;
+     iinr             = nlist->iinr;
+     jindex           = nlist->jindex;
+     jjnr             = nlist->jjnr;
+     shiftidx         = nlist->shift;
+     gid              = nlist->gid;
+     shiftvec         = fr->shift_vec[0];
+     fshift           = fr->fshift[0];
+     facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+     charge           = mdatoms->chargeA;
+     nvdwtype         = fr->ntype;
+     vdwparam         = fr->nbfp;
+     vdwtype          = mdatoms->typeA;
+     /* Setup water-specific parameters */
+     inr              = nlist->iinr[0];
+     iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+     iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+     iq3              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+3]));
+     vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+     jq1              = gmx_fjsp_set1_v2r8(charge[inr+1]);
+     jq2              = gmx_fjsp_set1_v2r8(charge[inr+2]);
+     jq3              = gmx_fjsp_set1_v2r8(charge[inr+3]);
+     vdwjidx0A        = 2*vdwtype[inr+0];
+     c6_00            = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset0+vdwjidx0A]);
+     c12_00           = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset0+vdwjidx0A+1]);
+     qq11             = _fjsp_mul_v2r8(iq1,jq1);
+     qq12             = _fjsp_mul_v2r8(iq1,jq2);
+     qq13             = _fjsp_mul_v2r8(iq1,jq3);
+     qq21             = _fjsp_mul_v2r8(iq2,jq1);
+     qq22             = _fjsp_mul_v2r8(iq2,jq2);
+     qq23             = _fjsp_mul_v2r8(iq2,jq3);
+     qq31             = _fjsp_mul_v2r8(iq3,jq1);
+     qq32             = _fjsp_mul_v2r8(iq3,jq2);
+     qq33             = _fjsp_mul_v2r8(iq3,jq3);
+     /* Avoid stupid compiler warnings */
+     jnrA = jnrB = 0;
+     j_coord_offsetA = 0;
+     j_coord_offsetB = 0;
+     outeriter        = 0;
+     inneriter        = 0;
+     /* Start outer loop over neighborlists */
+     for(iidx=0; iidx<nri; iidx++)
+     {
+         /* Load shift vector for this list */
+         i_shift_offset   = DIM*shiftidx[iidx];
+         /* Load limits for loop over neighbors */
+         j_index_start    = jindex[iidx];
+         j_index_end      = jindex[iidx+1];
+         /* Get outer coordinate index */
+         inr              = iinr[iidx];
+         i_coord_offset   = DIM*inr;
+         /* Load i particle coords and add shift vector */
+         gmx_fjsp_load_shift_and_4rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                  &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+         fix0             = _fjsp_setzero_v2r8();
+         fiy0             = _fjsp_setzero_v2r8();
+         fiz0             = _fjsp_setzero_v2r8();
+         fix1             = _fjsp_setzero_v2r8();
+         fiy1             = _fjsp_setzero_v2r8();
+         fiz1             = _fjsp_setzero_v2r8();
+         fix2             = _fjsp_setzero_v2r8();
+         fiy2             = _fjsp_setzero_v2r8();
+         fiz2             = _fjsp_setzero_v2r8();
+         fix3             = _fjsp_setzero_v2r8();
+         fiy3             = _fjsp_setzero_v2r8();
+         fiz3             = _fjsp_setzero_v2r8();
+         /* Start inner kernel loop */
+         for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+         {
+             /* Get j neighbor index, and coordinate index */
+             jnrA             = jjnr[jidx];
+             jnrB             = jjnr[jidx+1];
+             j_coord_offsetA  = DIM*jnrA;
+             j_coord_offsetB  = DIM*jnrB;
+             /* load j atom coordinates */
+             gmx_fjsp_load_4rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
+                                               &jy2,&jz2,&jx3,&jy3,&jz3);
+             /* Calculate displacement vector */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             dx11             = _fjsp_sub_v2r8(ix1,jx1);
+             dy11             = _fjsp_sub_v2r8(iy1,jy1);
+             dz11             = _fjsp_sub_v2r8(iz1,jz1);
+             dx12             = _fjsp_sub_v2r8(ix1,jx2);
+             dy12             = _fjsp_sub_v2r8(iy1,jy2);
+             dz12             = _fjsp_sub_v2r8(iz1,jz2);
+             dx13             = _fjsp_sub_v2r8(ix1,jx3);
+             dy13             = _fjsp_sub_v2r8(iy1,jy3);
+             dz13             = _fjsp_sub_v2r8(iz1,jz3);
+             dx21             = _fjsp_sub_v2r8(ix2,jx1);
+             dy21             = _fjsp_sub_v2r8(iy2,jy1);
+             dz21             = _fjsp_sub_v2r8(iz2,jz1);
+             dx22             = _fjsp_sub_v2r8(ix2,jx2);
+             dy22             = _fjsp_sub_v2r8(iy2,jy2);
+             dz22             = _fjsp_sub_v2r8(iz2,jz2);
+             dx23             = _fjsp_sub_v2r8(ix2,jx3);
+             dy23             = _fjsp_sub_v2r8(iy2,jy3);
+             dz23             = _fjsp_sub_v2r8(iz2,jz3);
+             dx31             = _fjsp_sub_v2r8(ix3,jx1);
+             dy31             = _fjsp_sub_v2r8(iy3,jy1);
+             dz31             = _fjsp_sub_v2r8(iz3,jz1);
+             dx32             = _fjsp_sub_v2r8(ix3,jx2);
+             dy32             = _fjsp_sub_v2r8(iy3,jy2);
+             dz32             = _fjsp_sub_v2r8(iz3,jz2);
+             dx33             = _fjsp_sub_v2r8(ix3,jx3);
+             dy33             = _fjsp_sub_v2r8(iy3,jy3);
+             dz33             = _fjsp_sub_v2r8(iz3,jz3);
+             /* Calculate squared distance and things based on it */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+             rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+             rsq13            = gmx_fjsp_calc_rsq_v2r8(dx13,dy13,dz13);
+             rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+             rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+             rsq23            = gmx_fjsp_calc_rsq_v2r8(dx23,dy23,dz23);
+             rsq31            = gmx_fjsp_calc_rsq_v2r8(dx31,dy31,dz31);
+             rsq32            = gmx_fjsp_calc_rsq_v2r8(dx32,dy32,dz32);
+             rsq33            = gmx_fjsp_calc_rsq_v2r8(dx33,dy33,dz33);
+             rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+             rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+             rinv13           = gmx_fjsp_invsqrt_v2r8(rsq13);
+             rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+             rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+             rinv23           = gmx_fjsp_invsqrt_v2r8(rsq23);
+             rinv31           = gmx_fjsp_invsqrt_v2r8(rsq31);
+             rinv32           = gmx_fjsp_invsqrt_v2r8(rsq32);
+             rinv33           = gmx_fjsp_invsqrt_v2r8(rsq33);
+             rinvsq00         = gmx_fjsp_inv_v2r8(rsq00);
+             rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+             rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+             rinvsq13         = _fjsp_mul_v2r8(rinv13,rinv13);
+             rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+             rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+             rinvsq23         = _fjsp_mul_v2r8(rinv23,rinv23);
+             rinvsq31         = _fjsp_mul_v2r8(rinv31,rinv31);
+             rinvsq32         = _fjsp_mul_v2r8(rinv32,rinv32);
+             rinvsq33         = _fjsp_mul_v2r8(rinv33,rinv33);
+             fjx0             = _fjsp_setzero_v2r8();
+             fjy0             = _fjsp_setzero_v2r8();
+             fjz0             = _fjsp_setzero_v2r8();
+             fjx1             = _fjsp_setzero_v2r8();
+             fjy1             = _fjsp_setzero_v2r8();
+             fjz1             = _fjsp_setzero_v2r8();
+             fjx2             = _fjsp_setzero_v2r8();
+             fjy2             = _fjsp_setzero_v2r8();
+             fjz2             = _fjsp_setzero_v2r8();
+             fjx3             = _fjsp_setzero_v2r8();
+             fjy3             = _fjsp_setzero_v2r8();
+             fjz3             = _fjsp_setzero_v2r8();
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* LENNARD-JONES DISPERSION/REPULSION */
+             rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+             fvdw             = _fjsp_mul_v2r8(_fjsp_msub_v2r8(c12_00,rinvsix,c6_00),_fjsp_mul_v2r8(rinvsix,rinvsq00));
+             fscal            = fvdw;
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             
+             fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* COULOMB ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq11,rinv11);
+             felec            = _fjsp_mul_v2r8(velec,rinvsq11);
+             fscal            = felec;
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+             
+             fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* COULOMB ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq12,rinv12);
+             felec            = _fjsp_mul_v2r8(velec,rinvsq12);
+             fscal            = felec;
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+             
+             fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* COULOMB ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq13,rinv13);
+             felec            = _fjsp_mul_v2r8(velec,rinvsq13);
+             fscal            = felec;
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx13,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy13,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz13,fscal,fiz1);
+             
+             fjx3             = _fjsp_madd_v2r8(dx13,fscal,fjx3);
+             fjy3             = _fjsp_madd_v2r8(dy13,fscal,fjy3);
+             fjz3             = _fjsp_madd_v2r8(dz13,fscal,fjz3);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* COULOMB ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq21,rinv21);
+             felec            = _fjsp_mul_v2r8(velec,rinvsq21);
+             fscal            = felec;
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+             
+             fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* COULOMB ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq22,rinv22);
+             felec            = _fjsp_mul_v2r8(velec,rinvsq22);
+             fscal            = felec;
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+             
+             fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* COULOMB ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq23,rinv23);
+             felec            = _fjsp_mul_v2r8(velec,rinvsq23);
+             fscal            = felec;
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx23,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy23,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz23,fscal,fiz2);
+             
+             fjx3             = _fjsp_madd_v2r8(dx23,fscal,fjx3);
+             fjy3             = _fjsp_madd_v2r8(dy23,fscal,fjy3);
+             fjz3             = _fjsp_madd_v2r8(dz23,fscal,fjz3);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* COULOMB ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq31,rinv31);
+             felec            = _fjsp_mul_v2r8(velec,rinvsq31);
+             fscal            = felec;
+             /* Update vectorial force */
+             fix3             = _fjsp_madd_v2r8(dx31,fscal,fix3);
+             fiy3             = _fjsp_madd_v2r8(dy31,fscal,fiy3);
+             fiz3             = _fjsp_madd_v2r8(dz31,fscal,fiz3);
+             
+             fjx1             = _fjsp_madd_v2r8(dx31,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy31,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz31,fscal,fjz1);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* COULOMB ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq32,rinv32);
+             felec            = _fjsp_mul_v2r8(velec,rinvsq32);
+             fscal            = felec;
+             /* Update vectorial force */
+             fix3             = _fjsp_madd_v2r8(dx32,fscal,fix3);
+             fiy3             = _fjsp_madd_v2r8(dy32,fscal,fiy3);
+             fiz3             = _fjsp_madd_v2r8(dz32,fscal,fiz3);
+             
+             fjx2             = _fjsp_madd_v2r8(dx32,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy32,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz32,fscal,fjz2);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* COULOMB ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq33,rinv33);
+             felec            = _fjsp_mul_v2r8(velec,rinvsq33);
+             fscal            = felec;
+             /* Update vectorial force */
+             fix3             = _fjsp_madd_v2r8(dx33,fscal,fix3);
+             fiy3             = _fjsp_madd_v2r8(dy33,fscal,fiy3);
+             fiz3             = _fjsp_madd_v2r8(dz33,fscal,fiz3);
+             
+             fjx3             = _fjsp_madd_v2r8(dx33,fscal,fjx3);
+             fjy3             = _fjsp_madd_v2r8(dy33,fscal,fjy3);
+             fjz3             = _fjsp_madd_v2r8(dz33,fscal,fjz3);
+             gmx_fjsp_decrement_4rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+             /* Inner loop uses 303 flops */
+         }
+         if(jidx<j_index_end)
+         {
+             jnrA             = jjnr[jidx];
+             j_coord_offsetA  = DIM*jnrA;
+             /* load j atom coordinates */
+             gmx_fjsp_load_4rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
+                                               &jy2,&jz2,&jx3,&jy3,&jz3);
+             /* Calculate displacement vector */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             dx11             = _fjsp_sub_v2r8(ix1,jx1);
+             dy11             = _fjsp_sub_v2r8(iy1,jy1);
+             dz11             = _fjsp_sub_v2r8(iz1,jz1);
+             dx12             = _fjsp_sub_v2r8(ix1,jx2);
+             dy12             = _fjsp_sub_v2r8(iy1,jy2);
+             dz12             = _fjsp_sub_v2r8(iz1,jz2);
+             dx13             = _fjsp_sub_v2r8(ix1,jx3);
+             dy13             = _fjsp_sub_v2r8(iy1,jy3);
+             dz13             = _fjsp_sub_v2r8(iz1,jz3);
+             dx21             = _fjsp_sub_v2r8(ix2,jx1);
+             dy21             = _fjsp_sub_v2r8(iy2,jy1);
+             dz21             = _fjsp_sub_v2r8(iz2,jz1);
+             dx22             = _fjsp_sub_v2r8(ix2,jx2);
+             dy22             = _fjsp_sub_v2r8(iy2,jy2);
+             dz22             = _fjsp_sub_v2r8(iz2,jz2);
+             dx23             = _fjsp_sub_v2r8(ix2,jx3);
+             dy23             = _fjsp_sub_v2r8(iy2,jy3);
+             dz23             = _fjsp_sub_v2r8(iz2,jz3);
+             dx31             = _fjsp_sub_v2r8(ix3,jx1);
+             dy31             = _fjsp_sub_v2r8(iy3,jy1);
+             dz31             = _fjsp_sub_v2r8(iz3,jz1);
+             dx32             = _fjsp_sub_v2r8(ix3,jx2);
+             dy32             = _fjsp_sub_v2r8(iy3,jy2);
+             dz32             = _fjsp_sub_v2r8(iz3,jz2);
+             dx33             = _fjsp_sub_v2r8(ix3,jx3);
+             dy33             = _fjsp_sub_v2r8(iy3,jy3);
+             dz33             = _fjsp_sub_v2r8(iz3,jz3);
+             /* Calculate squared distance and things based on it */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+             rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+             rsq13            = gmx_fjsp_calc_rsq_v2r8(dx13,dy13,dz13);
+             rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+             rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+             rsq23            = gmx_fjsp_calc_rsq_v2r8(dx23,dy23,dz23);
+             rsq31            = gmx_fjsp_calc_rsq_v2r8(dx31,dy31,dz31);
+             rsq32            = gmx_fjsp_calc_rsq_v2r8(dx32,dy32,dz32);
+             rsq33            = gmx_fjsp_calc_rsq_v2r8(dx33,dy33,dz33);
+             rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+             rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+             rinv13           = gmx_fjsp_invsqrt_v2r8(rsq13);
+             rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+             rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+             rinv23           = gmx_fjsp_invsqrt_v2r8(rsq23);
+             rinv31           = gmx_fjsp_invsqrt_v2r8(rsq31);
+             rinv32           = gmx_fjsp_invsqrt_v2r8(rsq32);
+             rinv33           = gmx_fjsp_invsqrt_v2r8(rsq33);
+             rinvsq00         = gmx_fjsp_inv_v2r8(rsq00);
+             rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+             rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+             rinvsq13         = _fjsp_mul_v2r8(rinv13,rinv13);
+             rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+             rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+             rinvsq23         = _fjsp_mul_v2r8(rinv23,rinv23);
+             rinvsq31         = _fjsp_mul_v2r8(rinv31,rinv31);
+             rinvsq32         = _fjsp_mul_v2r8(rinv32,rinv32);
+             rinvsq33         = _fjsp_mul_v2r8(rinv33,rinv33);
+             fjx0             = _fjsp_setzero_v2r8();
+             fjy0             = _fjsp_setzero_v2r8();
+             fjz0             = _fjsp_setzero_v2r8();
+             fjx1             = _fjsp_setzero_v2r8();
+             fjy1             = _fjsp_setzero_v2r8();
+             fjz1             = _fjsp_setzero_v2r8();
+             fjx2             = _fjsp_setzero_v2r8();
+             fjy2             = _fjsp_setzero_v2r8();
+             fjz2             = _fjsp_setzero_v2r8();
+             fjx3             = _fjsp_setzero_v2r8();
+             fjy3             = _fjsp_setzero_v2r8();
+             fjz3             = _fjsp_setzero_v2r8();
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* LENNARD-JONES DISPERSION/REPULSION */
+             rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+             fvdw             = _fjsp_mul_v2r8(_fjsp_msub_v2r8(c12_00,rinvsix,c6_00),_fjsp_mul_v2r8(rinvsix,rinvsq00));
+             fscal            = fvdw;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             
+             fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* COULOMB ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq11,rinv11);
+             felec            = _fjsp_mul_v2r8(velec,rinvsq11);
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+             
+             fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* COULOMB ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq12,rinv12);
+             felec            = _fjsp_mul_v2r8(velec,rinvsq12);
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+             
+             fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* COULOMB ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq13,rinv13);
+             felec            = _fjsp_mul_v2r8(velec,rinvsq13);
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx13,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy13,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz13,fscal,fiz1);
+             
+             fjx3             = _fjsp_madd_v2r8(dx13,fscal,fjx3);
+             fjy3             = _fjsp_madd_v2r8(dy13,fscal,fjy3);
+             fjz3             = _fjsp_madd_v2r8(dz13,fscal,fjz3);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* COULOMB ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq21,rinv21);
+             felec            = _fjsp_mul_v2r8(velec,rinvsq21);
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+             
+             fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* COULOMB ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq22,rinv22);
+             felec            = _fjsp_mul_v2r8(velec,rinvsq22);
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+             
+             fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* COULOMB ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq23,rinv23);
+             felec            = _fjsp_mul_v2r8(velec,rinvsq23);
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx23,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy23,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz23,fscal,fiz2);
+             
+             fjx3             = _fjsp_madd_v2r8(dx23,fscal,fjx3);
+             fjy3             = _fjsp_madd_v2r8(dy23,fscal,fjy3);
+             fjz3             = _fjsp_madd_v2r8(dz23,fscal,fjz3);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* COULOMB ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq31,rinv31);
+             felec            = _fjsp_mul_v2r8(velec,rinvsq31);
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix3             = _fjsp_madd_v2r8(dx31,fscal,fix3);
+             fiy3             = _fjsp_madd_v2r8(dy31,fscal,fiy3);
+             fiz3             = _fjsp_madd_v2r8(dz31,fscal,fiz3);
+             
+             fjx1             = _fjsp_madd_v2r8(dx31,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy31,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz31,fscal,fjz1);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* COULOMB ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq32,rinv32);
+             felec            = _fjsp_mul_v2r8(velec,rinvsq32);
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix3             = _fjsp_madd_v2r8(dx32,fscal,fix3);
+             fiy3             = _fjsp_madd_v2r8(dy32,fscal,fiy3);
+             fiz3             = _fjsp_madd_v2r8(dz32,fscal,fiz3);
+             
+             fjx2             = _fjsp_madd_v2r8(dx32,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy32,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz32,fscal,fjz2);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* COULOMB ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq33,rinv33);
+             felec            = _fjsp_mul_v2r8(velec,rinvsq33);
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix3             = _fjsp_madd_v2r8(dx33,fscal,fix3);
+             fiy3             = _fjsp_madd_v2r8(dy33,fscal,fiy3);
+             fiz3             = _fjsp_madd_v2r8(dz33,fscal,fiz3);
+             
+             fjx3             = _fjsp_madd_v2r8(dx33,fscal,fjx3);
+             fjy3             = _fjsp_madd_v2r8(dy33,fscal,fjy3);
+             fjz3             = _fjsp_madd_v2r8(dz33,fscal,fjz3);
+             gmx_fjsp_decrement_4rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+             /* Inner loop uses 303 flops */
+         }
+         /* End of innermost loop */
+         gmx_fjsp_update_iforce_4atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                               f+i_coord_offset,fshift+i_shift_offset);
+         /* Increment number of inner iterations */
+         inneriter                  += j_index_end - j_index_start;
+         /* Outer loop uses 24 flops */
+     }
+     /* Increment number of outer iterations */
+     outeriter        += nri;
+     /* Update outer/inner flops */
+     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_F,outeriter*24 + inneriter*303);
+ }
index 0000000000000000000000000000000000000000,d3b90c8d07fa7deeaa4ef7050cd621c97e94fdfe..d3b90c8d07fa7deeaa4ef7050cd621c97e94fdfe
mode 000000,100644..100644
--- /dev/null
@@@ -1,0 -1,482 +1,482 @@@
+ /*
+  * This file is part of the GROMACS molecular simulation package.
+  *
+  * Copyright (c) 2012, by the GROMACS development team, led by
+  * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+  * others, as listed in the AUTHORS file in the top-level source
+  * directory and at http://www.gromacs.org.
+  *
+  * GROMACS is free software; you can redistribute it and/or
+  * modify it under the terms of the GNU Lesser General Public License
+  * as published by the Free Software Foundation; either version 2.1
+  * of the License, or (at your option) any later version.
+  *
+  * GROMACS is distributed in the hope that it will be useful,
+  * but WITHOUT ANY WARRANTY; without even the implied warranty of
+  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  * Lesser General Public License for more details.
+  *
+  * You should have received a copy of the GNU Lesser General Public
+  * License along with GROMACS; if not, see
+  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+  *
+  * If you want to redistribute modifications to GROMACS, please
+  * consider that scientific software is very special. Version
+  * control is crucial - bugs must be traceable. We will be happy to
+  * consider code for inclusion in the official distribution, but
+  * derived work must not be called official GROMACS. Details are found
+  * in the README & COPYING files - if they are missing, get the
+  * official version at http://www.gromacs.org.
+  *
+  * To help us fund GROMACS development, we humbly ask that you cite
+  * the research papers on the package. Check out http://www.gromacs.org.
+  */
+ /*
+  * Note: this file was generated by the GROMACS sparc64_hpc_ace_double kernel generator.
+  */
+ #ifdef HAVE_CONFIG_H
+ #include <config.h>
+ #endif
+ #include <math.h>
+ #include "../nb_kernel.h"
+ #include "types/simple.h"
+ #include "vec.h"
+ #include "nrnb.h"
+ #include "kernelutil_sparc64_hpc_ace_double.h"
+ /*
+  * Gromacs nonbonded kernel:   nb_kernel_ElecCoul_VdwNone_GeomP1P1_VF_sparc64_hpc_ace_double
+  * Electrostatics interaction: Coulomb
+  * VdW interaction:            None
+  * Geometry:                   Particle-Particle
+  * Calculate force/pot:        PotentialAndForce
+  */
+ void
+ nb_kernel_ElecCoul_VdwNone_GeomP1P1_VF_sparc64_hpc_ace_double
+                     (t_nblist * gmx_restrict                nlist,
+                      rvec * gmx_restrict                    xx,
+                      rvec * gmx_restrict                    ff,
+                      t_forcerec * gmx_restrict              fr,
+                      t_mdatoms * gmx_restrict               mdatoms,
+                      nb_kernel_data_t * gmx_restrict        kernel_data,
+                      t_nrnb * gmx_restrict                  nrnb)
+ {
+     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+      * just 0 for non-waters.
+      * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+      * jnr indices corresponding to data put in the two positions in the SIMD register.
+      */
+     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+     int              jnrA,jnrB;
+     int              j_coord_offsetA,j_coord_offsetB;
+     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+     real             rcutoff_scalar;
+     real             *shiftvec,*fshift,*x,*f;
+     _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+     int              vdwioffset0;
+     _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+     int              vdwjidx0A,vdwjidx0B;
+     _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+     _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+     _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+     real             *charge;
+     _fjsp_v2r8       itab_tmp;
+     _fjsp_v2r8       dummy_mask,cutoff_mask;
+     _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+     _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+     union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+     x                = xx[0];
+     f                = ff[0];
+     nri              = nlist->nri;
+     iinr             = nlist->iinr;
+     jindex           = nlist->jindex;
+     jjnr             = nlist->jjnr;
+     shiftidx         = nlist->shift;
+     gid              = nlist->gid;
+     shiftvec         = fr->shift_vec[0];
+     fshift           = fr->fshift[0];
+     facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+     charge           = mdatoms->chargeA;
+     /* Avoid stupid compiler warnings */
+     jnrA = jnrB = 0;
+     j_coord_offsetA = 0;
+     j_coord_offsetB = 0;
+     outeriter        = 0;
+     inneriter        = 0;
+     /* Start outer loop over neighborlists */
+     for(iidx=0; iidx<nri; iidx++)
+     {
+         /* Load shift vector for this list */
+         i_shift_offset   = DIM*shiftidx[iidx];
+         /* Load limits for loop over neighbors */
+         j_index_start    = jindex[iidx];
+         j_index_end      = jindex[iidx+1];
+         /* Get outer coordinate index */
+         inr              = iinr[iidx];
+         i_coord_offset   = DIM*inr;
+         /* Load i particle coords and add shift vector */
+         gmx_fjsp_load_shift_and_1rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
+         fix0             = _fjsp_setzero_v2r8();
+         fiy0             = _fjsp_setzero_v2r8();
+         fiz0             = _fjsp_setzero_v2r8();
+         /* Load parameters for i particles */
+         iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_load1_v2r8(charge+inr+0));
+         /* Reset potential sums */
+         velecsum         = _fjsp_setzero_v2r8();
+         /* Start inner kernel loop */
+         for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+         {
+             /* Get j neighbor index, and coordinate index */
+             jnrA             = jjnr[jidx];
+             jnrB             = jjnr[jidx+1];
+             j_coord_offsetA  = DIM*jnrA;
+             j_coord_offsetB  = DIM*jnrB;
+             /* load j atom coordinates */
+             gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                               &jx0,&jy0,&jz0);
+             /* Calculate displacement vector */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             /* Calculate squared distance and things based on it */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+             rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+             /* Load parameters for j particles */
+             jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* Compute parameters for interactions between i and j atoms */
+             qq00             = _fjsp_mul_v2r8(iq0,jq0);
+             /* COULOMB ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq00,rinv00);
+             felec            = _fjsp_mul_v2r8(velec,rinvsq00);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             
+             gmx_fjsp_decrement_fma_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fscal,dx00,dy00,dz00);
+             /* Inner loop uses 31 flops */
+         }
+         if(jidx<j_index_end)
+         {
+             jnrA             = jjnr[jidx];
+             j_coord_offsetA  = DIM*jnrA;
+             /* load j atom coordinates */
+             gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                               &jx0,&jy0,&jz0);
+             /* Calculate displacement vector */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             /* Calculate squared distance and things based on it */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+             rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+             /* Load parameters for j particles */
+             jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* Compute parameters for interactions between i and j atoms */
+             qq00             = _fjsp_mul_v2r8(iq0,jq0);
+             /* COULOMB ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq00,rinv00);
+             felec            = _fjsp_mul_v2r8(velec,rinvsq00);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             
+             gmx_fjsp_decrement_fma_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fscal,dx00,dy00,dz00);
+             /* Inner loop uses 31 flops */
+         }
+         /* End of innermost loop */
+         gmx_fjsp_update_iforce_1atom_swizzle_v2r8(fix0,fiy0,fiz0,
+                                               f+i_coord_offset,fshift+i_shift_offset);
+         ggid                        = gid[iidx];
+         /* Update potential energies */
+         gmx_fjsp_update_1pot_v2r8(velecsum,kernel_data->energygrp_elec+ggid);
+         /* Increment number of inner iterations */
+         inneriter                  += j_index_end - j_index_start;
+         /* Outer loop uses 8 flops */
+     }
+     /* Increment number of outer iterations */
+     outeriter        += nri;
+     /* Update outer/inner flops */
+     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VF,outeriter*8 + inneriter*31);
+ }
+ /*
+  * Gromacs nonbonded kernel:   nb_kernel_ElecCoul_VdwNone_GeomP1P1_F_sparc64_hpc_ace_double
+  * Electrostatics interaction: Coulomb
+  * VdW interaction:            None
+  * Geometry:                   Particle-Particle
+  * Calculate force/pot:        Force
+  *
+  * For each of the nlist->nri entries the kernel walks the j neighbors in
+  * jjnr[jindex[i]] .. jjnr[jindex[i+1]-1], two per inner-loop iteration with
+  * a single-neighbor tail, and evaluates velec = qq*rinv and
+  * felec = velec*rinv^2 for plain Coulomb. Only the force scalar is used;
+  * no potential energy is accumulated or stored by this variant.
+  *
+  * nlist        neighborlist; nri, iinr, jindex, jjnr and shift are read
+  * xx           coordinates, accessed as a flat real array through xx[0]
+  * ff           forces, handed as a flat real array (ff[0]) to the force
+  *              decrement/update helpers
+  * fr           supplies shift_vec, fshift and epsfac
+  * mdatoms      supplies chargeA
+  * kernel_data  not referenced by this force-only kernel
+  * nrnb         flop counters, updated once at the end through inc_nrnb()
+  */
+ void
+ nb_kernel_ElecCoul_VdwNone_GeomP1P1_F_sparc64_hpc_ace_double
+                     (t_nblist * gmx_restrict                nlist,
+                      rvec * gmx_restrict                    xx,
+                      rvec * gmx_restrict                    ff,
+                      t_forcerec * gmx_restrict              fr,
+                      t_mdatoms * gmx_restrict               mdatoms,
+                      nb_kernel_data_t * gmx_restrict        kernel_data,
+                      t_nrnb * gmx_restrict                  nrnb)
+ {
+     /* Suffix 0 refers to the single particle handled on the i and on the j side.
+      * Suffixes A,B refer to j loop unrolling done with double precision SIMD, i.e. the two
+      * different jnr indices that are processed together in one inner-loop iteration.
+      *
+      * Only the locals this kernel actually uses are declared; the generator's unused
+      * boilerplate variables were dropped to avoid unused-variable warnings.
+      */
+     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+     int              j_index_start,j_index_end,jidx,nri,inr,iidx;
+     int              jnrA,jnrB;
+     int              j_coord_offsetA,j_coord_offsetB;
+     int              *iinr,*jindex,*jjnr,*shiftidx;
+     real             *shiftvec,*fshift,*x,*f;
+     _fjsp_v2r8       fscal;
+     _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0;
+     _fjsp_v2r8       jx0,jy0,jz0,jq0;
+     _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,qq00;
+     _fjsp_v2r8       velec,felec,facel;
+     real             *charge;
+     /* Flat views of the coordinate and force arrays */
+     x                = xx[0];
+     f                = ff[0];
+     /* Neighborlist data */
+     nri              = nlist->nri;
+     iinr             = nlist->iinr;
+     jindex           = nlist->jindex;
+     jjnr             = nlist->jjnr;
+     shiftidx         = nlist->shift;
+     /* Shift vectors, shift forces, electrostatics prefactor and charges */
+     shiftvec         = fr->shift_vec[0];
+     fshift           = fr->fshift[0];
+     facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+     charge           = mdatoms->chargeA;
+     /* Give the j indices defined values so compilers do not warn about
+      * possibly uninitialized use.
+      */
+     jnrA = jnrB = 0;
+     j_coord_offsetA = 0;
+     j_coord_offsetB = 0;
+     outeriter        = 0;
+     inneriter        = 0;
+     /* Start outer loop over neighborlists */
+     for(iidx=0; iidx<nri; iidx++)
+     {
+         /* Load shift vector for this list */
+         i_shift_offset   = DIM*shiftidx[iidx];
+         /* Load limits for loop over neighbors */
+         j_index_start    = jindex[iidx];
+         j_index_end      = jindex[iidx+1];
+         /* Get outer coordinate index */
+         inr              = iinr[iidx];
+         i_coord_offset   = DIM*inr;
+         /* Load i particle coords and add shift vector */
+         gmx_fjsp_load_shift_and_1rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
+         /* Clear the i force accumulators for this list entry */
+         fix0             = _fjsp_setzero_v2r8();
+         fiy0             = _fjsp_setzero_v2r8();
+         fiz0             = _fjsp_setzero_v2r8();
+         /* Load parameters for i particles: charge scaled by the prefactor facel */
+         iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_load1_v2r8(charge+inr+0));
+         /* Start inner kernel loop: two j particles (A,B) per iteration */
+         for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+         {
+             /* Get j neighbor index, and coordinate index */
+             jnrA             = jjnr[jidx];
+             jnrB             = jjnr[jidx+1];
+             j_coord_offsetA  = DIM*jnrA;
+             j_coord_offsetB  = DIM*jnrB;
+             /* load j atom coordinates */
+             gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                               &jx0,&jy0,&jz0);
+             /* Calculate displacement vector */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             /* Calculate squared distance and things based on it */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+             rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+             /* Load parameters for j particles */
+             jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* Compute parameters for interactions between i and j atoms */
+             qq00             = _fjsp_mul_v2r8(iq0,jq0);
+             /* COULOMB ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq00,rinv00);
+             felec            = _fjsp_mul_v2r8(velec,rinvsq00);
+             fscal            = felec;
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             
+             gmx_fjsp_decrement_fma_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fscal,dx00,dy00,dz00);
+             /* Inner loop uses 30 flops */
+         }
+         /* Tail: one j particle is left when the neighbor count is odd */
+         if(jidx<j_index_end)
+         {
+             jnrA             = jjnr[jidx];
+             j_coord_offsetA  = DIM*jnrA;
+             /* load j atom coordinates */
+             gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                               &jx0,&jy0,&jz0);
+             /* Calculate displacement vector */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             /* Calculate squared distance and things based on it */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+             rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+             /* Load parameters for j particles: a single charge on top of a zeroed register */
+             jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* Compute parameters for interactions between i and j atoms */
+             qq00             = _fjsp_mul_v2r8(iq0,jq0);
+             /* COULOMB ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq00,rinv00);
+             felec            = _fjsp_mul_v2r8(velec,rinvsq00);
+             fscal            = felec;
+             /* Combine fscal with a zero register; presumably this clears the lane that
+              * holds no j particle so it adds no force - TODO confirm the lane
+              * semantics of _fjsp_unpacklo_v2r8.
+              */
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             
+             gmx_fjsp_decrement_fma_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fscal,dx00,dy00,dz00);
+             /* Inner loop uses 30 flops */
+         }
+         /* End of innermost loop */
+         gmx_fjsp_update_iforce_1atom_swizzle_v2r8(fix0,fiy0,fiz0,
+                                               f+i_coord_offset,fshift+i_shift_offset);
+         /* Increment number of inner iterations */
+         inneriter                  += j_index_end - j_index_start;
+         /* Outer loop uses 7 flops */
+     }
+     /* Increment number of outer iterations */
+     outeriter        += nri;
+     /* Update outer/inner flops */
+     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_F,outeriter*7 + inneriter*30);
+ }
index 0000000000000000000000000000000000000000,dbcbeab13e39e21a3a9fed557d9a1f1a588e84e2..dbcbeab13e39e21a3a9fed557d9a1f1a588e84e2
mode 000000,100644..100644
--- /dev/null
@@@ -1,0 -1,792 +1,792 @@@
+ /*
+  * This file is part of the GROMACS molecular simulation package.
+  *
+  * Copyright (c) 2012, by the GROMACS development team, led by
+  * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+  * others, as listed in the AUTHORS file in the top-level source
+  * directory and at http://www.gromacs.org.
+  *
+  * GROMACS is free software; you can redistribute it and/or
+  * modify it under the terms of the GNU Lesser General Public License
+  * as published by the Free Software Foundation; either version 2.1
+  * of the License, or (at your option) any later version.
+  *
+  * GROMACS is distributed in the hope that it will be useful,
+  * but WITHOUT ANY WARRANTY; without even the implied warranty of
+  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  * Lesser General Public License for more details.
+  *
+  * You should have received a copy of the GNU Lesser General Public
+  * License along with GROMACS; if not, see
+  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+  *
+  * If you want to redistribute modifications to GROMACS, please
+  * consider that scientific software is very special. Version
+  * control is crucial - bugs must be traceable. We will be happy to
+  * consider code for inclusion in the official distribution, but
+  * derived work must not be called official GROMACS. Details are found
+  * in the README & COPYING files - if they are missing, get the
+  * official version at http://www.gromacs.org.
+  *
+  * To help us fund GROMACS development, we humbly ask that you cite
+  * the research papers on the package. Check out http://www.gromacs.org.
+  */
+ /*
+  * Note: this file was generated by the GROMACS sparc64_hpc_ace_double kernel generator.
+  */
+ #ifdef HAVE_CONFIG_H
+ #include <config.h>
+ #endif
+ #include <math.h>
+ #include "../nb_kernel.h"
+ #include "types/simple.h"
+ #include "vec.h"
+ #include "nrnb.h"
+ #include "kernelutil_sparc64_hpc_ace_double.h"
+ /*
+  * Gromacs nonbonded kernel:   nb_kernel_ElecCoul_VdwNone_GeomW3P1_VF_sparc64_hpc_ace_double
+  * Electrostatics interaction: Coulomb
+  * VdW interaction:            None
+  * Geometry:                   Water3-Particle
+  * Calculate force/pot:        PotentialAndForce
+  *
+  * Each neighborlist entry refers to three consecutive i particles
+  * (suffixes 0,1,2) that interact with single j particles. The j neighbors
+  * are processed two per inner-loop iteration with a single-neighbor tail.
+  * For every i-j pair velec = qq*rinv and felec = velec*rinv^2 are
+  * evaluated; velec is summed per list entry and handed to
+  * gmx_fjsp_update_1pot_v2r8() together with
+  * kernel_data->energygrp_elec + gid[i].
+  *
+  * The three i charges are read once, from the particles starting at
+  * nlist->iinr[0], and reused for every list entry.
+  *
+  * nlist        neighborlist; nri, iinr, jindex, jjnr, shift and gid are read
+  * xx           coordinates, accessed as a flat real array through xx[0]
+  * ff           forces, handed as a flat real array (ff[0]) to the force
+  *              decrement/update helpers
+  * fr           supplies shift_vec, fshift and epsfac
+  * mdatoms      supplies chargeA
+  * kernel_data  supplies energygrp_elec
+  * nrnb         flop counters, updated once at the end through inc_nrnb()
+  */
+ void
+ nb_kernel_ElecCoul_VdwNone_GeomW3P1_VF_sparc64_hpc_ace_double
+                     (t_nblist * gmx_restrict                nlist,
+                      rvec * gmx_restrict                    xx,
+                      rvec * gmx_restrict                    ff,
+                      t_forcerec * gmx_restrict              fr,
+                      t_mdatoms * gmx_restrict               mdatoms,
+                      nb_kernel_data_t * gmx_restrict        kernel_data,
+                      t_nrnb * gmx_restrict                  nrnb)
+ {
+     /* Suffixes 0,1,2 refer to the three i particles of the water in the outer loop;
+      * suffix 0 on j quantities refers to the single j particle.
+      * Suffixes A,B refer to j loop unrolling done with double precision SIMD, i.e. the two
+      * different jnr indices that are processed together in one inner-loop iteration.
+      *
+      * Only the locals this kernel actually uses are declared; the generator's unused
+      * boilerplate variables were dropped to avoid unused-variable warnings.
+      */
+     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+     int              jnrA,jnrB;
+     int              j_coord_offsetA,j_coord_offsetB;
+     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+     real             *shiftvec,*fshift,*x,*f;
+     _fjsp_v2r8       fscal;
+     _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0;
+     _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1;
+     _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2;
+     _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0;
+     _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,qq00;
+     _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,qq10;
+     _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,qq20;
+     _fjsp_v2r8       velec,felec,velecsum,facel;
+     real             *charge;
+     /* Flat views of the coordinate and force arrays */
+     x                = xx[0];
+     f                = ff[0];
+     /* Neighborlist data */
+     nri              = nlist->nri;
+     iinr             = nlist->iinr;
+     jindex           = nlist->jindex;
+     jjnr             = nlist->jjnr;
+     shiftidx         = nlist->shift;
+     gid              = nlist->gid;
+     /* Shift vectors, shift forces, electrostatics prefactor and charges */
+     shiftvec         = fr->shift_vec[0];
+     fshift           = fr->fshift[0];
+     facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+     charge           = mdatoms->chargeA;
+     /* Setup water-specific parameters: the i charges come from the first list
+      * entry only and are reused for all entries.
+      * NOTE(review): iinr[0] is read before the nri loop - presumably callers
+      * only invoke the kernel with nri > 0; confirm against the dispatcher.
+      */
+     inr              = nlist->iinr[0];
+     iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+0]));
+     iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+     iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+     /* Give the j indices defined values so compilers do not warn about
+      * possibly uninitialized use.
+      */
+     jnrA = jnrB = 0;
+     j_coord_offsetA = 0;
+     j_coord_offsetB = 0;
+     outeriter        = 0;
+     inneriter        = 0;
+     /* Start outer loop over neighborlists */
+     for(iidx=0; iidx<nri; iidx++)
+     {
+         /* Load shift vector for this list */
+         i_shift_offset   = DIM*shiftidx[iidx];
+         /* Load limits for loop over neighbors */
+         j_index_start    = jindex[iidx];
+         j_index_end      = jindex[iidx+1];
+         /* Get outer coordinate index */
+         inr              = iinr[iidx];
+         i_coord_offset   = DIM*inr;
+         /* Load i particle coords and add shift vector */
+         gmx_fjsp_load_shift_and_3rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                  &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+         /* Clear the force accumulators of the three i particles */
+         fix0             = _fjsp_setzero_v2r8();
+         fiy0             = _fjsp_setzero_v2r8();
+         fiz0             = _fjsp_setzero_v2r8();
+         fix1             = _fjsp_setzero_v2r8();
+         fiy1             = _fjsp_setzero_v2r8();
+         fiz1             = _fjsp_setzero_v2r8();
+         fix2             = _fjsp_setzero_v2r8();
+         fiy2             = _fjsp_setzero_v2r8();
+         fiz2             = _fjsp_setzero_v2r8();
+         /* Reset potential sums */
+         velecsum         = _fjsp_setzero_v2r8();
+         /* Start inner kernel loop: two j particles (A,B) per iteration */
+         for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+         {
+             /* Get j neighbor index, and coordinate index */
+             jnrA             = jjnr[jidx];
+             jnrB             = jjnr[jidx+1];
+             j_coord_offsetA  = DIM*jnrA;
+             j_coord_offsetB  = DIM*jnrB;
+             /* load j atom coordinates */
+             gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                               &jx0,&jy0,&jz0);
+             /* Calculate displacement vector */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             dx10             = _fjsp_sub_v2r8(ix1,jx0);
+             dy10             = _fjsp_sub_v2r8(iy1,jy0);
+             dz10             = _fjsp_sub_v2r8(iz1,jz0);
+             dx20             = _fjsp_sub_v2r8(ix2,jx0);
+             dy20             = _fjsp_sub_v2r8(iy2,jy0);
+             dz20             = _fjsp_sub_v2r8(iz2,jz0);
+             /* Calculate squared distance and things based on it */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+             rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+             rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+             rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+             rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+             rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+             rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+             rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+             /* Load parameters for j particles */
+             jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+             /* The j force is accumulated over the three i particles and applied once below */
+             fjx0             = _fjsp_setzero_v2r8();
+             fjy0             = _fjsp_setzero_v2r8();
+             fjz0             = _fjsp_setzero_v2r8();
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* Compute parameters for interactions between i and j atoms */
+             qq00             = _fjsp_mul_v2r8(iq0,jq0);
+             /* COULOMB ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq00,rinv00);
+             felec            = _fjsp_mul_v2r8(velec,rinvsq00);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             
+             fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* Compute parameters for interactions between i and j atoms */
+             qq10             = _fjsp_mul_v2r8(iq1,jq0);
+             /* COULOMB ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq10,rinv10);
+             felec            = _fjsp_mul_v2r8(velec,rinvsq10);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+             
+             fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* Compute parameters for interactions between i and j atoms */
+             qq20             = _fjsp_mul_v2r8(iq2,jq0);
+             /* COULOMB ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq20,rinv20);
+             felec            = _fjsp_mul_v2r8(velec,rinvsq20);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+             
+             fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+             gmx_fjsp_decrement_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0);
+             /* Inner loop uses 96 flops */
+         }
+         /* Tail: one j particle is left when the neighbor count is odd. velec and fscal
+          * are each combined with a zero register before use; presumably this clears the
+          * lane that holds no j particle - TODO confirm the lane semantics of
+          * _fjsp_unpacklo_v2r8.
+          */
+         if(jidx<j_index_end)
+         {
+             jnrA             = jjnr[jidx];
+             j_coord_offsetA  = DIM*jnrA;
+             /* load j atom coordinates */
+             gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                               &jx0,&jy0,&jz0);
+             /* Calculate displacement vector */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             dx10             = _fjsp_sub_v2r8(ix1,jx0);
+             dy10             = _fjsp_sub_v2r8(iy1,jy0);
+             dz10             = _fjsp_sub_v2r8(iz1,jz0);
+             dx20             = _fjsp_sub_v2r8(ix2,jx0);
+             dy20             = _fjsp_sub_v2r8(iy2,jy0);
+             dz20             = _fjsp_sub_v2r8(iz2,jz0);
+             /* Calculate squared distance and things based on it */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+             rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+             rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+             rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+             rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+             rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+             rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+             rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+             /* Load parameters for j particles: a single charge on top of a zeroed register */
+             jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
+             fjx0             = _fjsp_setzero_v2r8();
+             fjy0             = _fjsp_setzero_v2r8();
+             fjz0             = _fjsp_setzero_v2r8();
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* Compute parameters for interactions between i and j atoms */
+             qq00             = _fjsp_mul_v2r8(iq0,jq0);
+             /* COULOMB ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq00,rinv00);
+             felec            = _fjsp_mul_v2r8(velec,rinvsq00);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             
+             fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* Compute parameters for interactions between i and j atoms */
+             qq10             = _fjsp_mul_v2r8(iq1,jq0);
+             /* COULOMB ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq10,rinv10);
+             felec            = _fjsp_mul_v2r8(velec,rinvsq10);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+             
+             fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* Compute parameters for interactions between i and j atoms */
+             qq20             = _fjsp_mul_v2r8(iq2,jq0);
+             /* COULOMB ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq20,rinv20);
+             felec            = _fjsp_mul_v2r8(velec,rinvsq20);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+             
+             fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+             gmx_fjsp_decrement_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0);
+             /* Inner loop uses 96 flops */
+         }
+         /* End of innermost loop */
+         gmx_fjsp_update_iforce_3atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                               f+i_coord_offset,fshift+i_shift_offset);
+         ggid                        = gid[iidx];
+         /* Update potential energies */
+         gmx_fjsp_update_1pot_v2r8(velecsum,kernel_data->energygrp_elec+ggid);
+         /* Increment number of inner iterations */
+         inneriter                  += j_index_end - j_index_start;
+         /* Outer loop uses 19 flops */
+     }
+     /* Increment number of outer iterations */
+     outeriter        += nri;
+     /* Update outer/inner flops */
+     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3_VF,outeriter*19 + inneriter*96);
+ }
+ /*
+  * Gromacs nonbonded kernel:   nb_kernel_ElecCoul_VdwNone_GeomW3P1_F_sparc64_hpc_ace_double
+  * Electrostatics interaction: Coulomb (charges from mdatoms->chargeA, prefactor fr->epsfac)
+  * VdW interaction:            None
+  * Geometry:                   Water3-Particle (three i sites per nlist entry, one site per j entry)
+  * Calculate force/pot:        Force (kernel_data is not referenced here; flops are booked in nrnb)
+  */
+ void
+ nb_kernel_ElecCoul_VdwNone_GeomW3P1_F_sparc64_hpc_ace_double
+                     (t_nblist * gmx_restrict                nlist,
+                      rvec * gmx_restrict                    xx,
+                      rvec * gmx_restrict                    ff,
+                      t_forcerec * gmx_restrict              fr,
+                      t_mdatoms * gmx_restrict               mdatoms,
+                      nb_kernel_data_t * gmx_restrict        kernel_data,
+                      t_nrnb * gmx_restrict                  nrnb)
+ {
+     /* Suffixes 0,1,2 on i quantities name the three sites of the i water; the single j
+      * particle always carries suffix 0, so e.g. dx20 is i site 2 minus j particle 0.
+      * Suffixes A,B refer to j loop unrolling done with double precision SIMD, i.e. the two different
+      * jnr indices whose data are put in the two positions of the SIMD register.
+      */
+     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+     int              jnrA,jnrB;
+     int              j_coord_offsetA,j_coord_offsetB;
+     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+     real             rcutoff_scalar;
+     real             *shiftvec,*fshift,*x,*f;
+     _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+     int              vdwioffset0;
+     _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+     int              vdwioffset1;
+     _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+     int              vdwioffset2;
+     _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+     int              vdwjidx0A,vdwjidx0B;
+     _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+     _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+     _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+     _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+     _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+     real             *charge;
+     _fjsp_v2r8       itab_tmp;
+     _fjsp_v2r8       dummy_mask,cutoff_mask;
+     _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+     _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+     union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+     x                = xx[0];
+     f                = ff[0];
+     nri              = nlist->nri;
+     iinr             = nlist->iinr;
+     jindex           = nlist->jindex;
+     jjnr             = nlist->jjnr;
+     shiftidx         = nlist->shift;
+     gid              = nlist->gid;
+     shiftvec         = fr->shift_vec[0];
+     fshift           = fr->fshift[0];
+     facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+     charge           = mdatoms->chargeA;
+     /* Setup water-specific parameters: the three site charges are read once, from the first i entry, and pre-scaled by facel (presumably all i waters share these charges - confirm with the list builder) */
+     inr              = nlist->iinr[0];
+     iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+0]));
+     iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+     iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+     /* Avoid compiler warnings; these are all assigned again before they are read in the loops below */
+     jnrA = jnrB = 0;
+     j_coord_offsetA = 0;
+     j_coord_offsetB = 0;
+     outeriter        = 0;
+     inneriter        = 0;
+     /* Start outer loop over neighborlists: one iteration per i water */
+     for(iidx=0; iidx<nri; iidx++)
+     {
+         /* Load shift vector offset for this list (DIM reals per shift index) */
+         i_shift_offset   = DIM*shiftidx[iidx];
+         /* Load limits for loop over neighbors: j entries are jjnr[j_index_start .. j_index_end-1] */
+         j_index_start    = jindex[iidx];
+         j_index_end      = jindex[iidx+1];
+         /* Get outer coordinate index (inr is the first of the three consecutive i sites) */
+         inr              = iinr[iidx];
+         i_coord_offset   = DIM*inr;
+         /* Load i particle coords and add shift vector; the helper fills the nine ix/iy/iz registers */
+         gmx_fjsp_load_shift_and_3rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                  &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+         fix0             = _fjsp_setzero_v2r8();
+         fiy0             = _fjsp_setzero_v2r8();
+         fiz0             = _fjsp_setzero_v2r8();
+         fix1             = _fjsp_setzero_v2r8();
+         fiy1             = _fjsp_setzero_v2r8();
+         fiz1             = _fjsp_setzero_v2r8();
+         fix2             = _fjsp_setzero_v2r8();
+         fiy2             = _fjsp_setzero_v2r8();
+         fiz2             = _fjsp_setzero_v2r8();
+         /* Start inner kernel loop: consumes j entries two at a time (A,B); an odd leftover entry is handled after the loop */
+         for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+         {
+             /* Get j neighbor index, and coordinate index */
+             jnrA             = jjnr[jidx];
+             jnrB             = jjnr[jidx+1];
+             j_coord_offsetA  = DIM*jnrA;
+             j_coord_offsetB  = DIM*jnrB;
+             /* load j atom coordinates for both j entries */
+             gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                               &jx0,&jy0,&jz0);
+             /* Calculate displacement vector (i site minus j particle) for each of the three i sites */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             dx10             = _fjsp_sub_v2r8(ix1,jx0);
+             dy10             = _fjsp_sub_v2r8(iy1,jy0);
+             dz10             = _fjsp_sub_v2r8(iz1,jz0);
+             dx20             = _fjsp_sub_v2r8(ix2,jx0);
+             dy20             = _fjsp_sub_v2r8(iy2,jy0);
+             dz20             = _fjsp_sub_v2r8(iz2,jz0);
+             /* Calculate squared distance and things based on it: rinv via the invsqrt helper, rinvsq = rinv*rinv */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+             rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+             rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+             rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+             rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+             rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+             rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+             rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+             /* Load parameters for j particles: the charges of jnrA and jnrB */
+             jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+             fjx0             = _fjsp_setzero_v2r8();
+             fjy0             = _fjsp_setzero_v2r8();
+             fjz0             = _fjsp_setzero_v2r8();
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* Compute parameters for interactions between i and j atoms (iq0 already contains facel) */
+             qq00             = _fjsp_mul_v2r8(iq0,jq0);
+             /* COULOMB ELECTROSTATICS: velec = qq*rinv, felec = velec*rinvsq */
+             velec            = _fjsp_mul_v2r8(qq00,rinv00);
+             felec            = _fjsp_mul_v2r8(velec,rinvsq00);
+             fscal            = felec;
+             /* Update vectorial force: the same d*fscal term is accumulated for the i site and for the j particle */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             
+             fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* Compute parameters for interactions between i site 1 and the j atoms */
+             qq10             = _fjsp_mul_v2r8(iq1,jq0);
+             /* COULOMB ELECTROSTATICS: velec = qq*rinv, felec = velec*rinvsq */
+             velec            = _fjsp_mul_v2r8(qq10,rinv10);
+             felec            = _fjsp_mul_v2r8(velec,rinvsq10);
+             fscal            = felec;
+             /* Update vectorial force for i site 1 and the j particle */
+             fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+             
+             fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* Compute parameters for interactions between i site 2 and the j atoms */
+             qq20             = _fjsp_mul_v2r8(iq2,jq0);
+             /* COULOMB ELECTROSTATICS: velec = qq*rinv, felec = velec*rinvsq */
+             velec            = _fjsp_mul_v2r8(qq20,rinv20);
+             felec            = _fjsp_mul_v2r8(velec,rinvsq20);
+             fscal            = felec;
+             /* Update vectorial force for i site 2 and the j particle; the summed j force is then passed to the decrement helper */
+             fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+             
+             fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+             gmx_fjsp_decrement_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0);
+             /* Inner loop uses 93 flops */
+         }
+         if(jidx<j_index_end)
+         {
+             jnrA             = jjnr[jidx];
+             j_coord_offsetA  = DIM*jnrA;
+             /* load j atom coordinates for the single leftover j entry (the list had an odd number of entries) */
+             gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                               &jx0,&jy0,&jz0);
+             /* Calculate displacement vector (i site minus j particle) for each of the three i sites */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             dx10             = _fjsp_sub_v2r8(ix1,jx0);
+             dy10             = _fjsp_sub_v2r8(iy1,jy0);
+             dz10             = _fjsp_sub_v2r8(iz1,jz0);
+             dx20             = _fjsp_sub_v2r8(ix2,jx0);
+             dy20             = _fjsp_sub_v2r8(iy2,jy0);
+             dz20             = _fjsp_sub_v2r8(iz2,jz0);
+             /* Calculate squared distance and things based on it: rinv via the invsqrt helper, rinvsq = rinv*rinv */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+             rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+             rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+             rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+             rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+             rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+             rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+             rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+             /* Load parameters for j particles: the one charge of jnrA, loaded via loadl onto a zeroed register */
+             jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
+             fjx0             = _fjsp_setzero_v2r8();
+             fjy0             = _fjsp_setzero_v2r8();
+             fjz0             = _fjsp_setzero_v2r8();
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* Compute parameters for interactions between i and j atoms (iq0 already contains facel) */
+             qq00             = _fjsp_mul_v2r8(iq0,jq0);
+             /* COULOMB ELECTROSTATICS: velec = qq*rinv, felec = velec*rinvsq */
+             velec            = _fjsp_mul_v2r8(qq00,rinv00);
+             felec            = _fjsp_mul_v2r8(velec,rinvsq00);
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force (fscal was just combined with a zero register via unpacklo; presumably this blanks the unused second element - confirm) */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             
+             fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* Compute parameters for interactions between i site 1 and the j atom */
+             qq10             = _fjsp_mul_v2r8(iq1,jq0);
+             /* COULOMB ELECTROSTATICS: velec = qq*rinv, felec = velec*rinvsq */
+             velec            = _fjsp_mul_v2r8(qq10,rinv10);
+             felec            = _fjsp_mul_v2r8(velec,rinvsq10);
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force for i site 1 and the j particle */
+             fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+             
+             fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* Compute parameters for interactions between i site 2 and the j atom */
+             qq20             = _fjsp_mul_v2r8(iq2,jq0);
+             /* COULOMB ELECTROSTATICS: velec = qq*rinv, felec = velec*rinvsq */
+             velec            = _fjsp_mul_v2r8(qq20,rinv20);
+             felec            = _fjsp_mul_v2r8(velec,rinvsq20);
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force for i site 2 and the j particle; the summed j force is then passed to the single-pointer decrement helper */
+             fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+             
+             fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+             gmx_fjsp_decrement_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0);
+             /* Inner loop uses 93 flops */
+         }
+         /* End of innermost loop: pass the nine accumulated i-site force registers to the helper together with the f and fshift destinations */
+         gmx_fjsp_update_iforce_3atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                               f+i_coord_offset,fshift+i_shift_offset);
+         /* Increment number of inner iterations (counts every j entry, including an odd leftover one) */
+         inneriter                  += j_index_end - j_index_start;
+         /* Outer loop uses 18 flops */
+     }
+     /* Increment number of outer iterations */
+     outeriter        += nri;
+     /* Update outer/inner flops: 18 per i entry plus 93 per j entry, booked under eNR_NBKERNEL_ELEC_W3_F */
+     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3_F,outeriter*18 + inneriter*93);
+ }
index 0000000000000000000000000000000000000000,55fe11657772bd21eacd8af847d1065923288189..55fe11657772bd21eacd8af847d1065923288189
mode 000000,100644..100644
--- /dev/null
@@@ -1,0 -1,1480 +1,1480 @@@
+ /*
+  * This file is part of the GROMACS molecular simulation package.
+  *
+  * Copyright (c) 2012, by the GROMACS development team, led by
+  * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+  * others, as listed in the AUTHORS file in the top-level source
+  * directory and at http://www.gromacs.org.
+  *
+  * GROMACS is free software; you can redistribute it and/or
+  * modify it under the terms of the GNU Lesser General Public License
+  * as published by the Free Software Foundation; either version 2.1
+  * of the License, or (at your option) any later version.
+  *
+  * GROMACS is distributed in the hope that it will be useful,
+  * but WITHOUT ANY WARRANTY; without even the implied warranty of
+  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  * Lesser General Public License for more details.
+  *
+  * You should have received a copy of the GNU Lesser General Public
+  * License along with GROMACS; if not, see
+  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+  *
+  * If you want to redistribute modifications to GROMACS, please
+  * consider that scientific software is very special. Version
+  * control is crucial - bugs must be traceable. We will be happy to
+  * consider code for inclusion in the official distribution, but
+  * derived work must not be called official GROMACS. Details are found
+  * in the README & COPYING files - if they are missing, get the
+  * official version at http://www.gromacs.org.
+  *
+  * To help us fund GROMACS development, we humbly ask that you cite
+  * the research papers on the package. Check out http://www.gromacs.org.
+  */
+ /*
+  * Note: this file was generated by the GROMACS sparc64_hpc_ace_double kernel generator.
+  */
+ #ifdef HAVE_CONFIG_H
+ #include <config.h>
+ #endif
+ #include <math.h>
+ #include "../nb_kernel.h"
+ #include "types/simple.h"
+ #include "vec.h"
+ #include "nrnb.h"
+ #include "kernelutil_sparc64_hpc_ace_double.h"
+ /*
+  * Gromacs nonbonded kernel:   nb_kernel_ElecCoul_VdwNone_GeomW3W3_VF_sparc64_hpc_ace_double
+  * Electrostatics interaction: Coulomb
+  * VdW interaction:            None
+  * Geometry:                   Water3-Water3
+  * Calculate force/pot:        PotentialAndForce
+  */
+ void
+ nb_kernel_ElecCoul_VdwNone_GeomW3W3_VF_sparc64_hpc_ace_double
+                     (t_nblist * gmx_restrict                nlist,
+                      rvec * gmx_restrict                    xx,
+                      rvec * gmx_restrict                    ff,
+                      t_forcerec * gmx_restrict              fr,
+                      t_mdatoms * gmx_restrict               mdatoms,
+                      nb_kernel_data_t * gmx_restrict        kernel_data,
+                      t_nrnb * gmx_restrict                  nrnb)
+ {
+     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+      * just 0 for non-waters.
+      * Suffixes A,B refer to j loop unrolling done with double precision SIMD, i.e. the two different
+      * jnr indices whose data are put in the two positions of the SIMD register.
+      */
+     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+     int              jnrA,jnrB;
+     int              j_coord_offsetA,j_coord_offsetB;
+     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+     real             rcutoff_scalar;
+     real             *shiftvec,*fshift,*x,*f;
+     _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+     int              vdwioffset0;
+     _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+     int              vdwioffset1;
+     _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+     int              vdwioffset2;
+     _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+     int              vdwjidx0A,vdwjidx0B;
+     _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+     int              vdwjidx1A,vdwjidx1B;
+     _fjsp_v2r8       jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+     int              vdwjidx2A,vdwjidx2B;
+     _fjsp_v2r8       jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+     _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+     _fjsp_v2r8       dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
+     _fjsp_v2r8       dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
+     _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+     _fjsp_v2r8       dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+     _fjsp_v2r8       dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+     _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+     _fjsp_v2r8       dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+     _fjsp_v2r8       dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+     _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+     real             *charge;
+     _fjsp_v2r8       itab_tmp;
+     _fjsp_v2r8       dummy_mask,cutoff_mask;
+     _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+     _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+     union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+     x                = xx[0];
+     f                = ff[0];
+     nri              = nlist->nri;
+     iinr             = nlist->iinr;
+     jindex           = nlist->jindex;
+     jjnr             = nlist->jjnr;
+     shiftidx         = nlist->shift;
+     gid              = nlist->gid;
+     shiftvec         = fr->shift_vec[0];
+     fshift           = fr->fshift[0];
+     facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+     charge           = mdatoms->chargeA;
+     /* Setup water-specific parameters */
+     inr              = nlist->iinr[0];
+     iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+0]));
+     iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+     iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+     jq0              = gmx_fjsp_set1_v2r8(charge[inr+0]);
+     jq1              = gmx_fjsp_set1_v2r8(charge[inr+1]);
+     jq2              = gmx_fjsp_set1_v2r8(charge[inr+2]);
+     qq00             = _fjsp_mul_v2r8(iq0,jq0);
+     qq01             = _fjsp_mul_v2r8(iq0,jq1);
+     qq02             = _fjsp_mul_v2r8(iq0,jq2);
+     qq10             = _fjsp_mul_v2r8(iq1,jq0);
+     qq11             = _fjsp_mul_v2r8(iq1,jq1);
+     qq12             = _fjsp_mul_v2r8(iq1,jq2);
+     qq20             = _fjsp_mul_v2r8(iq2,jq0);
+     qq21             = _fjsp_mul_v2r8(iq2,jq1);
+     qq22             = _fjsp_mul_v2r8(iq2,jq2);
+     /* Avoid stupid compiler warnings */
+     jnrA = jnrB = 0;
+     j_coord_offsetA = 0;
+     j_coord_offsetB = 0;
+     outeriter        = 0;
+     inneriter        = 0;
+     /* Start outer loop over neighborlists */
+     for(iidx=0; iidx<nri; iidx++)
+     {
+         /* Load shift vector for this list */
+         i_shift_offset   = DIM*shiftidx[iidx];
+         /* Load limits for loop over neighbors */
+         j_index_start    = jindex[iidx];
+         j_index_end      = jindex[iidx+1];
+         /* Get outer coordinate index */
+         inr              = iinr[iidx];
+         i_coord_offset   = DIM*inr;
+         /* Load i particle coords and add shift vector */
+         gmx_fjsp_load_shift_and_3rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                  &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+         fix0             = _fjsp_setzero_v2r8();
+         fiy0             = _fjsp_setzero_v2r8();
+         fiz0             = _fjsp_setzero_v2r8();
+         fix1             = _fjsp_setzero_v2r8();
+         fiy1             = _fjsp_setzero_v2r8();
+         fiz1             = _fjsp_setzero_v2r8();
+         fix2             = _fjsp_setzero_v2r8();
+         fiy2             = _fjsp_setzero_v2r8();
+         fiz2             = _fjsp_setzero_v2r8();
+         /* Reset potential sums */
+         velecsum         = _fjsp_setzero_v2r8();
+         /* Start inner kernel loop */
+         for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+         {
+             /* Get j neighbor index, and coordinate index */
+             jnrA             = jjnr[jidx];
+             jnrB             = jjnr[jidx+1];
+             j_coord_offsetA  = DIM*jnrA;
+             j_coord_offsetB  = DIM*jnrB;
+             /* load j atom coordinates */
+             gmx_fjsp_load_3rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+             /* Calculate displacement vector */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             dx01             = _fjsp_sub_v2r8(ix0,jx1);
+             dy01             = _fjsp_sub_v2r8(iy0,jy1);
+             dz01             = _fjsp_sub_v2r8(iz0,jz1);
+             dx02             = _fjsp_sub_v2r8(ix0,jx2);
+             dy02             = _fjsp_sub_v2r8(iy0,jy2);
+             dz02             = _fjsp_sub_v2r8(iz0,jz2);
+             dx10             = _fjsp_sub_v2r8(ix1,jx0);
+             dy10             = _fjsp_sub_v2r8(iy1,jy0);
+             dz10             = _fjsp_sub_v2r8(iz1,jz0);
+             dx11             = _fjsp_sub_v2r8(ix1,jx1);
+             dy11             = _fjsp_sub_v2r8(iy1,jy1);
+             dz11             = _fjsp_sub_v2r8(iz1,jz1);
+             dx12             = _fjsp_sub_v2r8(ix1,jx2);
+             dy12             = _fjsp_sub_v2r8(iy1,jy2);
+             dz12             = _fjsp_sub_v2r8(iz1,jz2);
+             dx20             = _fjsp_sub_v2r8(ix2,jx0);
+             dy20             = _fjsp_sub_v2r8(iy2,jy0);
+             dz20             = _fjsp_sub_v2r8(iz2,jz0);
+             dx21             = _fjsp_sub_v2r8(ix2,jx1);
+             dy21             = _fjsp_sub_v2r8(iy2,jy1);
+             dz21             = _fjsp_sub_v2r8(iz2,jz1);
+             dx22             = _fjsp_sub_v2r8(ix2,jx2);
+             dy22             = _fjsp_sub_v2r8(iy2,jy2);
+             dz22             = _fjsp_sub_v2r8(iz2,jz2);
+             /* Calculate squared distance and things based on it */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rsq01            = gmx_fjsp_calc_rsq_v2r8(dx01,dy01,dz01);
+             rsq02            = gmx_fjsp_calc_rsq_v2r8(dx02,dy02,dz02);
+             rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+             rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+             rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+             rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+             rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+             rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+             rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+             rinv01           = gmx_fjsp_invsqrt_v2r8(rsq01);
+             rinv02           = gmx_fjsp_invsqrt_v2r8(rsq02);
+             rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+             rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+             rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+             rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+             rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+             rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+             rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+             rinvsq01         = _fjsp_mul_v2r8(rinv01,rinv01);
+             rinvsq02         = _fjsp_mul_v2r8(rinv02,rinv02);
+             rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+             rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+             rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+             rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+             rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+             rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+             fjx0             = _fjsp_setzero_v2r8();
+             fjy0             = _fjsp_setzero_v2r8();
+             fjz0             = _fjsp_setzero_v2r8();
+             fjx1             = _fjsp_setzero_v2r8();
+             fjy1             = _fjsp_setzero_v2r8();
+             fjz1             = _fjsp_setzero_v2r8();
+             fjx2             = _fjsp_setzero_v2r8();
+             fjy2             = _fjsp_setzero_v2r8();
+             fjz2             = _fjsp_setzero_v2r8();
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* COULOMB ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq00,rinv00);
+             felec            = _fjsp_mul_v2r8(velec,rinvsq00);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             
+             fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* COULOMB ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq01,rinv01);
+             felec            = _fjsp_mul_v2r8(velec,rinvsq01);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx01,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy01,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz01,fscal,fiz0);
+             
+             fjx1             = _fjsp_madd_v2r8(dx01,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy01,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz01,fscal,fjz1);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* COULOMB ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq02,rinv02);
+             felec            = _fjsp_mul_v2r8(velec,rinvsq02);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx02,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy02,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz02,fscal,fiz0);
+             
+             fjx2             = _fjsp_madd_v2r8(dx02,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy02,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz02,fscal,fjz2);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* COULOMB ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq10,rinv10);
+             felec            = _fjsp_mul_v2r8(velec,rinvsq10);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+             
+             fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* COULOMB ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq11,rinv11);
+             felec            = _fjsp_mul_v2r8(velec,rinvsq11);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+             
+             fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* COULOMB ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq12,rinv12);
+             felec            = _fjsp_mul_v2r8(velec,rinvsq12);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+             
+             fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* COULOMB ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq20,rinv20);
+             felec            = _fjsp_mul_v2r8(velec,rinvsq20);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+             
+             fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* COULOMB ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq21,rinv21);
+             felec            = _fjsp_mul_v2r8(velec,rinvsq21);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+             
+             fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* COULOMB ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq22,rinv22);
+             felec            = _fjsp_mul_v2r8(velec,rinvsq22);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+             
+             fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+             gmx_fjsp_decrement_3rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+             /* Inner loop uses 279 flops */
+         }
+         if(jidx<j_index_end)
+         {
+             jnrA             = jjnr[jidx];
+             j_coord_offsetA  = DIM*jnrA;
+             /* load j atom coordinates */
+             gmx_fjsp_load_3rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+             /* Calculate displacement vector */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             dx01             = _fjsp_sub_v2r8(ix0,jx1);
+             dy01             = _fjsp_sub_v2r8(iy0,jy1);
+             dz01             = _fjsp_sub_v2r8(iz0,jz1);
+             dx02             = _fjsp_sub_v2r8(ix0,jx2);
+             dy02             = _fjsp_sub_v2r8(iy0,jy2);
+             dz02             = _fjsp_sub_v2r8(iz0,jz2);
+             dx10             = _fjsp_sub_v2r8(ix1,jx0);
+             dy10             = _fjsp_sub_v2r8(iy1,jy0);
+             dz10             = _fjsp_sub_v2r8(iz1,jz0);
+             dx11             = _fjsp_sub_v2r8(ix1,jx1);
+             dy11             = _fjsp_sub_v2r8(iy1,jy1);
+             dz11             = _fjsp_sub_v2r8(iz1,jz1);
+             dx12             = _fjsp_sub_v2r8(ix1,jx2);
+             dy12             = _fjsp_sub_v2r8(iy1,jy2);
+             dz12             = _fjsp_sub_v2r8(iz1,jz2);
+             dx20             = _fjsp_sub_v2r8(ix2,jx0);
+             dy20             = _fjsp_sub_v2r8(iy2,jy0);
+             dz20             = _fjsp_sub_v2r8(iz2,jz0);
+             dx21             = _fjsp_sub_v2r8(ix2,jx1);
+             dy21             = _fjsp_sub_v2r8(iy2,jy1);
+             dz21             = _fjsp_sub_v2r8(iz2,jz1);
+             dx22             = _fjsp_sub_v2r8(ix2,jx2);
+             dy22             = _fjsp_sub_v2r8(iy2,jy2);
+             dz22             = _fjsp_sub_v2r8(iz2,jz2);
+             /* Calculate squared distance and things based on it */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rsq01            = gmx_fjsp_calc_rsq_v2r8(dx01,dy01,dz01);
+             rsq02            = gmx_fjsp_calc_rsq_v2r8(dx02,dy02,dz02);
+             rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+             rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+             rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+             rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+             rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+             rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+             rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+             rinv01           = gmx_fjsp_invsqrt_v2r8(rsq01);
+             rinv02           = gmx_fjsp_invsqrt_v2r8(rsq02);
+             rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+             rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+             rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+             rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+             rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+             rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+             rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+             rinvsq01         = _fjsp_mul_v2r8(rinv01,rinv01);
+             rinvsq02         = _fjsp_mul_v2r8(rinv02,rinv02);
+             rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+             rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+             rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+             rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+             rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+             rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+             fjx0             = _fjsp_setzero_v2r8();
+             fjy0             = _fjsp_setzero_v2r8();
+             fjz0             = _fjsp_setzero_v2r8();
+             fjx1             = _fjsp_setzero_v2r8();
+             fjy1             = _fjsp_setzero_v2r8();
+             fjz1             = _fjsp_setzero_v2r8();
+             fjx2             = _fjsp_setzero_v2r8();
+             fjy2             = _fjsp_setzero_v2r8();
+             fjz2             = _fjsp_setzero_v2r8();
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* COULOMB ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq00,rinv00);
+             felec            = _fjsp_mul_v2r8(velec,rinvsq00);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             
+             fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* COULOMB ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq01,rinv01);
+             felec            = _fjsp_mul_v2r8(velec,rinvsq01);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx01,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy01,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz01,fscal,fiz0);
+             
+             fjx1             = _fjsp_madd_v2r8(dx01,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy01,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz01,fscal,fjz1);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* COULOMB ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq02,rinv02);
+             felec            = _fjsp_mul_v2r8(velec,rinvsq02);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx02,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy02,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz02,fscal,fiz0);
+             
+             fjx2             = _fjsp_madd_v2r8(dx02,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy02,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz02,fscal,fjz2);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* COULOMB ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq10,rinv10);
+             felec            = _fjsp_mul_v2r8(velec,rinvsq10);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+             
+             fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* COULOMB ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq11,rinv11);
+             felec            = _fjsp_mul_v2r8(velec,rinvsq11);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+             
+             fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* COULOMB ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq12,rinv12);
+             felec            = _fjsp_mul_v2r8(velec,rinvsq12);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+             
+             fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* COULOMB ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq20,rinv20);
+             felec            = _fjsp_mul_v2r8(velec,rinvsq20);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+             
+             fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* COULOMB ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq21,rinv21);
+             felec            = _fjsp_mul_v2r8(velec,rinvsq21);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+             
+             fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* COULOMB ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq22,rinv22);
+             felec            = _fjsp_mul_v2r8(velec,rinvsq22);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+             
+             fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+             gmx_fjsp_decrement_3rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+             /* Inner loop uses 279 flops */
+         }
+         /* End of innermost loop */
+         gmx_fjsp_update_iforce_3atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                               f+i_coord_offset,fshift+i_shift_offset);
+         ggid                        = gid[iidx];
+         /* Update potential energies */
+         gmx_fjsp_update_1pot_v2r8(velecsum,kernel_data->energygrp_elec+ggid);
+         /* Increment number of inner iterations */
+         inneriter                  += j_index_end - j_index_start;
+         /* Outer loop uses 19 flops */
+     }
+     /* Increment number of outer iterations */
+     outeriter        += nri;
+     /* Update outer/inner flops */
+     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3W3_VF,outeriter*19 + inneriter*279);
+ }
+ /*
+  * Gromacs nonbonded kernel:   nb_kernel_ElecCoul_VdwNone_GeomW3W3_F_sparc64_hpc_ace_double
+  * Electrostatics interaction: Coulomb
+  * VdW interaction:            None
+  * Geometry:                   Water3-Water3
+  * Calculate force/pot:        Force
+  */
+ void
+ nb_kernel_ElecCoul_VdwNone_GeomW3W3_F_sparc64_hpc_ace_double
+                     (t_nblist * gmx_restrict                nlist,
+                      rvec * gmx_restrict                    xx,
+                      rvec * gmx_restrict                    ff,
+                      t_forcerec * gmx_restrict              fr,
+                      t_mdatoms * gmx_restrict               mdatoms,
+                      nb_kernel_data_t * gmx_restrict        kernel_data,
+                      t_nrnb * gmx_restrict                  nrnb)
+ {
+     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+      * just 0 for non-waters.
+      * Suffixes A,B refer to j loop unrolling done with double precision SIMD, i.e. the two different
+      * jnr indices corresponding to data put in the two positions in the SIMD register.
+      */
+     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+     int              jnrA,jnrB;
+     int              j_coord_offsetA,j_coord_offsetB;
+     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+     real             rcutoff_scalar;
+     real             *shiftvec,*fshift,*x,*f;
+     _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+     int              vdwioffset0;
+     _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+     int              vdwioffset1;
+     _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+     int              vdwioffset2;
+     _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+     int              vdwjidx0A,vdwjidx0B;
+     _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+     int              vdwjidx1A,vdwjidx1B;
+     _fjsp_v2r8       jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+     int              vdwjidx2A,vdwjidx2B;
+     _fjsp_v2r8       jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+     _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+     _fjsp_v2r8       dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
+     _fjsp_v2r8       dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
+     _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+     _fjsp_v2r8       dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+     _fjsp_v2r8       dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+     _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+     _fjsp_v2r8       dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+     _fjsp_v2r8       dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+     _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+     real             *charge;
+     _fjsp_v2r8       itab_tmp;
+     _fjsp_v2r8       dummy_mask,cutoff_mask;
+     _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+     _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+     union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+     x                = xx[0];
+     f                = ff[0];
+     nri              = nlist->nri;
+     iinr             = nlist->iinr;
+     jindex           = nlist->jindex;
+     jjnr             = nlist->jjnr;
+     shiftidx         = nlist->shift;
+     gid              = nlist->gid;
+     shiftvec         = fr->shift_vec[0];
+     fshift           = fr->fshift[0];
+     facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+     charge           = mdatoms->chargeA;
+     /* Setup water-specific parameters */
+     inr              = nlist->iinr[0];
+     iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+0]));
+     iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+     iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+     jq0              = gmx_fjsp_set1_v2r8(charge[inr+0]);
+     jq1              = gmx_fjsp_set1_v2r8(charge[inr+1]);
+     jq2              = gmx_fjsp_set1_v2r8(charge[inr+2]);
+     qq00             = _fjsp_mul_v2r8(iq0,jq0);
+     qq01             = _fjsp_mul_v2r8(iq0,jq1);
+     qq02             = _fjsp_mul_v2r8(iq0,jq2);
+     qq10             = _fjsp_mul_v2r8(iq1,jq0);
+     qq11             = _fjsp_mul_v2r8(iq1,jq1);
+     qq12             = _fjsp_mul_v2r8(iq1,jq2);
+     qq20             = _fjsp_mul_v2r8(iq2,jq0);
+     qq21             = _fjsp_mul_v2r8(iq2,jq1);
+     qq22             = _fjsp_mul_v2r8(iq2,jq2);
+     /* Avoid stupid compiler warnings */
+     jnrA = jnrB = 0;
+     j_coord_offsetA = 0;
+     j_coord_offsetB = 0;
+     outeriter        = 0;
+     inneriter        = 0;
+     /* Start outer loop over neighborlists */
+     for(iidx=0; iidx<nri; iidx++)
+     {
+         /* Load shift vector for this list */
+         i_shift_offset   = DIM*shiftidx[iidx];
+         /* Load limits for loop over neighbors */
+         j_index_start    = jindex[iidx];
+         j_index_end      = jindex[iidx+1];
+         /* Get outer coordinate index */
+         inr              = iinr[iidx];
+         i_coord_offset   = DIM*inr;
+         /* Load i particle coords and add shift vector */
+         gmx_fjsp_load_shift_and_3rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                  &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+         fix0             = _fjsp_setzero_v2r8();
+         fiy0             = _fjsp_setzero_v2r8();
+         fiz0             = _fjsp_setzero_v2r8();
+         fix1             = _fjsp_setzero_v2r8();
+         fiy1             = _fjsp_setzero_v2r8();
+         fiz1             = _fjsp_setzero_v2r8();
+         fix2             = _fjsp_setzero_v2r8();
+         fiy2             = _fjsp_setzero_v2r8();
+         fiz2             = _fjsp_setzero_v2r8();
+         /* Start inner kernel loop */
+         for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+         {
+             /* Get j neighbor index, and coordinate index */
+             jnrA             = jjnr[jidx];
+             jnrB             = jjnr[jidx+1];
+             j_coord_offsetA  = DIM*jnrA;
+             j_coord_offsetB  = DIM*jnrB;
+             /* load j atom coordinates */
+             gmx_fjsp_load_3rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+             /* Calculate displacement vector */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             dx01             = _fjsp_sub_v2r8(ix0,jx1);
+             dy01             = _fjsp_sub_v2r8(iy0,jy1);
+             dz01             = _fjsp_sub_v2r8(iz0,jz1);
+             dx02             = _fjsp_sub_v2r8(ix0,jx2);
+             dy02             = _fjsp_sub_v2r8(iy0,jy2);
+             dz02             = _fjsp_sub_v2r8(iz0,jz2);
+             dx10             = _fjsp_sub_v2r8(ix1,jx0);
+             dy10             = _fjsp_sub_v2r8(iy1,jy0);
+             dz10             = _fjsp_sub_v2r8(iz1,jz0);
+             dx11             = _fjsp_sub_v2r8(ix1,jx1);
+             dy11             = _fjsp_sub_v2r8(iy1,jy1);
+             dz11             = _fjsp_sub_v2r8(iz1,jz1);
+             dx12             = _fjsp_sub_v2r8(ix1,jx2);
+             dy12             = _fjsp_sub_v2r8(iy1,jy2);
+             dz12             = _fjsp_sub_v2r8(iz1,jz2);
+             dx20             = _fjsp_sub_v2r8(ix2,jx0);
+             dy20             = _fjsp_sub_v2r8(iy2,jy0);
+             dz20             = _fjsp_sub_v2r8(iz2,jz0);
+             dx21             = _fjsp_sub_v2r8(ix2,jx1);
+             dy21             = _fjsp_sub_v2r8(iy2,jy1);
+             dz21             = _fjsp_sub_v2r8(iz2,jz1);
+             dx22             = _fjsp_sub_v2r8(ix2,jx2);
+             dy22             = _fjsp_sub_v2r8(iy2,jy2);
+             dz22             = _fjsp_sub_v2r8(iz2,jz2);
+             /* Calculate squared distance and things based on it */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rsq01            = gmx_fjsp_calc_rsq_v2r8(dx01,dy01,dz01);
+             rsq02            = gmx_fjsp_calc_rsq_v2r8(dx02,dy02,dz02);
+             rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+             rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+             rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+             rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+             rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+             rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+             rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+             rinv01           = gmx_fjsp_invsqrt_v2r8(rsq01);
+             rinv02           = gmx_fjsp_invsqrt_v2r8(rsq02);
+             rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+             rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+             rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+             rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+             rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+             rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+             rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+             rinvsq01         = _fjsp_mul_v2r8(rinv01,rinv01);
+             rinvsq02         = _fjsp_mul_v2r8(rinv02,rinv02);
+             rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+             rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+             rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+             rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+             rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+             rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+             fjx0             = _fjsp_setzero_v2r8();
+             fjy0             = _fjsp_setzero_v2r8();
+             fjz0             = _fjsp_setzero_v2r8();
+             fjx1             = _fjsp_setzero_v2r8();
+             fjy1             = _fjsp_setzero_v2r8();
+             fjz1             = _fjsp_setzero_v2r8();
+             fjx2             = _fjsp_setzero_v2r8();
+             fjy2             = _fjsp_setzero_v2r8();
+             fjz2             = _fjsp_setzero_v2r8();
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* COULOMB ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq00,rinv00);
+             felec            = _fjsp_mul_v2r8(velec,rinvsq00);
+             fscal            = felec;
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             
+             fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* COULOMB ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq01,rinv01);
+             felec            = _fjsp_mul_v2r8(velec,rinvsq01);
+             fscal            = felec;
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx01,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy01,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz01,fscal,fiz0);
+             
+             fjx1             = _fjsp_madd_v2r8(dx01,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy01,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz01,fscal,fjz1);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* COULOMB ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq02,rinv02);
+             felec            = _fjsp_mul_v2r8(velec,rinvsq02);
+             fscal            = felec;
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx02,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy02,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz02,fscal,fiz0);
+             
+             fjx2             = _fjsp_madd_v2r8(dx02,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy02,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz02,fscal,fjz2);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* COULOMB ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq10,rinv10);
+             felec            = _fjsp_mul_v2r8(velec,rinvsq10);
+             fscal            = felec;
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+             
+             fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* COULOMB ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq11,rinv11);
+             felec            = _fjsp_mul_v2r8(velec,rinvsq11);
+             fscal            = felec;
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+             
+             fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* COULOMB ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq12,rinv12);
+             felec            = _fjsp_mul_v2r8(velec,rinvsq12);
+             fscal            = felec;
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+             
+             fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* COULOMB ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq20,rinv20);
+             felec            = _fjsp_mul_v2r8(velec,rinvsq20);
+             fscal            = felec;
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+             
+             fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* COULOMB ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq21,rinv21);
+             felec            = _fjsp_mul_v2r8(velec,rinvsq21);
+             fscal            = felec;
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+             
+             fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* COULOMB ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq22,rinv22);
+             felec            = _fjsp_mul_v2r8(velec,rinvsq22);
+             fscal            = felec;
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+             
+             fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+             gmx_fjsp_decrement_3rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+             /* Inner loop uses 270 flops */
+         }
+         if(jidx<j_index_end)
+         {
+             jnrA             = jjnr[jidx];
+             j_coord_offsetA  = DIM*jnrA;
+             /* load j atom coordinates */
+             gmx_fjsp_load_3rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+             /* Calculate displacement vector */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             dx01             = _fjsp_sub_v2r8(ix0,jx1);
+             dy01             = _fjsp_sub_v2r8(iy0,jy1);
+             dz01             = _fjsp_sub_v2r8(iz0,jz1);
+             dx02             = _fjsp_sub_v2r8(ix0,jx2);
+             dy02             = _fjsp_sub_v2r8(iy0,jy2);
+             dz02             = _fjsp_sub_v2r8(iz0,jz2);
+             dx10             = _fjsp_sub_v2r8(ix1,jx0);
+             dy10             = _fjsp_sub_v2r8(iy1,jy0);
+             dz10             = _fjsp_sub_v2r8(iz1,jz0);
+             dx11             = _fjsp_sub_v2r8(ix1,jx1);
+             dy11             = _fjsp_sub_v2r8(iy1,jy1);
+             dz11             = _fjsp_sub_v2r8(iz1,jz1);
+             dx12             = _fjsp_sub_v2r8(ix1,jx2);
+             dy12             = _fjsp_sub_v2r8(iy1,jy2);
+             dz12             = _fjsp_sub_v2r8(iz1,jz2);
+             dx20             = _fjsp_sub_v2r8(ix2,jx0);
+             dy20             = _fjsp_sub_v2r8(iy2,jy0);
+             dz20             = _fjsp_sub_v2r8(iz2,jz0);
+             dx21             = _fjsp_sub_v2r8(ix2,jx1);
+             dy21             = _fjsp_sub_v2r8(iy2,jy1);
+             dz21             = _fjsp_sub_v2r8(iz2,jz1);
+             dx22             = _fjsp_sub_v2r8(ix2,jx2);
+             dy22             = _fjsp_sub_v2r8(iy2,jy2);
+             dz22             = _fjsp_sub_v2r8(iz2,jz2);
+             /* Calculate squared distance and things based on it */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rsq01            = gmx_fjsp_calc_rsq_v2r8(dx01,dy01,dz01);
+             rsq02            = gmx_fjsp_calc_rsq_v2r8(dx02,dy02,dz02);
+             rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+             rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+             rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+             rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+             rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+             rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+             rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+             rinv01           = gmx_fjsp_invsqrt_v2r8(rsq01);
+             rinv02           = gmx_fjsp_invsqrt_v2r8(rsq02);
+             rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+             rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+             rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+             rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+             rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+             rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+             rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+             rinvsq01         = _fjsp_mul_v2r8(rinv01,rinv01);
+             rinvsq02         = _fjsp_mul_v2r8(rinv02,rinv02);
+             rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+             rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+             rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+             rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+             rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+             rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+             fjx0             = _fjsp_setzero_v2r8();
+             fjy0             = _fjsp_setzero_v2r8();
+             fjz0             = _fjsp_setzero_v2r8();
+             fjx1             = _fjsp_setzero_v2r8();
+             fjy1             = _fjsp_setzero_v2r8();
+             fjz1             = _fjsp_setzero_v2r8();
+             fjx2             = _fjsp_setzero_v2r8();
+             fjy2             = _fjsp_setzero_v2r8();
+             fjz2             = _fjsp_setzero_v2r8();
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* COULOMB ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq00,rinv00);
+             felec            = _fjsp_mul_v2r8(velec,rinvsq00);
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             
+             fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* COULOMB ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq01,rinv01);
+             felec            = _fjsp_mul_v2r8(velec,rinvsq01);
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx01,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy01,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz01,fscal,fiz0);
+             
+             fjx1             = _fjsp_madd_v2r8(dx01,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy01,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz01,fscal,fjz1);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* COULOMB ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq02,rinv02);
+             felec            = _fjsp_mul_v2r8(velec,rinvsq02);
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx02,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy02,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz02,fscal,fiz0);
+             
+             fjx2             = _fjsp_madd_v2r8(dx02,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy02,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz02,fscal,fjz2);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* COULOMB ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq10,rinv10);
+             felec            = _fjsp_mul_v2r8(velec,rinvsq10);
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+             
+             fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* COULOMB ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq11,rinv11);
+             felec            = _fjsp_mul_v2r8(velec,rinvsq11);
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+             
+             fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* COULOMB ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq12,rinv12);
+             felec            = _fjsp_mul_v2r8(velec,rinvsq12);
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+             
+             fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* COULOMB ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq20,rinv20);
+             felec            = _fjsp_mul_v2r8(velec,rinvsq20);
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+             
+             fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* COULOMB ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq21,rinv21);
+             felec            = _fjsp_mul_v2r8(velec,rinvsq21);
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+             
+             fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* COULOMB ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq22,rinv22);
+             felec            = _fjsp_mul_v2r8(velec,rinvsq22);
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+             
+             fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+             gmx_fjsp_decrement_3rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+             /* Inner loop uses 270 flops */
+         }
+         /* End of innermost loop */
+         gmx_fjsp_update_iforce_3atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                               f+i_coord_offset,fshift+i_shift_offset);
+         /* Increment number of inner iterations */
+         inneriter                  += j_index_end - j_index_start;
+         /* Outer loop uses 18 flops */
+     }
+     /* Increment number of outer iterations */
+     outeriter        += nri;
+     /* Update outer/inner flops */
+     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3W3_F,outeriter*18 + inneriter*270);
+ }
index 0000000000000000000000000000000000000000,faa314c1b5edbab7e402a9ad05809fef9da0e058..faa314c1b5edbab7e402a9ad05809fef9da0e058
mode 000000,100644..100644
--- /dev/null
@@@ -1,0 -1,792 +1,792 @@@
+ /*
+  * This file is part of the GROMACS molecular simulation package.
+  *
+  * Copyright (c) 2012, by the GROMACS development team, led by
+  * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+  * others, as listed in the AUTHORS file in the top-level source
+  * directory and at http://www.gromacs.org.
+  *
+  * GROMACS is free software; you can redistribute it and/or
+  * modify it under the terms of the GNU Lesser General Public License
+  * as published by the Free Software Foundation; either version 2.1
+  * of the License, or (at your option) any later version.
+  *
+  * GROMACS is distributed in the hope that it will be useful,
+  * but WITHOUT ANY WARRANTY; without even the implied warranty of
+  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  * Lesser General Public License for more details.
+  *
+  * You should have received a copy of the GNU Lesser General Public
+  * License along with GROMACS; if not, see
+  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+  *
+  * If you want to redistribute modifications to GROMACS, please
+  * consider that scientific software is very special. Version
+  * control is crucial - bugs must be traceable. We will be happy to
+  * consider code for inclusion in the official distribution, but
+  * derived work must not be called official GROMACS. Details are found
+  * in the README & COPYING files - if they are missing, get the
+  * official version at http://www.gromacs.org.
+  *
+  * To help us fund GROMACS development, we humbly ask that you cite
+  * the research papers on the package. Check out http://www.gromacs.org.
+  */
+ /*
+  * Note: this file was generated by the GROMACS sparc64_hpc_ace_double kernel generator.
+  */
+ #ifdef HAVE_CONFIG_H
+ #include <config.h>
+ #endif
+ #include <math.h>
+ #include "../nb_kernel.h"
+ #include "types/simple.h"
+ #include "vec.h"
+ #include "nrnb.h"
+ #include "kernelutil_sparc64_hpc_ace_double.h"
+ /*
+  * Gromacs nonbonded kernel:   nb_kernel_ElecCoul_VdwNone_GeomW4P1_VF_sparc64_hpc_ace_double
+  * Electrostatics interaction: Coulomb
+  * VdW interaction:            None
+  * Geometry:                   Water4-Particle
+  * Calculate force/pot:        PotentialAndForce
+  */
+ void
+ nb_kernel_ElecCoul_VdwNone_GeomW4P1_VF_sparc64_hpc_ace_double
+                     (t_nblist * gmx_restrict                nlist,
+                      rvec * gmx_restrict                    xx,
+                      rvec * gmx_restrict                    ff,
+                      t_forcerec * gmx_restrict              fr,
+                      t_mdatoms * gmx_restrict               mdatoms,
+                      nb_kernel_data_t * gmx_restrict        kernel_data,
+                      t_nrnb * gmx_restrict                  nrnb)
+ {
+     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+      * just 0 for non-waters.
+      * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+      * jnr indices corresponding to data put in the two positions in the SIMD register.
+      */
+     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+     int              jnrA,jnrB;
+     int              j_coord_offsetA,j_coord_offsetB;
+     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+     real             rcutoff_scalar;
+     real             *shiftvec,*fshift,*x,*f;
+     _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+     int              vdwioffset1;
+     _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+     int              vdwioffset2;
+     _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+     int              vdwioffset3;
+     _fjsp_v2r8       ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+     int              vdwjidx0A,vdwjidx0B;
+     _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+     _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+     _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+     _fjsp_v2r8       dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30;
+     _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+     real             *charge;
+     _fjsp_v2r8       itab_tmp;
+     _fjsp_v2r8       dummy_mask,cutoff_mask;
+     _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+     _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+     union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+     x                = xx[0];
+     f                = ff[0];
+     nri              = nlist->nri;
+     iinr             = nlist->iinr;
+     jindex           = nlist->jindex;
+     jjnr             = nlist->jjnr;
+     shiftidx         = nlist->shift;
+     gid              = nlist->gid;
+     shiftvec         = fr->shift_vec[0];
+     fshift           = fr->fshift[0];
+     facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+     charge           = mdatoms->chargeA;
+     /* Setup water-specific parameters */
+     inr              = nlist->iinr[0];
+     iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+     iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+     iq3              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+3]));
+     /* Avoid stupid compiler warnings */
+     jnrA = jnrB = 0;
+     j_coord_offsetA = 0;
+     j_coord_offsetB = 0;
+     outeriter        = 0;
+     inneriter        = 0;
+     /* Start outer loop over neighborlists */
+     for(iidx=0; iidx<nri; iidx++)
+     {
+         /* Load shift vector for this list */
+         i_shift_offset   = DIM*shiftidx[iidx];
+         /* Load limits for loop over neighbors */
+         j_index_start    = jindex[iidx];
+         j_index_end      = jindex[iidx+1];
+         /* Get outer coordinate index */
+         inr              = iinr[iidx];
+         i_coord_offset   = DIM*inr;
+         /* Load i particle coords and add shift vector */
+         gmx_fjsp_load_shift_and_3rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset+DIM,
+                                                  &ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+         fix1             = _fjsp_setzero_v2r8();
+         fiy1             = _fjsp_setzero_v2r8();
+         fiz1             = _fjsp_setzero_v2r8();
+         fix2             = _fjsp_setzero_v2r8();
+         fiy2             = _fjsp_setzero_v2r8();
+         fiz2             = _fjsp_setzero_v2r8();
+         fix3             = _fjsp_setzero_v2r8();
+         fiy3             = _fjsp_setzero_v2r8();
+         fiz3             = _fjsp_setzero_v2r8();
+         /* Reset potential sums */
+         velecsum         = _fjsp_setzero_v2r8();
+         /* Start inner kernel loop */
+         for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+         {
+             /* Get j neighbor index, and coordinate index */
+             jnrA             = jjnr[jidx];
+             jnrB             = jjnr[jidx+1];
+             j_coord_offsetA  = DIM*jnrA;
+             j_coord_offsetB  = DIM*jnrB;
+             /* load j atom coordinates */
+             gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                               &jx0,&jy0,&jz0);
+             /* Calculate displacement vector */
+             dx10             = _fjsp_sub_v2r8(ix1,jx0);
+             dy10             = _fjsp_sub_v2r8(iy1,jy0);
+             dz10             = _fjsp_sub_v2r8(iz1,jz0);
+             dx20             = _fjsp_sub_v2r8(ix2,jx0);
+             dy20             = _fjsp_sub_v2r8(iy2,jy0);
+             dz20             = _fjsp_sub_v2r8(iz2,jz0);
+             dx30             = _fjsp_sub_v2r8(ix3,jx0);
+             dy30             = _fjsp_sub_v2r8(iy3,jy0);
+             dz30             = _fjsp_sub_v2r8(iz3,jz0);
+             /* Calculate squared distance and things based on it */
+             rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+             rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+             rsq30            = gmx_fjsp_calc_rsq_v2r8(dx30,dy30,dz30);
+             rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+             rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+             rinv30           = gmx_fjsp_invsqrt_v2r8(rsq30);
+             rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+             rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+             rinvsq30         = _fjsp_mul_v2r8(rinv30,rinv30);
+             /* Load parameters for j particles */
+             jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+             fjx0             = _fjsp_setzero_v2r8();
+             fjy0             = _fjsp_setzero_v2r8();
+             fjz0             = _fjsp_setzero_v2r8();
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* Compute parameters for interactions between i and j atoms */
+             qq10             = _fjsp_mul_v2r8(iq1,jq0);
+             /* COULOMB ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq10,rinv10);
+             felec            = _fjsp_mul_v2r8(velec,rinvsq10);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+             
+             fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* Compute parameters for interactions between i and j atoms */
+             qq20             = _fjsp_mul_v2r8(iq2,jq0);
+             /* COULOMB ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq20,rinv20);
+             felec            = _fjsp_mul_v2r8(velec,rinvsq20);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+             
+             fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* Compute parameters for interactions between i and j atoms */
+             qq30             = _fjsp_mul_v2r8(iq3,jq0);
+             /* COULOMB ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq30,rinv30);
+             felec            = _fjsp_mul_v2r8(velec,rinvsq30);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             /* Update vectorial force */
+             fix3             = _fjsp_madd_v2r8(dx30,fscal,fix3);
+             fiy3             = _fjsp_madd_v2r8(dy30,fscal,fiy3);
+             fiz3             = _fjsp_madd_v2r8(dz30,fscal,fiz3);
+             
+             fjx0             = _fjsp_madd_v2r8(dx30,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy30,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz30,fscal,fjz0);
+             gmx_fjsp_decrement_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0);
+             /* Inner loop uses 96 flops */
+         }
+         if(jidx<j_index_end)
+         {
+             jnrA             = jjnr[jidx];
+             j_coord_offsetA  = DIM*jnrA;
+             /* load j atom coordinates */
+             gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                               &jx0,&jy0,&jz0);
+             /* Calculate displacement vector */
+             dx10             = _fjsp_sub_v2r8(ix1,jx0);
+             dy10             = _fjsp_sub_v2r8(iy1,jy0);
+             dz10             = _fjsp_sub_v2r8(iz1,jz0);
+             dx20             = _fjsp_sub_v2r8(ix2,jx0);
+             dy20             = _fjsp_sub_v2r8(iy2,jy0);
+             dz20             = _fjsp_sub_v2r8(iz2,jz0);
+             dx30             = _fjsp_sub_v2r8(ix3,jx0);
+             dy30             = _fjsp_sub_v2r8(iy3,jy0);
+             dz30             = _fjsp_sub_v2r8(iz3,jz0);
+             /* Calculate squared distance and things based on it */
+             rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+             rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+             rsq30            = gmx_fjsp_calc_rsq_v2r8(dx30,dy30,dz30);
+             rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+             rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+             rinv30           = gmx_fjsp_invsqrt_v2r8(rsq30);
+             rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+             rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+             rinvsq30         = _fjsp_mul_v2r8(rinv30,rinv30);
+             /* Load parameters for j particles */
+             jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
+             fjx0             = _fjsp_setzero_v2r8();
+             fjy0             = _fjsp_setzero_v2r8();
+             fjz0             = _fjsp_setzero_v2r8();
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* Compute parameters for interactions between i and j atoms */
+             qq10             = _fjsp_mul_v2r8(iq1,jq0);
+             /* COULOMB ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq10,rinv10);
+             felec            = _fjsp_mul_v2r8(velec,rinvsq10);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+             
+             fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* Compute parameters for interactions between i and j atoms */
+             qq20             = _fjsp_mul_v2r8(iq2,jq0);
+             /* COULOMB ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq20,rinv20);
+             felec            = _fjsp_mul_v2r8(velec,rinvsq20);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+             
+             fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* Compute parameters for interactions between i and j atoms */
+             qq30             = _fjsp_mul_v2r8(iq3,jq0);
+             /* COULOMB ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq30,rinv30);
+             felec            = _fjsp_mul_v2r8(velec,rinvsq30);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix3             = _fjsp_madd_v2r8(dx30,fscal,fix3);
+             fiy3             = _fjsp_madd_v2r8(dy30,fscal,fiy3);
+             fiz3             = _fjsp_madd_v2r8(dz30,fscal,fiz3);
+             
+             fjx0             = _fjsp_madd_v2r8(dx30,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy30,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz30,fscal,fjz0);
+             gmx_fjsp_decrement_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0);
+             /* Inner loop uses 96 flops */
+         }
+         /* End of innermost loop */
+         gmx_fjsp_update_iforce_3atom_swizzle_v2r8(fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                               f+i_coord_offset+DIM,fshift+i_shift_offset);
+         ggid                        = gid[iidx];
+         /* Update potential energies */
+         gmx_fjsp_update_1pot_v2r8(velecsum,kernel_data->energygrp_elec+ggid);
+         /* Increment number of inner iterations */
+         inneriter                  += j_index_end - j_index_start;
+         /* Outer loop uses 19 flops */
+     }
+     /* Increment number of outer iterations */
+     outeriter        += nri;
+     /* Update outer/inner flops */
+     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4_VF,outeriter*19 + inneriter*96);
+ }
+ /*
+  * Gromacs nonbonded kernel:   nb_kernel_ElecCoul_VdwNone_GeomW4P1_F_sparc64_hpc_ace_double
+  * Electrostatics interaction: Coulomb
+  * VdW interaction:            None
+  * Geometry:                   Water4-Particle
+  * Calculate force/pot:        Force
+  *
+  * Force-only kernel: for every i entry of the neighbor list it evaluates
+  * Coulomb interactions between three sites of the i molecule (sites 1..3;
+  * the coordinates and charge of site 0 are never loaded) and each single
+  * j particle listed for that entry. No potential energy is accumulated.
+  * The j loop handles two j particles per iteration, followed by a
+  * single-particle remainder when the j count is odd.
+  *
+  * Parameters as used by this kernel:
+  *   nlist       - neighbor list; nri, iinr, jindex, jjnr, shift and gid are read.
+  *   xx          - coordinates, accessed as a flat real array through xx[0].
+  *   ff          - forces, accessed as a flat real array through ff[0] and updated
+  *                 for both the i sites and the j particles.
+  *   fr          - shift_vec, fshift and epsfac are read; fshift is passed to the
+  *                 i-force update helper.
+  *   mdatoms     - chargeA is read.
+  *   kernel_data - not referenced in this force-only kernel.
+  *   nrnb        - receives the flop estimate through inc_nrnb().
+  */
+ void
+ nb_kernel_ElecCoul_VdwNone_GeomW4P1_F_sparc64_hpc_ace_double
+                     (t_nblist * gmx_restrict                nlist,
+                      rvec * gmx_restrict                    xx,
+                      rvec * gmx_restrict                    ff,
+                      t_forcerec * gmx_restrict              fr,
+                      t_mdatoms * gmx_restrict               mdatoms,
+                      nb_kernel_data_t * gmx_restrict        kernel_data,
+                      t_nrnb * gmx_restrict                  nrnb)
+ {
+     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+      * just 0 for non-waters.
+      * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+      * jnr indices corresponding to data put in the two positions in the SIMD register.
+      *
+      * NOTE(review): many locals declared below are never referenced in this kernel
+      * (e.g. rcutoff_scalar, tx/ty/tz, rcutoff, rcutoff2, jidxall, vdwioffset1-3,
+      * isai1-3, vdwjidx0A/B, isaj0, r10/r20/r30, c6_*/c12_*, velecsum, crf, krf, krf2,
+      * itab_tmp, dummy_mask, cutoff_mask, one, two, vfconv/gbconv/ewconv, ggid).
+      * Presumably they come from a declaration template shared by all generated
+      * kernels - confirm before pruning.
+      */
+     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+     int              jnrA,jnrB;
+     int              j_coord_offsetA,j_coord_offsetB;
+     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+     real             rcutoff_scalar;
+     real             *shiftvec,*fshift,*x,*f;
+     _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+     int              vdwioffset1;
+     _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+     int              vdwioffset2;
+     _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+     int              vdwioffset3;
+     _fjsp_v2r8       ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+     int              vdwjidx0A,vdwjidx0B;
+     _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+     _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+     _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+     _fjsp_v2r8       dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30;
+     _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+     real             *charge;
+     _fjsp_v2r8       itab_tmp;
+     _fjsp_v2r8       dummy_mask,cutoff_mask;
+     _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+     _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+     union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+     x                = xx[0]; /* flat view of the coordinate array */
+     f                = ff[0]; /* flat view of the force array */
+     nri              = nlist->nri;
+     iinr             = nlist->iinr;
+     jindex           = nlist->jindex;
+     jjnr             = nlist->jjnr;
+     shiftidx         = nlist->shift;
+     gid              = nlist->gid; /* stored but not read again in this kernel */
+     shiftvec         = fr->shift_vec[0];
+     fshift           = fr->fshift[0];
+     facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+     charge           = mdatoms->chargeA;
+     /* Setup water-specific parameters.
+      * The i-site charges (pre-multiplied by facel) are taken once from the first
+      * i entry and reused unchanged for every i entry in the list.
+      * NOTE(review): iinr[0] is read here before the nri loop, i.e. also when
+      * nri is 0 - confirm callers guarantee iinr[0] is readable in that case.
+      */
+     inr              = nlist->iinr[0];
+     iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+     iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+     iq3              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+3]));
+     /* Pre-initialize the j indices/offsets so the compiler does not warn about
+      * them; they are overwritten before use inside the j loops.
+      */
+     jnrA = jnrB = 0;
+     j_coord_offsetA = 0;
+     j_coord_offsetB = 0;
+     outeriter        = 0;
+     inneriter        = 0;
+     /* Start outer loop over neighborlists */
+     for(iidx=0; iidx<nri; iidx++)
+     {
+         /* Load shift vector for this list */
+         i_shift_offset   = DIM*shiftidx[iidx];
+         /* Load limits for loop over neighbors: j entries [j_index_start, j_index_end) */
+         j_index_start    = jindex[iidx];
+         j_index_end      = jindex[iidx+1];
+         /* Get outer coordinate index */
+         inr              = iinr[iidx];
+         i_coord_offset   = DIM*inr;
+         /* Load i particle coords and add shift vector.
+          * The +DIM offset starts the load at site 1, skipping site 0 of the i molecule.
+          */
+         gmx_fjsp_load_shift_and_3rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset+DIM,
+                                                  &ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+         fix1             = _fjsp_setzero_v2r8(); /* per-i-entry force accumulators for sites 1..3 */
+         fiy1             = _fjsp_setzero_v2r8();
+         fiz1             = _fjsp_setzero_v2r8();
+         fix2             = _fjsp_setzero_v2r8();
+         fiy2             = _fjsp_setzero_v2r8();
+         fiz2             = _fjsp_setzero_v2r8();
+         fix3             = _fjsp_setzero_v2r8();
+         fiy3             = _fjsp_setzero_v2r8();
+         fiz3             = _fjsp_setzero_v2r8();
+         /* Start inner kernel loop: two j particles (A and B) per iteration */
+         for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+         {
+             /* Get j neighbor index, and coordinate index */
+             jnrA             = jjnr[jidx];
+             jnrB             = jjnr[jidx+1];
+             j_coord_offsetA  = DIM*jnrA;
+             j_coord_offsetB  = DIM*jnrB;
+             /* load j atom coordinates */
+             gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                               &jx0,&jy0,&jz0);
+             /* Calculate displacement vector (i site minus j particle) */
+             dx10             = _fjsp_sub_v2r8(ix1,jx0);
+             dy10             = _fjsp_sub_v2r8(iy1,jy0);
+             dz10             = _fjsp_sub_v2r8(iz1,jz0);
+             dx20             = _fjsp_sub_v2r8(ix2,jx0);
+             dy20             = _fjsp_sub_v2r8(iy2,jy0);
+             dz20             = _fjsp_sub_v2r8(iz2,jz0);
+             dx30             = _fjsp_sub_v2r8(ix3,jx0);
+             dy30             = _fjsp_sub_v2r8(iy3,jy0);
+             dz30             = _fjsp_sub_v2r8(iz3,jz0);
+             /* Calculate squared distance and things based on it */
+             rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+             rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+             rsq30            = gmx_fjsp_calc_rsq_v2r8(dx30,dy30,dz30);
+             rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+             rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+             rinv30           = gmx_fjsp_invsqrt_v2r8(rsq30);
+             rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+             rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+             rinvsq30         = _fjsp_mul_v2r8(rinv30,rinv30);
+             /* Load parameters for j particles (charges of j particles A and B) */
+             jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+             fjx0             = _fjsp_setzero_v2r8();
+             fjy0             = _fjsp_setzero_v2r8();
+             fjz0             = _fjsp_setzero_v2r8();
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* Compute parameters for interactions between i and j atoms */
+             qq10             = _fjsp_mul_v2r8(iq1,jq0);
+             /* COULOMB ELECTROSTATICS: fscal = qq * rinv * rinvsq */
+             velec            = _fjsp_mul_v2r8(qq10,rinv10);
+             felec            = _fjsp_mul_v2r8(velec,rinvsq10);
+             fscal            = felec;
+             /* Update vectorial force: the same d*fscal terms go into the i-site
+              * accumulator and into the j accumulator.
+              */
+             fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+             
+             fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* Compute parameters for interactions between i and j atoms */
+             qq20             = _fjsp_mul_v2r8(iq2,jq0);
+             /* COULOMB ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq20,rinv20);
+             felec            = _fjsp_mul_v2r8(velec,rinvsq20);
+             fscal            = felec;
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+             
+             fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* Compute parameters for interactions between i and j atoms */
+             qq30             = _fjsp_mul_v2r8(iq3,jq0);
+             /* COULOMB ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq30,rinv30);
+             felec            = _fjsp_mul_v2r8(velec,rinvsq30);
+             fscal            = felec;
+             /* Update vectorial force */
+             fix3             = _fjsp_madd_v2r8(dx30,fscal,fix3);
+             fiy3             = _fjsp_madd_v2r8(dy30,fscal,fiy3);
+             fiz3             = _fjsp_madd_v2r8(dz30,fscal,fiz3);
+             
+             fjx0             = _fjsp_madd_v2r8(dx30,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy30,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz30,fscal,fjz0);
+             gmx_fjsp_decrement_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0); /* presumably subtracts the summed j force from f for j particles A and B - confirm in kernelutil header */
+             /* Inner loop uses 93 flops */
+         }
+         if(jidx<j_index_end) /* odd j count: one j particle is left over for this i entry */
+         {
+             jnrA             = jjnr[jidx];
+             j_coord_offsetA  = DIM*jnrA;
+             /* load j atom coordinates (single-pointer variant; only particle A exists) */
+             gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                               &jx0,&jy0,&jz0);
+             /* Calculate displacement vector */
+             dx10             = _fjsp_sub_v2r8(ix1,jx0);
+             dy10             = _fjsp_sub_v2r8(iy1,jy0);
+             dz10             = _fjsp_sub_v2r8(iz1,jz0);
+             dx20             = _fjsp_sub_v2r8(ix2,jx0);
+             dy20             = _fjsp_sub_v2r8(iy2,jy0);
+             dz20             = _fjsp_sub_v2r8(iz2,jz0);
+             dx30             = _fjsp_sub_v2r8(ix3,jx0);
+             dy30             = _fjsp_sub_v2r8(iy3,jy0);
+             dz30             = _fjsp_sub_v2r8(iz3,jz0);
+             /* Calculate squared distance and things based on it */
+             rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+             rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+             rsq30            = gmx_fjsp_calc_rsq_v2r8(dx30,dy30,dz30);
+             rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+             rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+             rinv30           = gmx_fjsp_invsqrt_v2r8(rsq30);
+             rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+             rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+             rinvsq30         = _fjsp_mul_v2r8(rinv30,rinv30);
+             /* Load parameters for j particles: only the charge of particle A is
+              * loaded, combined with a zeroed register.
+              */
+             jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
+             fjx0             = _fjsp_setzero_v2r8();
+             fjy0             = _fjsp_setzero_v2r8();
+             fjz0             = _fjsp_setzero_v2r8();
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* Compute parameters for interactions between i and j atoms */
+             qq10             = _fjsp_mul_v2r8(iq1,jq0);
+             /* COULOMB ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq10,rinv10);
+             felec            = _fjsp_mul_v2r8(velec,rinvsq10);
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8()); /* presumably zeroes the lane that has no j particle so it adds no force - confirm intrinsic semantics */
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+             
+             fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* Compute parameters for interactions between i and j atoms */
+             qq20             = _fjsp_mul_v2r8(iq2,jq0);
+             /* COULOMB ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq20,rinv20);
+             felec            = _fjsp_mul_v2r8(velec,rinvsq20);
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+             
+             fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* Compute parameters for interactions between i and j atoms */
+             qq30             = _fjsp_mul_v2r8(iq3,jq0);
+             /* COULOMB ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq30,rinv30);
+             felec            = _fjsp_mul_v2r8(velec,rinvsq30);
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix3             = _fjsp_madd_v2r8(dx30,fscal,fix3);
+             fiy3             = _fjsp_madd_v2r8(dy30,fscal,fiy3);
+             fiz3             = _fjsp_madd_v2r8(dz30,fscal,fiz3);
+             
+             fjx0             = _fjsp_madd_v2r8(dx30,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy30,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz30,fscal,fjz0);
+             gmx_fjsp_decrement_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0);
+             /* Inner loop uses 93 flops */
+         }
+         /* End of innermost loop.
+          * Hand the accumulated i-site forces to the helper together with the force
+          * array (starting at site 1, hence +DIM) and the shift-force entry of this list.
+          */
+         gmx_fjsp_update_iforce_3atom_swizzle_v2r8(fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                               f+i_coord_offset+DIM,fshift+i_shift_offset);
+         /* Increment number of inner iterations (counted per j entry, not per
+          * two-particle loop pass)
+          */
+         inneriter                  += j_index_end - j_index_start;
+         /* Outer loop uses 18 flops */
+     }
+     /* Increment number of outer iterations */
+     outeriter        += nri;
+     /* Update outer/inner flops: 18 per i entry plus 93 per j entry */
+     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4_F,outeriter*18 + inneriter*93);
+ }
index 0000000000000000000000000000000000000000,51d8be897abd03e218b5a32e2026a23d17f6678a..51d8be897abd03e218b5a32e2026a23d17f6678a
mode 000000,100644..100644
--- /dev/null
@@@ -1,0 -1,1480 +1,1480 @@@
+ /*
+  * This file is part of the GROMACS molecular simulation package.
+  *
+  * Copyright (c) 2012, by the GROMACS development team, led by
+  * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+  * others, as listed in the AUTHORS file in the top-level source
+  * directory and at http://www.gromacs.org.
+  *
+  * GROMACS is free software; you can redistribute it and/or
+  * modify it under the terms of the GNU Lesser General Public License
+  * as published by the Free Software Foundation; either version 2.1
+  * of the License, or (at your option) any later version.
+  *
+  * GROMACS is distributed in the hope that it will be useful,
+  * but WITHOUT ANY WARRANTY; without even the implied warranty of
+  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  * Lesser General Public License for more details.
+  *
+  * You should have received a copy of the GNU Lesser General Public
+  * License along with GROMACS; if not, see
+  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+  *
+  * If you want to redistribute modifications to GROMACS, please
+  * consider that scientific software is very special. Version
+  * control is crucial - bugs must be traceable. We will be happy to
+  * consider code for inclusion in the official distribution, but
+  * derived work must not be called official GROMACS. Details are found
+  * in the README & COPYING files - if they are missing, get the
+  * official version at http://www.gromacs.org.
+  *
+  * To help us fund GROMACS development, we humbly ask that you cite
+  * the research papers on the package. Check out http://www.gromacs.org.
+  */
+ /*
+  * Note: this file was generated by the GROMACS sparc64_hpc_ace_double kernel generator.
+  */
+ #ifdef HAVE_CONFIG_H
+ #include <config.h>
+ #endif
+ #include <math.h>
+ #include "../nb_kernel.h"
+ #include "types/simple.h"
+ #include "vec.h"
+ #include "nrnb.h"
+ #include "kernelutil_sparc64_hpc_ace_double.h"
+ /*
+  * Gromacs nonbonded kernel:   nb_kernel_ElecCoul_VdwNone_GeomW4W4_VF_sparc64_hpc_ace_double
+  * Electrostatics interaction: Coulomb
+  * VdW interaction:            None
+  * Geometry:                   Water4-Water4
+  * Calculate force/pot:        PotentialAndForce
+  */
+ void
+ nb_kernel_ElecCoul_VdwNone_GeomW4W4_VF_sparc64_hpc_ace_double
+                     (t_nblist * gmx_restrict                nlist,
+                      rvec * gmx_restrict                    xx,
+                      rvec * gmx_restrict                    ff,
+                      t_forcerec * gmx_restrict              fr,
+                      t_mdatoms * gmx_restrict               mdatoms,
+                      nb_kernel_data_t * gmx_restrict        kernel_data,
+                      t_nrnb * gmx_restrict                  nrnb)
+ {
+     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+      * just 0 for non-waters.
+      * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+      * jnr indices corresponding to data put in the two positions in the SIMD register.
+      */
+     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+     int              jnrA,jnrB;
+     int              j_coord_offsetA,j_coord_offsetB;
+     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+     real             rcutoff_scalar;
+     real             *shiftvec,*fshift,*x,*f;
+     _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+     int              vdwioffset1;
+     _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+     int              vdwioffset2;
+     _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+     int              vdwioffset3;
+     _fjsp_v2r8       ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+     int              vdwjidx1A,vdwjidx1B;
+     _fjsp_v2r8       jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+     int              vdwjidx2A,vdwjidx2B;
+     _fjsp_v2r8       jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+     int              vdwjidx3A,vdwjidx3B;
+     _fjsp_v2r8       jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
+     _fjsp_v2r8       dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+     _fjsp_v2r8       dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+     _fjsp_v2r8       dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
+     _fjsp_v2r8       dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+     _fjsp_v2r8       dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+     _fjsp_v2r8       dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
+     _fjsp_v2r8       dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
+     _fjsp_v2r8       dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
+     _fjsp_v2r8       dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
+     _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+     real             *charge;
+     _fjsp_v2r8       itab_tmp;
+     _fjsp_v2r8       dummy_mask,cutoff_mask;
+     _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+     _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+     union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+     x                = xx[0];
+     f                = ff[0];
+     nri              = nlist->nri;
+     iinr             = nlist->iinr;
+     jindex           = nlist->jindex;
+     jjnr             = nlist->jjnr;
+     shiftidx         = nlist->shift;
+     gid              = nlist->gid;
+     shiftvec         = fr->shift_vec[0];
+     fshift           = fr->fshift[0];
+     facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+     charge           = mdatoms->chargeA;
+     /* Setup water-specific parameters */
+     inr              = nlist->iinr[0];
+     iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+     iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+     iq3              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+3]));
+     jq1              = gmx_fjsp_set1_v2r8(charge[inr+1]);
+     jq2              = gmx_fjsp_set1_v2r8(charge[inr+2]);
+     jq3              = gmx_fjsp_set1_v2r8(charge[inr+3]);
+     qq11             = _fjsp_mul_v2r8(iq1,jq1);
+     qq12             = _fjsp_mul_v2r8(iq1,jq2);
+     qq13             = _fjsp_mul_v2r8(iq1,jq3);
+     qq21             = _fjsp_mul_v2r8(iq2,jq1);
+     qq22             = _fjsp_mul_v2r8(iq2,jq2);
+     qq23             = _fjsp_mul_v2r8(iq2,jq3);
+     qq31             = _fjsp_mul_v2r8(iq3,jq1);
+     qq32             = _fjsp_mul_v2r8(iq3,jq2);
+     qq33             = _fjsp_mul_v2r8(iq3,jq3);
+     /* Avoid stupid compiler warnings */
+     jnrA = jnrB = 0;
+     j_coord_offsetA = 0;
+     j_coord_offsetB = 0;
+     outeriter        = 0;
+     inneriter        = 0;
+     /* Start outer loop over neighborlists */
+     for(iidx=0; iidx<nri; iidx++)
+     {
+         /* Load shift vector for this list */
+         i_shift_offset   = DIM*shiftidx[iidx];
+         /* Load limits for loop over neighbors */
+         j_index_start    = jindex[iidx];
+         j_index_end      = jindex[iidx+1];
+         /* Get outer coordinate index */
+         inr              = iinr[iidx];
+         i_coord_offset   = DIM*inr;
+         /* Load i particle coords and add shift vector */
+         gmx_fjsp_load_shift_and_3rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset+DIM,
+                                                  &ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+         fix1             = _fjsp_setzero_v2r8();
+         fiy1             = _fjsp_setzero_v2r8();
+         fiz1             = _fjsp_setzero_v2r8();
+         fix2             = _fjsp_setzero_v2r8();
+         fiy2             = _fjsp_setzero_v2r8();
+         fiz2             = _fjsp_setzero_v2r8();
+         fix3             = _fjsp_setzero_v2r8();
+         fiy3             = _fjsp_setzero_v2r8();
+         fiz3             = _fjsp_setzero_v2r8();
+         /* Reset potential sums */
+         velecsum         = _fjsp_setzero_v2r8();
+         /* Start inner kernel loop */
+         for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+         {
+             /* Get j neighbor index, and coordinate index */
+             jnrA             = jjnr[jidx];
+             jnrB             = jjnr[jidx+1];
+             j_coord_offsetA  = DIM*jnrA;
+             j_coord_offsetB  = DIM*jnrB;
+             /* load j atom coordinates */
+             gmx_fjsp_load_3rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA+DIM,x+j_coord_offsetB+DIM,
+                                               &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
+             /* Calculate displacement vector */
+             dx11             = _fjsp_sub_v2r8(ix1,jx1);
+             dy11             = _fjsp_sub_v2r8(iy1,jy1);
+             dz11             = _fjsp_sub_v2r8(iz1,jz1);
+             dx12             = _fjsp_sub_v2r8(ix1,jx2);
+             dy12             = _fjsp_sub_v2r8(iy1,jy2);
+             dz12             = _fjsp_sub_v2r8(iz1,jz2);
+             dx13             = _fjsp_sub_v2r8(ix1,jx3);
+             dy13             = _fjsp_sub_v2r8(iy1,jy3);
+             dz13             = _fjsp_sub_v2r8(iz1,jz3);
+             dx21             = _fjsp_sub_v2r8(ix2,jx1);
+             dy21             = _fjsp_sub_v2r8(iy2,jy1);
+             dz21             = _fjsp_sub_v2r8(iz2,jz1);
+             dx22             = _fjsp_sub_v2r8(ix2,jx2);
+             dy22             = _fjsp_sub_v2r8(iy2,jy2);
+             dz22             = _fjsp_sub_v2r8(iz2,jz2);
+             dx23             = _fjsp_sub_v2r8(ix2,jx3);
+             dy23             = _fjsp_sub_v2r8(iy2,jy3);
+             dz23             = _fjsp_sub_v2r8(iz2,jz3);
+             dx31             = _fjsp_sub_v2r8(ix3,jx1);
+             dy31             = _fjsp_sub_v2r8(iy3,jy1);
+             dz31             = _fjsp_sub_v2r8(iz3,jz1);
+             dx32             = _fjsp_sub_v2r8(ix3,jx2);
+             dy32             = _fjsp_sub_v2r8(iy3,jy2);
+             dz32             = _fjsp_sub_v2r8(iz3,jz2);
+             dx33             = _fjsp_sub_v2r8(ix3,jx3);
+             dy33             = _fjsp_sub_v2r8(iy3,jy3);
+             dz33             = _fjsp_sub_v2r8(iz3,jz3);
+             /* Calculate squared distance and things based on it */
+             rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+             rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+             rsq13            = gmx_fjsp_calc_rsq_v2r8(dx13,dy13,dz13);
+             rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+             rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+             rsq23            = gmx_fjsp_calc_rsq_v2r8(dx23,dy23,dz23);
+             rsq31            = gmx_fjsp_calc_rsq_v2r8(dx31,dy31,dz31);
+             rsq32            = gmx_fjsp_calc_rsq_v2r8(dx32,dy32,dz32);
+             rsq33            = gmx_fjsp_calc_rsq_v2r8(dx33,dy33,dz33);
+             rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+             rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+             rinv13           = gmx_fjsp_invsqrt_v2r8(rsq13);
+             rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+             rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+             rinv23           = gmx_fjsp_invsqrt_v2r8(rsq23);
+             rinv31           = gmx_fjsp_invsqrt_v2r8(rsq31);
+             rinv32           = gmx_fjsp_invsqrt_v2r8(rsq32);
+             rinv33           = gmx_fjsp_invsqrt_v2r8(rsq33);
+             rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+             rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+             rinvsq13         = _fjsp_mul_v2r8(rinv13,rinv13);
+             rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+             rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+             rinvsq23         = _fjsp_mul_v2r8(rinv23,rinv23);
+             rinvsq31         = _fjsp_mul_v2r8(rinv31,rinv31);
+             rinvsq32         = _fjsp_mul_v2r8(rinv32,rinv32);
+             rinvsq33         = _fjsp_mul_v2r8(rinv33,rinv33);
+             fjx1             = _fjsp_setzero_v2r8();
+             fjy1             = _fjsp_setzero_v2r8();
+             fjz1             = _fjsp_setzero_v2r8();
+             fjx2             = _fjsp_setzero_v2r8();
+             fjy2             = _fjsp_setzero_v2r8();
+             fjz2             = _fjsp_setzero_v2r8();
+             fjx3             = _fjsp_setzero_v2r8();
+             fjy3             = _fjsp_setzero_v2r8();
+             fjz3             = _fjsp_setzero_v2r8();
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* COULOMB ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq11,rinv11);
+             felec            = _fjsp_mul_v2r8(velec,rinvsq11);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+             
+             fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* COULOMB ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq12,rinv12);
+             felec            = _fjsp_mul_v2r8(velec,rinvsq12);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+             
+             fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* COULOMB ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq13,rinv13);
+             felec            = _fjsp_mul_v2r8(velec,rinvsq13);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx13,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy13,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz13,fscal,fiz1);
+             
+             fjx3             = _fjsp_madd_v2r8(dx13,fscal,fjx3);
+             fjy3             = _fjsp_madd_v2r8(dy13,fscal,fjy3);
+             fjz3             = _fjsp_madd_v2r8(dz13,fscal,fjz3);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* COULOMB ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq21,rinv21);
+             felec            = _fjsp_mul_v2r8(velec,rinvsq21);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+             
+             fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* COULOMB ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq22,rinv22);
+             felec            = _fjsp_mul_v2r8(velec,rinvsq22);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+             
+             fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* COULOMB ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq23,rinv23);
+             felec            = _fjsp_mul_v2r8(velec,rinvsq23);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx23,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy23,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz23,fscal,fiz2);
+             
+             fjx3             = _fjsp_madd_v2r8(dx23,fscal,fjx3);
+             fjy3             = _fjsp_madd_v2r8(dy23,fscal,fjy3);
+             fjz3             = _fjsp_madd_v2r8(dz23,fscal,fjz3);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* COULOMB ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq31,rinv31);
+             felec            = _fjsp_mul_v2r8(velec,rinvsq31);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             /* Update vectorial force */
+             fix3             = _fjsp_madd_v2r8(dx31,fscal,fix3);
+             fiy3             = _fjsp_madd_v2r8(dy31,fscal,fiy3);
+             fiz3             = _fjsp_madd_v2r8(dz31,fscal,fiz3);
+             
+             fjx1             = _fjsp_madd_v2r8(dx31,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy31,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz31,fscal,fjz1);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* COULOMB ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq32,rinv32);
+             felec            = _fjsp_mul_v2r8(velec,rinvsq32);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             /* Update vectorial force */
+             fix3             = _fjsp_madd_v2r8(dx32,fscal,fix3);
+             fiy3             = _fjsp_madd_v2r8(dy32,fscal,fiy3);
+             fiz3             = _fjsp_madd_v2r8(dz32,fscal,fiz3);
+             
+             fjx2             = _fjsp_madd_v2r8(dx32,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy32,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz32,fscal,fjz2);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* COULOMB ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq33,rinv33);
+             felec            = _fjsp_mul_v2r8(velec,rinvsq33);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             /* Update vectorial force */
+             fix3             = _fjsp_madd_v2r8(dx33,fscal,fix3);
+             fiy3             = _fjsp_madd_v2r8(dy33,fscal,fiy3);
+             fiz3             = _fjsp_madd_v2r8(dz33,fscal,fiz3);
+             
+             fjx3             = _fjsp_madd_v2r8(dx33,fscal,fjx3);
+             fjy3             = _fjsp_madd_v2r8(dy33,fscal,fjy3);
+             fjz3             = _fjsp_madd_v2r8(dz33,fscal,fjz3);
+             gmx_fjsp_decrement_3rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA+DIM,f+j_coord_offsetB+DIM,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+             /* Inner loop uses 279 flops */
+         }
+         if(jidx<j_index_end)
+         {
+             jnrA             = jjnr[jidx];
+             j_coord_offsetA  = DIM*jnrA;
+             /* load j atom coordinates */
+             gmx_fjsp_load_3rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA+DIM,
+                                               &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
+             /* Calculate displacement vector */
+             dx11             = _fjsp_sub_v2r8(ix1,jx1);
+             dy11             = _fjsp_sub_v2r8(iy1,jy1);
+             dz11             = _fjsp_sub_v2r8(iz1,jz1);
+             dx12             = _fjsp_sub_v2r8(ix1,jx2);
+             dy12             = _fjsp_sub_v2r8(iy1,jy2);
+             dz12             = _fjsp_sub_v2r8(iz1,jz2);
+             dx13             = _fjsp_sub_v2r8(ix1,jx3);
+             dy13             = _fjsp_sub_v2r8(iy1,jy3);
+             dz13             = _fjsp_sub_v2r8(iz1,jz3);
+             dx21             = _fjsp_sub_v2r8(ix2,jx1);
+             dy21             = _fjsp_sub_v2r8(iy2,jy1);
+             dz21             = _fjsp_sub_v2r8(iz2,jz1);
+             dx22             = _fjsp_sub_v2r8(ix2,jx2);
+             dy22             = _fjsp_sub_v2r8(iy2,jy2);
+             dz22             = _fjsp_sub_v2r8(iz2,jz2);
+             dx23             = _fjsp_sub_v2r8(ix2,jx3);
+             dy23             = _fjsp_sub_v2r8(iy2,jy3);
+             dz23             = _fjsp_sub_v2r8(iz2,jz3);
+             dx31             = _fjsp_sub_v2r8(ix3,jx1);
+             dy31             = _fjsp_sub_v2r8(iy3,jy1);
+             dz31             = _fjsp_sub_v2r8(iz3,jz1);
+             dx32             = _fjsp_sub_v2r8(ix3,jx2);
+             dy32             = _fjsp_sub_v2r8(iy3,jy2);
+             dz32             = _fjsp_sub_v2r8(iz3,jz2);
+             dx33             = _fjsp_sub_v2r8(ix3,jx3);
+             dy33             = _fjsp_sub_v2r8(iy3,jy3);
+             dz33             = _fjsp_sub_v2r8(iz3,jz3);
+             /* Calculate squared distance and things based on it */
+             rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+             rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+             rsq13            = gmx_fjsp_calc_rsq_v2r8(dx13,dy13,dz13);
+             rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+             rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+             rsq23            = gmx_fjsp_calc_rsq_v2r8(dx23,dy23,dz23);
+             rsq31            = gmx_fjsp_calc_rsq_v2r8(dx31,dy31,dz31);
+             rsq32            = gmx_fjsp_calc_rsq_v2r8(dx32,dy32,dz32);
+             rsq33            = gmx_fjsp_calc_rsq_v2r8(dx33,dy33,dz33);
+             rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+             rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+             rinv13           = gmx_fjsp_invsqrt_v2r8(rsq13);
+             rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+             rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+             rinv23           = gmx_fjsp_invsqrt_v2r8(rsq23);
+             rinv31           = gmx_fjsp_invsqrt_v2r8(rsq31);
+             rinv32           = gmx_fjsp_invsqrt_v2r8(rsq32);
+             rinv33           = gmx_fjsp_invsqrt_v2r8(rsq33);
+             rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+             rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+             rinvsq13         = _fjsp_mul_v2r8(rinv13,rinv13);
+             rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+             rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+             rinvsq23         = _fjsp_mul_v2r8(rinv23,rinv23);
+             rinvsq31         = _fjsp_mul_v2r8(rinv31,rinv31);
+             rinvsq32         = _fjsp_mul_v2r8(rinv32,rinv32);
+             rinvsq33         = _fjsp_mul_v2r8(rinv33,rinv33);
+             fjx1             = _fjsp_setzero_v2r8();
+             fjy1             = _fjsp_setzero_v2r8();
+             fjz1             = _fjsp_setzero_v2r8();
+             fjx2             = _fjsp_setzero_v2r8();
+             fjy2             = _fjsp_setzero_v2r8();
+             fjz2             = _fjsp_setzero_v2r8();
+             fjx3             = _fjsp_setzero_v2r8();
+             fjy3             = _fjsp_setzero_v2r8();
+             fjz3             = _fjsp_setzero_v2r8();
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* COULOMB ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq11,rinv11);
+             felec            = _fjsp_mul_v2r8(velec,rinvsq11);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+             
+             fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* COULOMB ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq12,rinv12);
+             felec            = _fjsp_mul_v2r8(velec,rinvsq12);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+             
+             fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* COULOMB ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq13,rinv13);
+             felec            = _fjsp_mul_v2r8(velec,rinvsq13);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx13,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy13,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz13,fscal,fiz1);
+             
+             fjx3             = _fjsp_madd_v2r8(dx13,fscal,fjx3);
+             fjy3             = _fjsp_madd_v2r8(dy13,fscal,fjy3);
+             fjz3             = _fjsp_madd_v2r8(dz13,fscal,fjz3);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* COULOMB ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq21,rinv21);
+             felec            = _fjsp_mul_v2r8(velec,rinvsq21);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+             
+             fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* COULOMB ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq22,rinv22);
+             felec            = _fjsp_mul_v2r8(velec,rinvsq22);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+             
+             fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* COULOMB ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq23,rinv23);
+             felec            = _fjsp_mul_v2r8(velec,rinvsq23);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx23,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy23,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz23,fscal,fiz2);
+             
+             fjx3             = _fjsp_madd_v2r8(dx23,fscal,fjx3);
+             fjy3             = _fjsp_madd_v2r8(dy23,fscal,fjy3);
+             fjz3             = _fjsp_madd_v2r8(dz23,fscal,fjz3);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* COULOMB ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq31,rinv31);
+             felec            = _fjsp_mul_v2r8(velec,rinvsq31);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix3             = _fjsp_madd_v2r8(dx31,fscal,fix3);
+             fiy3             = _fjsp_madd_v2r8(dy31,fscal,fiy3);
+             fiz3             = _fjsp_madd_v2r8(dz31,fscal,fiz3);
+             
+             fjx1             = _fjsp_madd_v2r8(dx31,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy31,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz31,fscal,fjz1);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* COULOMB ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq32,rinv32);
+             felec            = _fjsp_mul_v2r8(velec,rinvsq32);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix3             = _fjsp_madd_v2r8(dx32,fscal,fix3);
+             fiy3             = _fjsp_madd_v2r8(dy32,fscal,fiy3);
+             fiz3             = _fjsp_madd_v2r8(dz32,fscal,fiz3);
+             
+             fjx2             = _fjsp_madd_v2r8(dx32,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy32,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz32,fscal,fjz2);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* COULOMB ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq33,rinv33);
+             felec            = _fjsp_mul_v2r8(velec,rinvsq33);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix3             = _fjsp_madd_v2r8(dx33,fscal,fix3);
+             fiy3             = _fjsp_madd_v2r8(dy33,fscal,fiy3);
+             fiz3             = _fjsp_madd_v2r8(dz33,fscal,fiz3);
+             
+             fjx3             = _fjsp_madd_v2r8(dx33,fscal,fjx3);
+             fjy3             = _fjsp_madd_v2r8(dy33,fscal,fjy3);
+             fjz3             = _fjsp_madd_v2r8(dz33,fscal,fjz3);
+             gmx_fjsp_decrement_3rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA+DIM,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+             /* Inner loop uses 279 flops */
+         }
+         /* End of innermost loop */
+         gmx_fjsp_update_iforce_3atom_swizzle_v2r8(fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                               f+i_coord_offset+DIM,fshift+i_shift_offset);
+         ggid                        = gid[iidx];
+         /* Update potential energies */
+         gmx_fjsp_update_1pot_v2r8(velecsum,kernel_data->energygrp_elec+ggid);
+         /* Increment number of inner iterations */
+         inneriter                  += j_index_end - j_index_start;
+         /* Outer loop uses 19 flops */
+     }
+     /* Increment number of outer iterations */
+     outeriter        += nri;
+     /* Update outer/inner flops */
+     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4W4_VF,outeriter*19 + inneriter*279);
+ }
+ /*
+  * Gromacs nonbonded kernel:   nb_kernel_ElecCoul_VdwNone_GeomW4W4_F_sparc64_hpc_ace_double
+  * Electrostatics interaction: Coulomb
+  * VdW interaction:            None
+  * Geometry:                   Water4-Water4
+  * Calculate force/pot:        Force
+  */
+ void
+ nb_kernel_ElecCoul_VdwNone_GeomW4W4_F_sparc64_hpc_ace_double
+                     (t_nblist * gmx_restrict                nlist,
+                      rvec * gmx_restrict                    xx,
+                      rvec * gmx_restrict                    ff,
+                      t_forcerec * gmx_restrict              fr,
+                      t_mdatoms * gmx_restrict               mdatoms,
+                      nb_kernel_data_t * gmx_restrict        kernel_data,
+                      t_nrnb * gmx_restrict                  nrnb)
+ {
+     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+      * just 0 for non-waters.
+      * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+      * jnr indices corresponding to data put in the two positions in the SIMD register.
+      */
+     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+     int              jnrA,jnrB;
+     int              j_coord_offsetA,j_coord_offsetB;
+     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+     real             rcutoff_scalar;
+     real             *shiftvec,*fshift,*x,*f;
+     _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+     int              vdwioffset1;
+     _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+     int              vdwioffset2;
+     _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+     int              vdwioffset3;
+     _fjsp_v2r8       ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+     int              vdwjidx1A,vdwjidx1B;
+     _fjsp_v2r8       jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+     int              vdwjidx2A,vdwjidx2B;
+     _fjsp_v2r8       jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+     int              vdwjidx3A,vdwjidx3B;
+     _fjsp_v2r8       jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
+     _fjsp_v2r8       dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+     _fjsp_v2r8       dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+     _fjsp_v2r8       dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
+     _fjsp_v2r8       dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+     _fjsp_v2r8       dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+     _fjsp_v2r8       dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
+     _fjsp_v2r8       dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
+     _fjsp_v2r8       dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
+     _fjsp_v2r8       dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
+     _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+     real             *charge;
+     _fjsp_v2r8       itab_tmp;
+     _fjsp_v2r8       dummy_mask,cutoff_mask;
+     _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+     _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+     union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+     x                = xx[0];
+     f                = ff[0];
+     nri              = nlist->nri;
+     iinr             = nlist->iinr;
+     jindex           = nlist->jindex;
+     jjnr             = nlist->jjnr;
+     shiftidx         = nlist->shift;
+     gid              = nlist->gid;
+     shiftvec         = fr->shift_vec[0];
+     fshift           = fr->fshift[0];
+     facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+     charge           = mdatoms->chargeA;
+     /* Setup water-specific parameters */
+     inr              = nlist->iinr[0];
+     iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+     iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+     iq3              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+3]));
+     jq1              = gmx_fjsp_set1_v2r8(charge[inr+1]);
+     jq2              = gmx_fjsp_set1_v2r8(charge[inr+2]);
+     jq3              = gmx_fjsp_set1_v2r8(charge[inr+3]);
+     qq11             = _fjsp_mul_v2r8(iq1,jq1);
+     qq12             = _fjsp_mul_v2r8(iq1,jq2);
+     qq13             = _fjsp_mul_v2r8(iq1,jq3);
+     qq21             = _fjsp_mul_v2r8(iq2,jq1);
+     qq22             = _fjsp_mul_v2r8(iq2,jq2);
+     qq23             = _fjsp_mul_v2r8(iq2,jq3);
+     qq31             = _fjsp_mul_v2r8(iq3,jq1);
+     qq32             = _fjsp_mul_v2r8(iq3,jq2);
+     qq33             = _fjsp_mul_v2r8(iq3,jq3);
+     /* Avoid stupid compiler warnings */
+     jnrA = jnrB = 0;
+     j_coord_offsetA = 0;
+     j_coord_offsetB = 0;
+     outeriter        = 0;
+     inneriter        = 0;
+     /* Start outer loop over neighborlists */
+     for(iidx=0; iidx<nri; iidx++)
+     {
+         /* Load shift vector for this list */
+         i_shift_offset   = DIM*shiftidx[iidx];
+         /* Load limits for loop over neighbors */
+         j_index_start    = jindex[iidx];
+         j_index_end      = jindex[iidx+1];
+         /* Get outer coordinate index */
+         inr              = iinr[iidx];
+         i_coord_offset   = DIM*inr;
+         /* Load i particle coords and add shift vector */
+         gmx_fjsp_load_shift_and_3rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset+DIM,
+                                                  &ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+         fix1             = _fjsp_setzero_v2r8();
+         fiy1             = _fjsp_setzero_v2r8();
+         fiz1             = _fjsp_setzero_v2r8();
+         fix2             = _fjsp_setzero_v2r8();
+         fiy2             = _fjsp_setzero_v2r8();
+         fiz2             = _fjsp_setzero_v2r8();
+         fix3             = _fjsp_setzero_v2r8();
+         fiy3             = _fjsp_setzero_v2r8();
+         fiz3             = _fjsp_setzero_v2r8();
+         /* Start inner kernel loop */
+         for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+         {
+             /* Get j neighbor index, and coordinate index */
+             jnrA             = jjnr[jidx];
+             jnrB             = jjnr[jidx+1];
+             j_coord_offsetA  = DIM*jnrA;
+             j_coord_offsetB  = DIM*jnrB;
+             /* load j atom coordinates */
+             gmx_fjsp_load_3rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA+DIM,x+j_coord_offsetB+DIM,
+                                               &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
+             /* Calculate displacement vector */
+             dx11             = _fjsp_sub_v2r8(ix1,jx1);
+             dy11             = _fjsp_sub_v2r8(iy1,jy1);
+             dz11             = _fjsp_sub_v2r8(iz1,jz1);
+             dx12             = _fjsp_sub_v2r8(ix1,jx2);
+             dy12             = _fjsp_sub_v2r8(iy1,jy2);
+             dz12             = _fjsp_sub_v2r8(iz1,jz2);
+             dx13             = _fjsp_sub_v2r8(ix1,jx3);
+             dy13             = _fjsp_sub_v2r8(iy1,jy3);
+             dz13             = _fjsp_sub_v2r8(iz1,jz3);
+             dx21             = _fjsp_sub_v2r8(ix2,jx1);
+             dy21             = _fjsp_sub_v2r8(iy2,jy1);
+             dz21             = _fjsp_sub_v2r8(iz2,jz1);
+             dx22             = _fjsp_sub_v2r8(ix2,jx2);
+             dy22             = _fjsp_sub_v2r8(iy2,jy2);
+             dz22             = _fjsp_sub_v2r8(iz2,jz2);
+             dx23             = _fjsp_sub_v2r8(ix2,jx3);
+             dy23             = _fjsp_sub_v2r8(iy2,jy3);
+             dz23             = _fjsp_sub_v2r8(iz2,jz3);
+             dx31             = _fjsp_sub_v2r8(ix3,jx1);
+             dy31             = _fjsp_sub_v2r8(iy3,jy1);
+             dz31             = _fjsp_sub_v2r8(iz3,jz1);
+             dx32             = _fjsp_sub_v2r8(ix3,jx2);
+             dy32             = _fjsp_sub_v2r8(iy3,jy2);
+             dz32             = _fjsp_sub_v2r8(iz3,jz2);
+             dx33             = _fjsp_sub_v2r8(ix3,jx3);
+             dy33             = _fjsp_sub_v2r8(iy3,jy3);
+             dz33             = _fjsp_sub_v2r8(iz3,jz3);
+             /* Calculate squared distance and things based on it */
+             rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+             rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+             rsq13            = gmx_fjsp_calc_rsq_v2r8(dx13,dy13,dz13);
+             rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+             rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+             rsq23            = gmx_fjsp_calc_rsq_v2r8(dx23,dy23,dz23);
+             rsq31            = gmx_fjsp_calc_rsq_v2r8(dx31,dy31,dz31);
+             rsq32            = gmx_fjsp_calc_rsq_v2r8(dx32,dy32,dz32);
+             rsq33            = gmx_fjsp_calc_rsq_v2r8(dx33,dy33,dz33);
+             rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+             rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+             rinv13           = gmx_fjsp_invsqrt_v2r8(rsq13);
+             rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+             rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+             rinv23           = gmx_fjsp_invsqrt_v2r8(rsq23);
+             rinv31           = gmx_fjsp_invsqrt_v2r8(rsq31);
+             rinv32           = gmx_fjsp_invsqrt_v2r8(rsq32);
+             rinv33           = gmx_fjsp_invsqrt_v2r8(rsq33);
+             rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+             rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+             rinvsq13         = _fjsp_mul_v2r8(rinv13,rinv13);
+             rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+             rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+             rinvsq23         = _fjsp_mul_v2r8(rinv23,rinv23);
+             rinvsq31         = _fjsp_mul_v2r8(rinv31,rinv31);
+             rinvsq32         = _fjsp_mul_v2r8(rinv32,rinv32);
+             rinvsq33         = _fjsp_mul_v2r8(rinv33,rinv33);
+             fjx1             = _fjsp_setzero_v2r8();
+             fjy1             = _fjsp_setzero_v2r8();
+             fjz1             = _fjsp_setzero_v2r8();
+             fjx2             = _fjsp_setzero_v2r8();
+             fjy2             = _fjsp_setzero_v2r8();
+             fjz2             = _fjsp_setzero_v2r8();
+             fjx3             = _fjsp_setzero_v2r8();
+             fjy3             = _fjsp_setzero_v2r8();
+             fjz3             = _fjsp_setzero_v2r8();
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* COULOMB ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq11,rinv11);
+             felec            = _fjsp_mul_v2r8(velec,rinvsq11);
+             fscal            = felec;
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+             
+             fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* COULOMB ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq12,rinv12);
+             felec            = _fjsp_mul_v2r8(velec,rinvsq12);
+             fscal            = felec;
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+             
+             fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* COULOMB ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq13,rinv13);
+             felec            = _fjsp_mul_v2r8(velec,rinvsq13);
+             fscal            = felec;
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx13,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy13,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz13,fscal,fiz1);
+             
+             fjx3             = _fjsp_madd_v2r8(dx13,fscal,fjx3);
+             fjy3             = _fjsp_madd_v2r8(dy13,fscal,fjy3);
+             fjz3             = _fjsp_madd_v2r8(dz13,fscal,fjz3);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* COULOMB ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq21,rinv21);
+             felec            = _fjsp_mul_v2r8(velec,rinvsq21);
+             fscal            = felec;
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+             
+             fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* COULOMB ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq22,rinv22);
+             felec            = _fjsp_mul_v2r8(velec,rinvsq22);
+             fscal            = felec;
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+             
+             fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* COULOMB ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq23,rinv23);
+             felec            = _fjsp_mul_v2r8(velec,rinvsq23);
+             fscal            = felec;
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx23,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy23,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz23,fscal,fiz2);
+             
+             fjx3             = _fjsp_madd_v2r8(dx23,fscal,fjx3);
+             fjy3             = _fjsp_madd_v2r8(dy23,fscal,fjy3);
+             fjz3             = _fjsp_madd_v2r8(dz23,fscal,fjz3);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* COULOMB ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq31,rinv31);
+             felec            = _fjsp_mul_v2r8(velec,rinvsq31);
+             fscal            = felec;
+             /* Update vectorial force */
+             fix3             = _fjsp_madd_v2r8(dx31,fscal,fix3);
+             fiy3             = _fjsp_madd_v2r8(dy31,fscal,fiy3);
+             fiz3             = _fjsp_madd_v2r8(dz31,fscal,fiz3);
+             
+             fjx1             = _fjsp_madd_v2r8(dx31,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy31,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz31,fscal,fjz1);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* COULOMB ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq32,rinv32);
+             felec            = _fjsp_mul_v2r8(velec,rinvsq32);
+             fscal            = felec;
+             /* Update vectorial force */
+             fix3             = _fjsp_madd_v2r8(dx32,fscal,fix3);
+             fiy3             = _fjsp_madd_v2r8(dy32,fscal,fiy3);
+             fiz3             = _fjsp_madd_v2r8(dz32,fscal,fiz3);
+             
+             fjx2             = _fjsp_madd_v2r8(dx32,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy32,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz32,fscal,fjz2);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* COULOMB ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq33,rinv33);
+             felec            = _fjsp_mul_v2r8(velec,rinvsq33);
+             fscal            = felec;
+             /* Update vectorial force */
+             fix3             = _fjsp_madd_v2r8(dx33,fscal,fix3);
+             fiy3             = _fjsp_madd_v2r8(dy33,fscal,fiy3);
+             fiz3             = _fjsp_madd_v2r8(dz33,fscal,fiz3);
+             
+             fjx3             = _fjsp_madd_v2r8(dx33,fscal,fjx3);
+             fjy3             = _fjsp_madd_v2r8(dy33,fscal,fjy3);
+             fjz3             = _fjsp_madd_v2r8(dz33,fscal,fjz3);
+             gmx_fjsp_decrement_3rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA+DIM,f+j_coord_offsetB+DIM,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+             /* Inner loop uses 270 flops */
+         }
+         if(jidx<j_index_end)
+         {
+             jnrA             = jjnr[jidx];
+             j_coord_offsetA  = DIM*jnrA;
+             /* load j atom coordinates */
+             gmx_fjsp_load_3rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA+DIM,
+                                               &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
+             /* Calculate displacement vector */
+             dx11             = _fjsp_sub_v2r8(ix1,jx1);
+             dy11             = _fjsp_sub_v2r8(iy1,jy1);
+             dz11             = _fjsp_sub_v2r8(iz1,jz1);
+             dx12             = _fjsp_sub_v2r8(ix1,jx2);
+             dy12             = _fjsp_sub_v2r8(iy1,jy2);
+             dz12             = _fjsp_sub_v2r8(iz1,jz2);
+             dx13             = _fjsp_sub_v2r8(ix1,jx3);
+             dy13             = _fjsp_sub_v2r8(iy1,jy3);
+             dz13             = _fjsp_sub_v2r8(iz1,jz3);
+             dx21             = _fjsp_sub_v2r8(ix2,jx1);
+             dy21             = _fjsp_sub_v2r8(iy2,jy1);
+             dz21             = _fjsp_sub_v2r8(iz2,jz1);
+             dx22             = _fjsp_sub_v2r8(ix2,jx2);
+             dy22             = _fjsp_sub_v2r8(iy2,jy2);
+             dz22             = _fjsp_sub_v2r8(iz2,jz2);
+             dx23             = _fjsp_sub_v2r8(ix2,jx3);
+             dy23             = _fjsp_sub_v2r8(iy2,jy3);
+             dz23             = _fjsp_sub_v2r8(iz2,jz3);
+             dx31             = _fjsp_sub_v2r8(ix3,jx1);
+             dy31             = _fjsp_sub_v2r8(iy3,jy1);
+             dz31             = _fjsp_sub_v2r8(iz3,jz1);
+             dx32             = _fjsp_sub_v2r8(ix3,jx2);
+             dy32             = _fjsp_sub_v2r8(iy3,jy2);
+             dz32             = _fjsp_sub_v2r8(iz3,jz2);
+             dx33             = _fjsp_sub_v2r8(ix3,jx3);
+             dy33             = _fjsp_sub_v2r8(iy3,jy3);
+             dz33             = _fjsp_sub_v2r8(iz3,jz3);
+             /* Calculate squared distance and things based on it */
+             rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+             rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+             rsq13            = gmx_fjsp_calc_rsq_v2r8(dx13,dy13,dz13);
+             rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+             rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+             rsq23            = gmx_fjsp_calc_rsq_v2r8(dx23,dy23,dz23);
+             rsq31            = gmx_fjsp_calc_rsq_v2r8(dx31,dy31,dz31);
+             rsq32            = gmx_fjsp_calc_rsq_v2r8(dx32,dy32,dz32);
+             rsq33            = gmx_fjsp_calc_rsq_v2r8(dx33,dy33,dz33);
+             rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+             rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+             rinv13           = gmx_fjsp_invsqrt_v2r8(rsq13);
+             rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+             rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+             rinv23           = gmx_fjsp_invsqrt_v2r8(rsq23);
+             rinv31           = gmx_fjsp_invsqrt_v2r8(rsq31);
+             rinv32           = gmx_fjsp_invsqrt_v2r8(rsq32);
+             rinv33           = gmx_fjsp_invsqrt_v2r8(rsq33);
+             rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+             rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+             rinvsq13         = _fjsp_mul_v2r8(rinv13,rinv13);
+             rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+             rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+             rinvsq23         = _fjsp_mul_v2r8(rinv23,rinv23);
+             rinvsq31         = _fjsp_mul_v2r8(rinv31,rinv31);
+             rinvsq32         = _fjsp_mul_v2r8(rinv32,rinv32);
+             rinvsq33         = _fjsp_mul_v2r8(rinv33,rinv33);
+             fjx1             = _fjsp_setzero_v2r8();
+             fjy1             = _fjsp_setzero_v2r8();
+             fjz1             = _fjsp_setzero_v2r8();
+             fjx2             = _fjsp_setzero_v2r8();
+             fjy2             = _fjsp_setzero_v2r8();
+             fjz2             = _fjsp_setzero_v2r8();
+             fjx3             = _fjsp_setzero_v2r8();
+             fjy3             = _fjsp_setzero_v2r8();
+             fjz3             = _fjsp_setzero_v2r8();
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* COULOMB ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq11,rinv11);
+             felec            = _fjsp_mul_v2r8(velec,rinvsq11);
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+             
+             fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* COULOMB ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq12,rinv12);
+             felec            = _fjsp_mul_v2r8(velec,rinvsq12);
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+             
+             fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* COULOMB ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq13,rinv13);
+             felec            = _fjsp_mul_v2r8(velec,rinvsq13);
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx13,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy13,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz13,fscal,fiz1);
+             
+             fjx3             = _fjsp_madd_v2r8(dx13,fscal,fjx3);
+             fjy3             = _fjsp_madd_v2r8(dy13,fscal,fjy3);
+             fjz3             = _fjsp_madd_v2r8(dz13,fscal,fjz3);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* COULOMB ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq21,rinv21);
+             felec            = _fjsp_mul_v2r8(velec,rinvsq21);
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+             
+             fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* COULOMB ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq22,rinv22);
+             felec            = _fjsp_mul_v2r8(velec,rinvsq22);
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+             
+             fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* COULOMB ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq23,rinv23);
+             felec            = _fjsp_mul_v2r8(velec,rinvsq23);
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx23,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy23,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz23,fscal,fiz2);
+             
+             fjx3             = _fjsp_madd_v2r8(dx23,fscal,fjx3);
+             fjy3             = _fjsp_madd_v2r8(dy23,fscal,fjy3);
+             fjz3             = _fjsp_madd_v2r8(dz23,fscal,fjz3);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* COULOMB ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq31,rinv31);
+             felec            = _fjsp_mul_v2r8(velec,rinvsq31);
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix3             = _fjsp_madd_v2r8(dx31,fscal,fix3);
+             fiy3             = _fjsp_madd_v2r8(dy31,fscal,fiy3);
+             fiz3             = _fjsp_madd_v2r8(dz31,fscal,fiz3);
+             
+             fjx1             = _fjsp_madd_v2r8(dx31,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy31,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz31,fscal,fjz1);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* COULOMB ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq32,rinv32);
+             felec            = _fjsp_mul_v2r8(velec,rinvsq32);
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix3             = _fjsp_madd_v2r8(dx32,fscal,fix3);
+             fiy3             = _fjsp_madd_v2r8(dy32,fscal,fiy3);
+             fiz3             = _fjsp_madd_v2r8(dz32,fscal,fiz3);
+             
+             fjx2             = _fjsp_madd_v2r8(dx32,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy32,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz32,fscal,fjz2);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* COULOMB ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq33,rinv33);
+             felec            = _fjsp_mul_v2r8(velec,rinvsq33);
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix3             = _fjsp_madd_v2r8(dx33,fscal,fix3);
+             fiy3             = _fjsp_madd_v2r8(dy33,fscal,fiy3);
+             fiz3             = _fjsp_madd_v2r8(dz33,fscal,fiz3);
+             
+             fjx3             = _fjsp_madd_v2r8(dx33,fscal,fjx3);
+             fjy3             = _fjsp_madd_v2r8(dy33,fscal,fjy3);
+             fjz3             = _fjsp_madd_v2r8(dz33,fscal,fjz3);
+             gmx_fjsp_decrement_3rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA+DIM,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+             /* Inner loop uses 270 flops */
+         }
+         /* End of innermost loop */
+         gmx_fjsp_update_iforce_3atom_swizzle_v2r8(fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                               f+i_coord_offset+DIM,fshift+i_shift_offset);
+         /* Increment number of inner iterations */
+         inneriter                  += j_index_end - j_index_start;
+         /* Outer loop uses 18 flops */
+     }
+     /* Increment number of outer iterations */
+     outeriter        += nri;
+     /* Update outer/inner flops */
+     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4W4_F,outeriter*18 + inneriter*270);
+ }
index 0000000000000000000000000000000000000000,c6d3ca7a2b035780b601092c5ba65cfe696436e1..c6d3ca7a2b035780b601092c5ba65cfe696436e1
mode 000000,100644..100644
--- /dev/null
@@@ -1,0 -1,672 +1,672 @@@
+ /*
+  * This file is part of the GROMACS molecular simulation package.
+  *
+  * Copyright (c) 2012, by the GROMACS development team, led by
+  * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+  * others, as listed in the AUTHORS file in the top-level source
+  * directory and at http://www.gromacs.org.
+  *
+  * GROMACS is free software; you can redistribute it and/or
+  * modify it under the terms of the GNU Lesser General Public License
+  * as published by the Free Software Foundation; either version 2.1
+  * of the License, or (at your option) any later version.
+  *
+  * GROMACS is distributed in the hope that it will be useful,
+  * but WITHOUT ANY WARRANTY; without even the implied warranty of
+  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  * Lesser General Public License for more details.
+  *
+  * You should have received a copy of the GNU Lesser General Public
+  * License along with GROMACS; if not, see
+  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+  *
+  * If you want to redistribute modifications to GROMACS, please
+  * consider that scientific software is very special. Version
+  * control is crucial - bugs must be traceable. We will be happy to
+  * consider code for inclusion in the official distribution, but
+  * derived work must not be called official GROMACS. Details are found
+  * in the README & COPYING files - if they are missing, get the
+  * official version at http://www.gromacs.org.
+  *
+  * To help us fund GROMACS development, we humbly ask that you cite
+  * the research papers on the package. Check out http://www.gromacs.org.
+  */
+ /*
+  * Note: this file was generated by the GROMACS sparc64_hpc_ace_double kernel generator.
+  */
+ #ifdef HAVE_CONFIG_H
+ #include <config.h>
+ #endif
+ #include <math.h>
+ #include "../nb_kernel.h"
+ #include "types/simple.h"
+ #include "vec.h"
+ #include "nrnb.h"
+ #include "kernelutil_sparc64_hpc_ace_double.h"
+ /*
+  * Gromacs nonbonded kernel:   nb_kernel_ElecEwSh_VdwLJSh_GeomP1P1_VF_sparc64_hpc_ace_double
+  * Electrostatics interaction: Ewald
+  * VdW interaction:            LennardJones
+  * Geometry:                   Particle-Particle
+  * Calculate force/pot:        PotentialAndForce
+  *
+  * Purpose: for every i entry of the neighbor list, walk its j neighbors two at a time
+  * (one per lane of a _fjsp_v2r8), with a separate single-j step when the number of
+  * neighbors is odd, and evaluate tabulated Ewald electrostatics plus Lennard-Jones with
+  * the shift terms fr->ic->sh_ewald and fr->ic->sh_invrc6 applied to the potentials.
+  * Lanes whose squared distance is not below fr->rcoulomb squared are masked to zero.
+  *
+  * Arguments as used by this kernel:
+  *   nlist        neighbor list; nri, iinr, jindex, jjnr, shift and gid are read
+  *   xx           particle coordinates, accessed as a flat real array through xx[0]
+  *   ff           particle forces, accessed as a flat real array through ff[0] and handed
+  *                to the gmx_fjsp_decrement_* / gmx_fjsp_update_iforce_* helpers
+  *   fr           shift_vec, fshift, epsfac, ntype, nbfp, rcoulomb, rvdw and the ic table
+  *                and shift data are read; fshift is handed to the i-force update helper
+  *   mdatoms      chargeA and typeA are read
+  *   kernel_data  energygrp_elec and energygrp_vdw receive the summed potentials
+  *   nrnb         passed to inc_nrnb() together with the estimated operation count
+  */
+ void
+ nb_kernel_ElecEwSh_VdwLJSh_GeomP1P1_VF_sparc64_hpc_ace_double
+                     (t_nblist * gmx_restrict                nlist,
+                      rvec * gmx_restrict                    xx,
+                      rvec * gmx_restrict                    ff,
+                      t_forcerec * gmx_restrict              fr,
+                      t_mdatoms * gmx_restrict               mdatoms,
+                      nb_kernel_data_t * gmx_restrict        kernel_data,
+                      t_nrnb * gmx_restrict                  nrnb)
+ {
+     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+      * just 0 for non-waters.
+      * Suffixes A,B refer to j loop unrolling done with double precision SIMD, i.e. the two different
+      * jnr indices corresponding to data put in the two positions of the SIMD register.
+      *
+      * Note: a number of the locals declared below (for example tx, ty, tz, jidxall, isai0, isaj0,
+      * fjx0, fjy0, fjz0, crf, krf, krf2, fvdw6, fvdw12, dummy_mask, one, two, vfconv and gbconv)
+      * are never referenced in this kernel, and rvdw is assigned but not read.
+      */
+     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+     int              jnrA,jnrB;
+     int              j_coord_offsetA,j_coord_offsetB;
+     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+     real             rcutoff_scalar;
+     real             *shiftvec,*fshift,*x,*f;
+     _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+     int              vdwioffset0;
+     _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+     int              vdwjidx0A,vdwjidx0B;
+     _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+     _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+     _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+     real             *charge;
+     int              nvdwtype;
+     _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+     int              *vdwtype;
+     real             *vdwparam;
+     _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+     _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+     _fjsp_v2r8       ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+     real             *ewtab;
+     _fjsp_v2r8       itab_tmp;
+     _fjsp_v2r8       dummy_mask,cutoff_mask;
+     _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+     _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+     union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv; /* ewconv: the table index is stored as SIMD and read back as two scalar integers */
+     x                = xx[0]; /* flat view of the coordinates; particle n starts at x[DIM*n] */
+     f                = ff[0]; /* flat view of the forces, indexed with the same offsets as x */
+     nri              = nlist->nri;
+     iinr             = nlist->iinr;
+     jindex           = nlist->jindex;
+     jjnr             = nlist->jjnr;
+     shiftidx         = nlist->shift;
+     gid              = nlist->gid;
+     shiftvec         = fr->shift_vec[0];
+     fshift           = fr->fshift[0];
+     facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+     charge           = mdatoms->chargeA;
+     nvdwtype         = fr->ntype;
+     vdwparam         = fr->nbfp;
+     vdwtype          = mdatoms->typeA;
+     sh_ewald         = gmx_fjsp_set1_v2r8(fr->ic->sh_ewald);
+     ewtab            = fr->ic->tabq_coul_FDV0; /* indexed below with a stride of 4 reals per table point */
+     ewtabscale       = gmx_fjsp_set1_v2r8(fr->ic->tabq_scale);
+     ewtabhalfspace   = gmx_fjsp_set1_v2r8(0.5/fr->ic->tabq_scale);
+     /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice.
+      * Only the squared cutoff (rcutoff2) is compared against rsq in the loops below.
+      */
+     rcutoff_scalar   = fr->rcoulomb;
+     rcutoff          = gmx_fjsp_set1_v2r8(rcutoff_scalar);
+     rcutoff2         = _fjsp_mul_v2r8(rcutoff,rcutoff);
+     sh_vdw_invrcut6  = gmx_fjsp_set1_v2r8(fr->ic->sh_invrc6);
+     rvdw             = gmx_fjsp_set1_v2r8(fr->rvdw);
+     /* Give the j indices/offsets a defined value before the loops to avoid compiler warnings.
+      * The two iteration counters must start at zero because they are accumulated below.
+      */
+     jnrA = jnrB = 0;
+     j_coord_offsetA = 0;
+     j_coord_offsetB = 0;
+     outeriter        = 0;
+     inneriter        = 0;
+     /* Start outer loop over neighborlists: one iteration per i entry, nri entries in total */
+     for(iidx=0; iidx<nri; iidx++)
+     {
+         /* Load shift vector for this list (offset used for both shiftvec and fshift) */
+         i_shift_offset   = DIM*shiftidx[iidx];
+         /* Load limits for loop over neighbors: entries jjnr[j_index_start] .. jjnr[j_index_end-1] */
+         j_index_start    = jindex[iidx];
+         j_index_end      = jindex[iidx+1];
+         /* Get outer coordinate index */
+         inr              = iinr[iidx];
+         i_coord_offset   = DIM*inr;
+         /* Load i particle coords and add shift vector */
+         gmx_fjsp_load_shift_and_1rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
+         fix0             = _fjsp_setzero_v2r8();
+         fiy0             = _fjsp_setzero_v2r8();
+         fiz0             = _fjsp_setzero_v2r8();
+         /* Load parameters for i particles: the charge is pre-multiplied by facel (fr->epsfac),
+          * and vdwioffset0 is the offset of this particle type's row in vdwparam (two reals per type pair).
+          */
+         iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_load1_v2r8(charge+inr+0));
+         vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+         /* Reset potential sums */
+         velecsum         = _fjsp_setzero_v2r8();
+         vvdwsum          = _fjsp_setzero_v2r8();
+         /* Start inner kernel loop: two j particles (A and B) per iteration; a possible
+          * last unpaired j particle is handled in the separate block after this loop.
+          */
+         for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+         {
+             /* Get j neighbor index, and coordinate index */
+             jnrA             = jjnr[jidx];
+             jnrB             = jjnr[jidx+1];
+             j_coord_offsetA  = DIM*jnrA;
+             j_coord_offsetB  = DIM*jnrB;
+             /* load j atom coordinates */
+             gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                               &jx0,&jy0,&jz0);
+             /* Calculate displacement vector */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             /* Calculate squared distance and things based on it */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+             rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+             /* Load parameters for j particles */
+             jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+             vdwjidx0A        = 2*vdwtype[jnrA+0];
+             vdwjidx0B        = 2*vdwtype[jnrB+0];
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2)) /* presumably true when at least one lane is inside the cutoff; otherwise the pair is skipped */
+             {
+             r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+             /* Compute parameters for interactions between i and j atoms */
+             qq00             = _fjsp_mul_v2r8(iq0,jq0);
+             gmx_fjsp_load_2pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,
+                                          vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer.
+              * eweps is the remaining fractional part. For each lane the two reals at table
+              * offset 4*index are loaded into ewtabF/ewtabD and the real at 4*index+2 into
+              * ewtabV/ewtabFn; GMX_FJSP_TRANSPOSE2_V2R8 then presumably regroups them per quantity.
+              */
+             ewrt             = _fjsp_mul_v2r8(r00,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq00,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv00,sh_ewald),velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,rinv00),_fjsp_sub_v2r8(rinvsq00,felec));
+             /* LENNARD-JONES DISPERSION/REPULSION
+              * rinvsix is rinvsq00 cubed; vvdw6 and vvdw12 are c6 and c12 scaled by rinvsix and
+              * rinvsix squared. The potential vvdw additionally uses sh_vdw_invrcut6 (shift terms).
+              */
+             rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+             vvdw6            = _fjsp_mul_v2r8(c6_00,rinvsix);
+             vvdw12           = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
+             vvdw             = _fjsp_msub_v2r8(_fjsp_nmsub_v2r8(c12_00,_fjsp_mul_v2r8(sh_vdw_invrcut6,sh_vdw_invrcut6),vvdw12),one_twelfth,
+                                            _fjsp_mul_v2r8(_fjsp_nmsub_v2r8( c6_00,sh_vdw_invrcut6,vvdw6),one_sixth));
+             fvdw             = _fjsp_mul_v2r8(_fjsp_sub_v2r8(vvdw12,vvdw6),rinvsq00);
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom.
+              * Potentials and the scalar force are AND-ed with cutoff_mask first, so a lane
+              * that is not inside the cutoff contributes nothing.
+              */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             vvdw             = _fjsp_and_v2r8(vvdw,cutoff_mask);
+             vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+             fscal            = _fjsp_add_v2r8(felec,fvdw);
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force: accumulate on the i particle, then pass fscal and the
+              * displacement to the helper that updates the two j particles' entries in f.
+              */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             
+             gmx_fjsp_decrement_fma_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fscal,dx00,dy00,dz00);
+             }
+             /* Inner loop uses 67 flops */
+         }
+         if(jidx<j_index_end) /* odd number of neighbors: one j particle is left over */
+         {
+             jnrA             = jjnr[jidx];
+             j_coord_offsetA  = DIM*jnrA;
+             /* load j atom coordinates */
+             gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                               &jx0,&jy0,&jz0);
+             /* Calculate displacement vector */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             /* Calculate squared distance and things based on it */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+             rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+             /* Load parameters for j particles (only jnrA; the charge is loaded into a zeroed register) */
+             jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
+             vdwjidx0A        = 2*vdwtype[jnrA+0];
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+             {
+             r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+             /* Compute parameters for interactions between i and j atoms */
+             qq00             = _fjsp_mul_v2r8(iq0,jq0);
+             gmx_fjsp_load_1pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer.
+              * Same table access as in the paired loop, but only index ewconv.i[0] is used;
+              * the registers that held the second lane's table data are set to zero instead.
+              */
+             ewrt             = _fjsp_mul_v2r8(r00,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq00,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv00,sh_ewald),velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,rinv00),_fjsp_sub_v2r8(rinvsq00,felec));
+             /* LENNARD-JONES DISPERSION/REPULSION (same expressions as in the paired loop) */
+             rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+             vvdw6            = _fjsp_mul_v2r8(c6_00,rinvsix);
+             vvdw12           = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
+             vvdw             = _fjsp_msub_v2r8(_fjsp_nmsub_v2r8(c12_00,_fjsp_mul_v2r8(sh_vdw_invrcut6,sh_vdw_invrcut6),vvdw12),one_twelfth,
+                                            _fjsp_mul_v2r8(_fjsp_nmsub_v2r8( c6_00,sh_vdw_invrcut6,vvdw6),one_sixth));
+             fvdw             = _fjsp_mul_v2r8(_fjsp_sub_v2r8(vvdw12,vvdw6),rinvsq00);
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom.
+              * In addition to the cutoff mask, each result is combined with a zero register via
+              * _fjsp_unpacklo_v2r8, presumably so that the unused second lane contributes nothing.
+              */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             vvdw             = _fjsp_and_v2r8(vvdw,cutoff_mask);
+             vvdw             = _fjsp_unpacklo_v2r8(vvdw,_fjsp_setzero_v2r8());
+             vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+             fscal            = _fjsp_add_v2r8(felec,fvdw);
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             
+             gmx_fjsp_decrement_fma_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fscal,dx00,dy00,dz00);
+             }
+             /* Inner loop uses 67 flops */
+         }
+         /* End of innermost loop: hand the accumulated i force to the helper together with
+          * this particle's slot in f and this list's slot in fshift.
+          */
+         gmx_fjsp_update_iforce_1atom_swizzle_v2r8(fix0,fiy0,fiz0,
+                                               f+i_coord_offset,fshift+i_shift_offset);
+         ggid                        = gid[iidx];
+         /* Update potential energies: the sums go to entry gid[iidx] of the two energy arrays */
+         gmx_fjsp_update_1pot_v2r8(velecsum,kernel_data->energygrp_elec+ggid);
+         gmx_fjsp_update_1pot_v2r8(vvdwsum,kernel_data->energygrp_vdw+ggid);
+         /* Increment number of inner iterations (counted per j particle, not per SIMD step) */
+         inneriter                  += j_index_end - j_index_start;
+         /* Outer loop uses 9 flops */
+     }
+     /* Increment number of outer iterations */
+     outeriter        += nri;
+     /* Update outer/inner flops using the per-iteration estimates quoted in the loop comments */
+     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_VF,outeriter*9 + inneriter*67);
+ }
+ /*
+  * Gromacs nonbonded kernel:   nb_kernel_ElecEwSh_VdwLJSh_GeomP1P1_F_sparc64_hpc_ace_double
+  * Electrostatics interaction: Ewald
+  * VdW interaction:            LennardJones
+  * Geometry:                   Particle-Particle
+  * Calculate force/pot:        Force
+  *
+  * Purpose: force-only counterpart of the _VF kernel. For every i entry of the neighbor
+  * list, walk its j neighbors two at a time (one per lane of a _fjsp_v2r8), with a
+  * separate single-j step when the number of neighbors is odd, and evaluate the scalar
+  * force from tabulated Ewald electrostatics (table fr->ic->tabq_coul_F) plus Lennard-Jones.
+  * Lanes whose squared distance is not below fr->rcoulomb squared are masked to zero.
+  * No potentials are accumulated.
+  *
+  * Arguments as used by this kernel:
+  *   nlist        neighbor list; nri, iinr, jindex, jjnr, shift and gid are read
+  *   xx           particle coordinates, accessed as a flat real array through xx[0]
+  *   ff           particle forces, accessed as a flat real array through ff[0] and handed
+  *                to the gmx_fjsp_decrement_* / gmx_fjsp_update_iforce_* helpers
+  *   fr           shift_vec, fshift, epsfac, ntype, nbfp, rcoulomb, rvdw and the ic table
+  *                and shift data are read; fshift is handed to the i-force update helper
+  *   mdatoms      chargeA and typeA are read
+  *   kernel_data  not referenced in this kernel
+  *   nrnb         passed to inc_nrnb() together with the estimated operation count
+  */
+ void
+ nb_kernel_ElecEwSh_VdwLJSh_GeomP1P1_F_sparc64_hpc_ace_double
+                     (t_nblist * gmx_restrict                nlist,
+                      rvec * gmx_restrict                    xx,
+                      rvec * gmx_restrict                    ff,
+                      t_forcerec * gmx_restrict              fr,
+                      t_mdatoms * gmx_restrict               mdatoms,
+                      nb_kernel_data_t * gmx_restrict        kernel_data,
+                      t_nrnb * gmx_restrict                  nrnb)
+ {
+     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+      * just 0 for non-waters.
+      * Suffixes A,B refer to j loop unrolling done with double precision SIMD, i.e. the two different
+      * jnr indices corresponding to data put in the two positions of the SIMD register.
+      *
+      * Note: a number of the locals declared below (for example tx, ty, tz, jidxall, ggid, velec,
+      * velecsum, vvdw, vvdwsum, ewtabD, ewtabV, crf, krf, krf2, dummy_mask, one, two, one_sixth,
+      * one_twelfth, vfconv and gbconv) are never referenced in this kernel, and sh_ewald,
+      * ewtabhalfspace, sh_vdw_invrcut6, rvdw and gid are assigned but not read.
+      */
+     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+     int              jnrA,jnrB;
+     int              j_coord_offsetA,j_coord_offsetB;
+     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+     real             rcutoff_scalar;
+     real             *shiftvec,*fshift,*x,*f;
+     _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+     int              vdwioffset0;
+     _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+     int              vdwjidx0A,vdwjidx0B;
+     _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+     _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+     _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+     real             *charge;
+     int              nvdwtype;
+     _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+     int              *vdwtype;
+     real             *vdwparam;
+     _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+     _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+     _fjsp_v2r8       ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+     real             *ewtab;
+     _fjsp_v2r8       itab_tmp;
+     _fjsp_v2r8       dummy_mask,cutoff_mask;
+     _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+     _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+     union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv; /* ewconv: the table index is stored as SIMD and read back as two scalar integers */
+     x                = xx[0]; /* flat view of the coordinates; particle n starts at x[DIM*n] */
+     f                = ff[0]; /* flat view of the forces, indexed with the same offsets as x */
+     nri              = nlist->nri;
+     iinr             = nlist->iinr;
+     jindex           = nlist->jindex;
+     jjnr             = nlist->jjnr;
+     shiftidx         = nlist->shift;
+     gid              = nlist->gid;
+     shiftvec         = fr->shift_vec[0];
+     fshift           = fr->fshift[0];
+     facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+     charge           = mdatoms->chargeA;
+     nvdwtype         = fr->ntype;
+     vdwparam         = fr->nbfp;
+     vdwtype          = mdatoms->typeA;
+     sh_ewald         = gmx_fjsp_set1_v2r8(fr->ic->sh_ewald);
+     ewtab            = fr->ic->tabq_coul_F; /* indexed below directly with the table index (stride of one real) */
+     ewtabscale       = gmx_fjsp_set1_v2r8(fr->ic->tabq_scale);
+     ewtabhalfspace   = gmx_fjsp_set1_v2r8(0.5/fr->ic->tabq_scale);
+     /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice.
+      * Only the squared cutoff (rcutoff2) is compared against rsq in the loops below.
+      */
+     rcutoff_scalar   = fr->rcoulomb;
+     rcutoff          = gmx_fjsp_set1_v2r8(rcutoff_scalar);
+     rcutoff2         = _fjsp_mul_v2r8(rcutoff,rcutoff);
+     sh_vdw_invrcut6  = gmx_fjsp_set1_v2r8(fr->ic->sh_invrc6);
+     rvdw             = gmx_fjsp_set1_v2r8(fr->rvdw);
+     /* Give the j indices/offsets a defined value before the loops to avoid compiler warnings.
+      * The two iteration counters must start at zero because they are accumulated below.
+      */
+     jnrA = jnrB = 0;
+     j_coord_offsetA = 0;
+     j_coord_offsetB = 0;
+     outeriter        = 0;
+     inneriter        = 0;
+     /* Start outer loop over neighborlists: one iteration per i entry, nri entries in total */
+     for(iidx=0; iidx<nri; iidx++)
+     {
+         /* Load shift vector for this list (offset used for both shiftvec and fshift) */
+         i_shift_offset   = DIM*shiftidx[iidx];
+         /* Load limits for loop over neighbors: entries jjnr[j_index_start] .. jjnr[j_index_end-1] */
+         j_index_start    = jindex[iidx];
+         j_index_end      = jindex[iidx+1];
+         /* Get outer coordinate index */
+         inr              = iinr[iidx];
+         i_coord_offset   = DIM*inr;
+         /* Load i particle coords and add shift vector */
+         gmx_fjsp_load_shift_and_1rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
+         fix0             = _fjsp_setzero_v2r8();
+         fiy0             = _fjsp_setzero_v2r8();
+         fiz0             = _fjsp_setzero_v2r8();
+         /* Load parameters for i particles: the charge is pre-multiplied by facel (fr->epsfac),
+          * and vdwioffset0 is the offset of this particle type's row in vdwparam (two reals per type pair).
+          */
+         iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_load1_v2r8(charge+inr+0));
+         vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+         /* Start inner kernel loop: two j particles (A and B) per iteration; a possible
+          * last unpaired j particle is handled in the separate block after this loop.
+          */
+         for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+         {
+             /* Get j neighbor index, and coordinate index */
+             jnrA             = jjnr[jidx];
+             jnrB             = jjnr[jidx+1];
+             j_coord_offsetA  = DIM*jnrA;
+             j_coord_offsetB  = DIM*jnrB;
+             /* load j atom coordinates */
+             gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                               &jx0,&jy0,&jz0);
+             /* Calculate displacement vector */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             /* Calculate squared distance and things based on it */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+             rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+             /* Load parameters for j particles */
+             jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+             vdwjidx0A        = 2*vdwtype[jnrA+0];
+             vdwjidx0B        = 2*vdwtype[jnrB+0];
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2)) /* presumably true when at least one lane is inside the cutoff; otherwise the pair is skipped */
+             {
+             r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+             /* Compute parameters for interactions between i and j atoms */
+             qq00             = _fjsp_mul_v2r8(iq0,jq0);
+             gmx_fjsp_load_2pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,
+                                          vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer.
+              * eweps is the remaining fractional part. For each lane a pair of reals starting at
+              * ewtab+index is loaded into ewtabF/ewtabFn; felec is then formed from both using
+              * eweps, which looks like an interpolation between adjacent table entries.
+              */
+             ewrt             = _fjsp_mul_v2r8(r00,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                          &ewtabF,&ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,rinv00),_fjsp_sub_v2r8(rinvsq00,felec));
+             /* LENNARD-JONES DISPERSION/REPULSION
+              * rinvsix is rinvsq00 cubed; only the scalar force fvdw is computed here.
+              */
+             rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+             fvdw             = _fjsp_mul_v2r8(_fjsp_msub_v2r8(c12_00,rinvsix,c6_00),_fjsp_mul_v2r8(rinvsix,rinvsq00));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+             fscal            = _fjsp_add_v2r8(felec,fvdw);
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force: accumulate on the i particle, then pass fscal and the
+              * displacement to the helper that updates the two j particles' entries in f.
+              */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             
+             gmx_fjsp_decrement_fma_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fscal,dx00,dy00,dz00);
+             }
+             /* Inner loop uses 49 flops */
+         }
+         if(jidx<j_index_end) /* odd number of neighbors: one j particle is left over */
+         {
+             jnrA             = jjnr[jidx];
+             j_coord_offsetA  = DIM*jnrA;
+             /* load j atom coordinates */
+             gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                               &jx0,&jy0,&jz0);
+             /* Calculate displacement vector */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             /* Calculate squared distance and things based on it */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+             rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+             /* Load parameters for j particles (only jnrA; the charge is loaded into a zeroed register) */
+             jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
+             vdwjidx0A        = 2*vdwtype[jnrA+0];
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+             {
+             r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+             /* Compute parameters for interactions between i and j atoms */
+             qq00             = _fjsp_mul_v2r8(iq0,jq0);
+             gmx_fjsp_load_1pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer.
+              * Same table access as in the paired loop, but only index ewconv.i[0] is used.
+              */
+             ewrt             = _fjsp_mul_v2r8(r00,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,rinv00),_fjsp_sub_v2r8(rinvsq00,felec));
+             /* LENNARD-JONES DISPERSION/REPULSION (same expressions as in the paired loop).
+              * After the cutoff mask, fscal is additionally combined with a zero register via
+              * _fjsp_unpacklo_v2r8, presumably so that the unused second lane contributes nothing.
+              */
+             rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+             fvdw             = _fjsp_mul_v2r8(_fjsp_msub_v2r8(c12_00,rinvsix,c6_00),_fjsp_mul_v2r8(rinvsix,rinvsq00));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+             fscal            = _fjsp_add_v2r8(felec,fvdw);
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             
+             gmx_fjsp_decrement_fma_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fscal,dx00,dy00,dz00);
+             }
+             /* Inner loop uses 49 flops */
+         }
+         /* End of innermost loop: hand the accumulated i force to the helper together with
+          * this particle's slot in f and this list's slot in fshift.
+          */
+         gmx_fjsp_update_iforce_1atom_swizzle_v2r8(fix0,fiy0,fiz0,
+                                               f+i_coord_offset,fshift+i_shift_offset);
+         /* Increment number of inner iterations (counted per j particle, not per SIMD step) */
+         inneriter                  += j_index_end - j_index_start;
+         /* Outer loop uses 7 flops */
+     }
+     /* Increment number of outer iterations */
+     outeriter        += nri;
+     /* Update outer/inner flops using the per-iteration estimates quoted in the loop comments */
+     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_F,outeriter*7 + inneriter*49);
+ }
index 0000000000000000000000000000000000000000,e9d6af0fa2b647860496bfce62648f6bd29dd8f1..e9d6af0fa2b647860496bfce62648f6bd29dd8f1
mode 000000,100644..100644
--- /dev/null
@@@ -1,0 -1,1168 +1,1168 @@@
+ /*
+  * This file is part of the GROMACS molecular simulation package.
+  *
+  * Copyright (c) 2012, by the GROMACS development team, led by
+  * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+  * others, as listed in the AUTHORS file in the top-level source
+  * directory and at http://www.gromacs.org.
+  *
+  * GROMACS is free software; you can redistribute it and/or
+  * modify it under the terms of the GNU Lesser General Public License
+  * as published by the Free Software Foundation; either version 2.1
+  * of the License, or (at your option) any later version.
+  *
+  * GROMACS is distributed in the hope that it will be useful,
+  * but WITHOUT ANY WARRANTY; without even the implied warranty of
+  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  * Lesser General Public License for more details.
+  *
+  * You should have received a copy of the GNU Lesser General Public
+  * License along with GROMACS; if not, see
+  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+  *
+  * If you want to redistribute modifications to GROMACS, please
+  * consider that scientific software is very special. Version
+  * control is crucial - bugs must be traceable. We will be happy to
+  * consider code for inclusion in the official distribution, but
+  * derived work must not be called official GROMACS. Details are found
+  * in the README & COPYING files - if they are missing, get the
+  * official version at http://www.gromacs.org.
+  *
+  * To help us fund GROMACS development, we humbly ask that you cite
+  * the research papers on the package. Check out http://www.gromacs.org.
+  */
+ /*
+  * Note: this file was generated by the GROMACS sparc64_hpc_ace_double kernel generator.
+  */
+ #ifdef HAVE_CONFIG_H
+ #include <config.h>
+ #endif
+ #include <math.h>
+ #include "../nb_kernel.h"
+ #include "types/simple.h"
+ #include "vec.h"
+ #include "nrnb.h"
+ #include "kernelutil_sparc64_hpc_ace_double.h"
+ /*
+  * Gromacs nonbonded kernel:   nb_kernel_ElecEwSh_VdwLJSh_GeomW3P1_VF_sparc64_hpc_ace_double
+  * Electrostatics interaction: Ewald
+  * VdW interaction:            LennardJones
+  * Geometry:                   Water3-Particle
+  * Calculate force/pot:        PotentialAndForce
+  *
+  * Overview:
+  *   For every i entry of the neighbor list, three i atoms (charges read at
+  *   inr+0..inr+2, suffixes 0..2) interact with the j atoms listed in
+  *   jjnr[jindex[iidx] .. jindex[iidx+1]-1]. The j atoms are processed two at
+  *   a time (suffixes A,B); a single leftover j atom is handled by a separate
+  *   tail block after the inner loop.
+  *   Only i atom 0 gets a Lennard-Jones term; all three i atoms get the
+  *   tabulated Ewald term. Both potentials carry a shift (sh_ewald,
+  *   sh_vdw_invrcut6), and every pair term is masked with rsq < rcutoff2
+  *   before it is accumulated. Each i-j block is additionally guarded by
+  *   gmx_fjsp_any_lt_v2r8(rsq,rcutoff2) (presumably: at least one lane is
+  *   inside the cutoff - TODO confirm in the kernelutil header).
+  *
+  * Parameters:
+  *   nlist        neighbor list; nri, iinr, jindex, jjnr, shift and gid are read.
+  *   xx           coordinates, read through xx[0] as a flat array of reals.
+  *   ff           forces, updated through ff[0] as a flat array of reals.
+  *   fr           shift_vec, fshift, epsfac, ntype, nbfp, rcoulomb, rvdw and the
+  *                ic->sh_ewald, ic->tabq_coul_FDV0, ic->tabq_scale, ic->sh_invrc6
+  *                members are read; fshift is handed to the i-force update helper.
+  *   mdatoms      chargeA and typeA are read.
+  *   kernel_data  energygrp_elec and energygrp_vdw are updated at index gid[iidx].
+  *   nrnb         receives the flop count through inc_nrnb().
+  */
+ void
+ nb_kernel_ElecEwSh_VdwLJSh_GeomW3P1_VF_sparc64_hpc_ace_double
+                     (t_nblist * gmx_restrict                nlist,
+                      rvec * gmx_restrict                    xx,
+                      rvec * gmx_restrict                    ff,
+                      t_forcerec * gmx_restrict              fr,
+                      t_mdatoms * gmx_restrict               mdatoms,
+                      nb_kernel_data_t * gmx_restrict        kernel_data,
+                      t_nrnb * gmx_restrict                  nrnb)
+ {
+     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+      * just 0 for non-waters.
+      * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+      * jnr indices corresponding to data put in the two positions in the SIMD register.
+      *
+      * NOTE(review): several locals declared below (tx/ty/tz, jidxall, vdwioffset1/2, isai*, isaj0,
+      * r-independent c6_10/c12_10/c6_20/c12_20, crf/krf/krf2, fvdw6/fvdw12, dummy_mask, one, two,
+      * vfconv, gbconv) do not appear to be referenced in this function body, unless one of the
+      * helper macros refers to them.
+      */
+     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+     int              jnrA,jnrB;
+     int              j_coord_offsetA,j_coord_offsetB;
+     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+     real             rcutoff_scalar;
+     real             *shiftvec,*fshift,*x,*f;
+     _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+     int              vdwioffset0;
+     _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+     int              vdwioffset1;
+     _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+     int              vdwioffset2;
+     _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+     int              vdwjidx0A,vdwjidx0B;
+     _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+     _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+     _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+     _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+     _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+     real             *charge;
+     int              nvdwtype;
+     _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+     int              *vdwtype;
+     real             *vdwparam;
+     _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+     _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+     _fjsp_v2r8       ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+     real             *ewtab;
+     _fjsp_v2r8       itab_tmp;
+     _fjsp_v2r8       dummy_mask,cutoff_mask;
+     _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+     _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+     union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+     x                = xx[0];
+     f                = ff[0];
+     nri              = nlist->nri;
+     iinr             = nlist->iinr;
+     jindex           = nlist->jindex;
+     jjnr             = nlist->jjnr;
+     shiftidx         = nlist->shift;
+     gid              = nlist->gid;
+     shiftvec         = fr->shift_vec[0];
+     fshift           = fr->fshift[0];
+     facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+     charge           = mdatoms->chargeA;
+     nvdwtype         = fr->ntype;
+     vdwparam         = fr->nbfp;
+     vdwtype          = mdatoms->typeA;
+     sh_ewald         = gmx_fjsp_set1_v2r8(fr->ic->sh_ewald);
+     ewtab            = fr->ic->tabq_coul_FDV0;
+     ewtabscale       = gmx_fjsp_set1_v2r8(fr->ic->tabq_scale);
+     ewtabhalfspace   = gmx_fjsp_set1_v2r8(0.5/fr->ic->tabq_scale);
+     /* Setup water-specific parameters once, from the first i entry of the list.
+      * This assumes every i entry in the list has the same three charges and the same
+      * VdW type for atom 0 - TODO confirm against the neighbor list construction.
+      * NOTE(review): nlist->iinr[0] is read here even when nri == 0.
+      */
+     inr              = nlist->iinr[0];
+     iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+0]));
+     iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+     iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+     vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+     /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+     rcutoff_scalar   = fr->rcoulomb;
+     rcutoff          = gmx_fjsp_set1_v2r8(rcutoff_scalar);
+     rcutoff2         = _fjsp_mul_v2r8(rcutoff,rcutoff);
+     sh_vdw_invrcut6  = gmx_fjsp_set1_v2r8(fr->ic->sh_invrc6);
+     rvdw             = gmx_fjsp_set1_v2r8(fr->rvdw); /* set here but not read again in this function */
+     /* Give the j indices/offsets a defined value up front to avoid compiler warnings */
+     jnrA = jnrB = 0;
+     j_coord_offsetA = 0;
+     j_coord_offsetB = 0;
+     outeriter        = 0;
+     inneriter        = 0;
+     /* Start outer loop over the nri i entries of the neighborlist */
+     for(iidx=0; iidx<nri; iidx++)
+     {
+         /* Offset of this entry's shift vector inside shiftvec/fshift (DIM reals per shift index) */
+         i_shift_offset   = DIM*shiftidx[iidx];
+         /* Load limits for loop over neighbors */
+         j_index_start    = jindex[iidx];
+         j_index_end      = jindex[iidx+1];
+         /* Get outer coordinate index */
+         inr              = iinr[iidx];
+         i_coord_offset   = DIM*inr;
+         /* Load i particle coords and add shift vector */
+         gmx_fjsp_load_shift_and_3rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                  &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+         fix0             = _fjsp_setzero_v2r8();
+         fiy0             = _fjsp_setzero_v2r8();
+         fiz0             = _fjsp_setzero_v2r8();
+         fix1             = _fjsp_setzero_v2r8();
+         fiy1             = _fjsp_setzero_v2r8();
+         fiz1             = _fjsp_setzero_v2r8();
+         fix2             = _fjsp_setzero_v2r8();
+         fiy2             = _fjsp_setzero_v2r8();
+         fiz2             = _fjsp_setzero_v2r8();
+         /* Reset potential sums */
+         velecsum         = _fjsp_setzero_v2r8();
+         vvdwsum          = _fjsp_setzero_v2r8();
+         /* Start inner kernel loop: two j atoms (A,B) per pass; an odd leftover j atom is handled after the loop */
+         for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+         {
+             /* Get j neighbor index, and coordinate index */
+             jnrA             = jjnr[jidx];
+             jnrB             = jjnr[jidx+1];
+             j_coord_offsetA  = DIM*jnrA;
+             j_coord_offsetB  = DIM*jnrB;
+             /* load j atom coordinates */
+             gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                               &jx0,&jy0,&jz0);
+             /* Calculate displacement vector */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             dx10             = _fjsp_sub_v2r8(ix1,jx0);
+             dy10             = _fjsp_sub_v2r8(iy1,jy0);
+             dz10             = _fjsp_sub_v2r8(iz1,jz0);
+             dx20             = _fjsp_sub_v2r8(ix2,jx0);
+             dy20             = _fjsp_sub_v2r8(iy2,jy0);
+             dz20             = _fjsp_sub_v2r8(iz2,jz0);
+             /* Calculate squared distance and things based on it */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+             rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+             rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+             rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+             rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+             rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+             rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+             rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+             /* Load parameters for j particles */
+             jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+             vdwjidx0A        = 2*vdwtype[jnrA+0];
+             vdwjidx0B        = 2*vdwtype[jnrB+0];
+             fjx0             = _fjsp_setzero_v2r8();
+             fjy0             = _fjsp_setzero_v2r8();
+             fjz0             = _fjsp_setzero_v2r8();
+             /**************************
+              * CALCULATE INTERACTIONS *
+              * i atom 0 - j atom 0:   *
+              * Ewald + Lennard-Jones  *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+             {
+             r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+             /* Compute parameters for interactions between i and j atoms */
+             qq00             = _fjsp_mul_v2r8(iq0,jq0);
+             gmx_fjsp_load_2pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,
+                                          vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
+             /* EWALD ELECTROSTATICS
+              * The table is addressed with a stride of 4 reals per table point: the pair at
+              * offset 0 is loaded and transposed into ewtabF/ewtabD, the value at offset 2
+              * is loaded as ewtabV (one table point per SIMD lane, taken from ewconv.i[0]
+              * and ewconv.i[1]).
+              */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r00,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq00,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv00,sh_ewald),velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,rinv00),_fjsp_sub_v2r8(rinvsq00,felec));
+             /* LENNARD-JONES DISPERSION/REPULSION
+              * rinvsix is rinvsq cubed; vvdw6 = c6*rinvsix and vvdw12 = c12*rinvsix*rinvsix.
+              * The potential combines the sh_vdw_invrcut6-shifted terms with the factors
+              * one_twelfth and one_sixth (presumably because the parameter table stores
+              * pre-multiplied 6*C6 and 12*C12 - TODO confirm); the force is
+              * (vvdw12-vvdw6)*rinvsq.
+              */
+             rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+             vvdw6            = _fjsp_mul_v2r8(c6_00,rinvsix);
+             vvdw12           = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
+             vvdw             = _fjsp_msub_v2r8(_fjsp_nmsub_v2r8(c12_00,_fjsp_mul_v2r8(sh_vdw_invrcut6,sh_vdw_invrcut6),vvdw12),one_twelfth,
+                                            _fjsp_mul_v2r8(_fjsp_nmsub_v2r8( c6_00,sh_vdw_invrcut6,vvdw6),one_sixth));
+             fvdw             = _fjsp_mul_v2r8(_fjsp_sub_v2r8(vvdw12,vvdw6),rinvsq00);
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             vvdw             = _fjsp_and_v2r8(vvdw,cutoff_mask);
+             vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+             fscal            = _fjsp_add_v2r8(felec,fvdw);
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             
+             fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              * i atom 1 - j atom 0:   *
+              * Ewald only             *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq10,rcutoff2))
+             {
+             r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+             /* Compute parameters for interactions between i and j atoms */
+             qq10             = _fjsp_mul_v2r8(iq1,jq0);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r10,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq10,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv10,sh_ewald),velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,rinv10),_fjsp_sub_v2r8(rinvsq10,felec));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq10,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+             
+             fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              * i atom 2 - j atom 0:   *
+              * Ewald only             *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq20,rcutoff2))
+             {
+             r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+             /* Compute parameters for interactions between i and j atoms */
+             qq20             = _fjsp_mul_v2r8(iq2,jq0);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r20,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq20,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv20,sh_ewald),velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,rinv20),_fjsp_sub_v2r8(rinvsq20,felec));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq20,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+             
+             fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+             }
+             gmx_fjsp_decrement_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0);
+             /* Inner loop uses 168 flops */
+         }
+         if(jidx<j_index_end) /* tail: exactly one j atom is left when the j count is odd */
+         {
+             jnrA             = jjnr[jidx];
+             j_coord_offsetA  = DIM*jnrA;
+             /* load j atom coordinates */
+             gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                               &jx0,&jy0,&jz0);
+             /* Calculate displacement vector */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             dx10             = _fjsp_sub_v2r8(ix1,jx0);
+             dy10             = _fjsp_sub_v2r8(iy1,jy0);
+             dz10             = _fjsp_sub_v2r8(iz1,jz0);
+             dx20             = _fjsp_sub_v2r8(ix2,jx0);
+             dy20             = _fjsp_sub_v2r8(iy2,jy0);
+             dz20             = _fjsp_sub_v2r8(iz2,jz0);
+             /* Calculate squared distance and things based on it */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+             rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+             rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+             rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+             rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+             rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+             rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+             rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+             /* Load parameters for j particles */
+             jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
+             vdwjidx0A        = 2*vdwtype[jnrA+0];
+             fjx0             = _fjsp_setzero_v2r8();
+             fjy0             = _fjsp_setzero_v2r8();
+             fjz0             = _fjsp_setzero_v2r8();
+             /**************************
+              * CALCULATE INTERACTIONS *
+              * i atom 0 - j atom 0:   *
+              * Ewald + Lennard-Jones  *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+             {
+             r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+             /* Compute parameters for interactions between i and j atoms */
+             qq00             = _fjsp_mul_v2r8(iq0,jq0);
+             gmx_fjsp_load_1pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
+             /* EWALD ELECTROSTATICS
+              * Same table lookup as in the unrolled loop, but only ewconv.i[0] is used;
+              * the second operand of each transpose is a zero vector.
+              */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r00,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq00,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv00,sh_ewald),velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,rinv00),_fjsp_sub_v2r8(rinvsq00,felec));
+             /* LENNARD-JONES DISPERSION/REPULSION */
+             rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+             vvdw6            = _fjsp_mul_v2r8(c6_00,rinvsix);
+             vvdw12           = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
+             vvdw             = _fjsp_msub_v2r8(_fjsp_nmsub_v2r8(c12_00,_fjsp_mul_v2r8(sh_vdw_invrcut6,sh_vdw_invrcut6),vvdw12),one_twelfth,
+                                            _fjsp_mul_v2r8(_fjsp_nmsub_v2r8( c6_00,sh_vdw_invrcut6,vvdw6),one_sixth));
+             fvdw             = _fjsp_mul_v2r8(_fjsp_sub_v2r8(vvdw12,vvdw6),rinvsq00);
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom.
+              * Only one j atom was loaded here; the unpacklo with a zero vector presumably
+              * clears the unused upper lane before accumulation - TODO confirm intrinsic semantics.
+              */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             vvdw             = _fjsp_and_v2r8(vvdw,cutoff_mask);
+             vvdw             = _fjsp_unpacklo_v2r8(vvdw,_fjsp_setzero_v2r8());
+             vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+             fscal            = _fjsp_add_v2r8(felec,fvdw);
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             
+             fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              * i atom 1 - j atom 0:   *
+              * Ewald only             *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq10,rcutoff2))
+             {
+             r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+             /* Compute parameters for interactions between i and j atoms */
+             qq10             = _fjsp_mul_v2r8(iq1,jq0);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r10,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq10,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv10,sh_ewald),velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,rinv10),_fjsp_sub_v2r8(rinvsq10,felec));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq10,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+             
+             fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              * i atom 2 - j atom 0:   *
+              * Ewald only             *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq20,rcutoff2))
+             {
+             r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+             /* Compute parameters for interactions between i and j atoms */
+             qq20             = _fjsp_mul_v2r8(iq2,jq0);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r20,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq20,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv20,sh_ewald),velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,rinv20),_fjsp_sub_v2r8(rinvsq20,felec));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq20,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+             
+             fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+             }
+             gmx_fjsp_decrement_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0);
+             /* Inner loop uses 168 flops */
+         }
+         /* End of innermost loop */
+         gmx_fjsp_update_iforce_3atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                               f+i_coord_offset,fshift+i_shift_offset);
+         ggid                        = gid[iidx];
+         /* Update potential energies at index ggid of the elec and vdw energy arrays */
+         gmx_fjsp_update_1pot_v2r8(velecsum,kernel_data->energygrp_elec+ggid);
+         gmx_fjsp_update_1pot_v2r8(vvdwsum,kernel_data->energygrp_vdw+ggid);
+         /* Increment number of inner iterations (counted per j atom, not per two-wide SIMD pass) */
+         inneriter                  += j_index_end - j_index_start;
+         /* Outer loop uses 20 flops */
+     }
+     /* Increment number of outer iterations */
+     outeriter        += nri;
+     /* Update outer/inner flops: 20 per outer iteration and 168 per inner iteration, as noted above */
+     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3_VF,outeriter*20 + inneriter*168);
+ }
+ /*
+  * Gromacs nonbonded kernel:   nb_kernel_ElecEwSh_VdwLJSh_GeomW3P1_F_sparc64_hpc_ace_double
+  * Electrostatics interaction: Ewald
+  * VdW interaction:            LennardJones
+  * Geometry:                   Water3-Particle
+  * Calculate force/pot:        Force
+  *
+  * Force-only kernel. For every neighbor-list entry it loads three i sites
+  * (suffixes 0,1,2) and walks the j particles two at a time, followed by a
+  * single-particle pass when the j count is odd. Each i site gets tabulated
+  * Ewald electrostatics against j; only i site 0 additionally gets
+  * Lennard-Jones. All contributions are masked with rsq < rcutoff^2, where
+  * rcutoff comes from fr->rcoulomb. No potential energies are accumulated.
+  *
+  * nlist        neighbor list; nri, iinr, jindex, jjnr, shift and gid are read
+  * xx           coordinates, accessed through xx[0] as a flat real array
+  * ff           forces, updated through ff[0] as a flat real array
+  * fr           supplies shift_vec, fshift, epsfac, ntype, nbfp, rcoulomb, rvdw and the fr->ic data
+  * mdatoms      supplies chargeA and typeA
+  * kernel_data  not referenced by this kernel
+  * nrnb         flop counters, updated once at the end through inc_nrnb()
+  */
+ void
+ nb_kernel_ElecEwSh_VdwLJSh_GeomW3P1_F_sparc64_hpc_ace_double
+                     (t_nblist * gmx_restrict                nlist,
+                      rvec * gmx_restrict                    xx,
+                      rvec * gmx_restrict                    ff,
+                      t_forcerec * gmx_restrict              fr,
+                      t_mdatoms * gmx_restrict               mdatoms,
+                      nb_kernel_data_t * gmx_restrict        kernel_data,
+                      t_nrnb * gmx_restrict                  nrnb)
+ {
+     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+      * just 0 for non-waters.
+      * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+      * jnr indices (A and B) that are processed together in one inner-loop iteration.
+      *
+      * Several locals declared below (for example tx/ty/tz, ggid, velecsum and vvdwsum) are never
+      * referenced in this function.
+      */
+     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+     int              jnrA,jnrB;
+     int              j_coord_offsetA,j_coord_offsetB;
+     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+     real             rcutoff_scalar;
+     real             *shiftvec,*fshift,*x,*f;
+     _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+     int              vdwioffset0;
+     _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+     int              vdwioffset1;
+     _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+     int              vdwioffset2;
+     _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+     int              vdwjidx0A,vdwjidx0B;
+     _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+     _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+     _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+     _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+     _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+     real             *charge;
+     int              nvdwtype;
+     _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+     int              *vdwtype;
+     real             *vdwparam;
+     _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+     _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+     _fjsp_v2r8       ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+     real             *ewtab;
+     _fjsp_v2r8       itab_tmp;
+     _fjsp_v2r8       dummy_mask,cutoff_mask;
+     _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+     _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+     union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv; /* only ewconv is used: it exposes the table indices stored from itab_tmp as integers */
+     x                = xx[0]; /* flat real view of the coordinates */
+     f                = ff[0]; /* flat real view of the forces */
+     nri              = nlist->nri;
+     iinr             = nlist->iinr;
+     jindex           = nlist->jindex;
+     jjnr             = nlist->jjnr;
+     shiftidx         = nlist->shift;
+     gid              = nlist->gid; /* assigned but not read in this kernel */
+     shiftvec         = fr->shift_vec[0];
+     fshift           = fr->fshift[0];
+     facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+     charge           = mdatoms->chargeA;
+     nvdwtype         = fr->ntype;
+     vdwparam         = fr->nbfp;
+     vdwtype          = mdatoms->typeA;
+     sh_ewald         = gmx_fjsp_set1_v2r8(fr->ic->sh_ewald); /* assigned but not read in this kernel */
+     ewtab            = fr->ic->tabq_coul_F; /* entries at the computed index are loaded below as the ewtabF/ewtabFn pair */
+     ewtabscale       = gmx_fjsp_set1_v2r8(fr->ic->tabq_scale);
+     ewtabhalfspace   = gmx_fjsp_set1_v2r8(0.5/fr->ic->tabq_scale); /* assigned but not read in this kernel */
+     /* Setup water-specific parameters
+      *
+      * The i-site charges are pre-multiplied by facel (fr->epsfac), and both the charges and the
+      * site-0 VdW offset are taken from the first list entry only and reused for every i entry.
+      * NOTE(review): this presumably relies on all i molecules in the list being identical, and
+      * on iinr[0] being valid (nri > 0) since it is read before the loop - confirm with callers.
+      */
+     inr              = nlist->iinr[0];
+     iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+0]));
+     iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+     iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+     vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+     /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+     rcutoff_scalar   = fr->rcoulomb;
+     rcutoff          = gmx_fjsp_set1_v2r8(rcutoff_scalar);
+     rcutoff2         = _fjsp_mul_v2r8(rcutoff,rcutoff);
+     sh_vdw_invrcut6  = gmx_fjsp_set1_v2r8(fr->ic->sh_invrc6); /* assigned but not read in this kernel */
+     rvdw             = gmx_fjsp_set1_v2r8(fr->rvdw); /* assigned but not read in this kernel */
+     /* Avoid stupid compiler warnings */
+     jnrA = jnrB = 0;
+     j_coord_offsetA = 0;
+     j_coord_offsetB = 0;
+     outeriter        = 0;
+     inneriter        = 0;
+     /* Start outer loop over neighborlists */
+     for(iidx=0; iidx<nri; iidx++)
+     {
+         /* Load shift vector for this list */
+         i_shift_offset   = DIM*shiftidx[iidx];
+         /* Load limits for loop over neighbors */
+         j_index_start    = jindex[iidx];
+         j_index_end      = jindex[iidx+1];
+         /* Get outer coordinate index */
+         inr              = iinr[iidx];
+         i_coord_offset   = DIM*inr;
+         /* Load i particle coords and add shift vector */
+         gmx_fjsp_load_shift_and_3rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                  &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+         fix0             = _fjsp_setzero_v2r8(); /* per-i-entry force accumulators for the three i sites start at zero */
+         fiy0             = _fjsp_setzero_v2r8();
+         fiz0             = _fjsp_setzero_v2r8();
+         fix1             = _fjsp_setzero_v2r8();
+         fiy1             = _fjsp_setzero_v2r8();
+         fiz1             = _fjsp_setzero_v2r8();
+         fix2             = _fjsp_setzero_v2r8();
+         fiy2             = _fjsp_setzero_v2r8();
+         fiz2             = _fjsp_setzero_v2r8();
+         /* Start inner kernel loop
+          * Two j particles (A and B) are handled per iteration; the loop stops while at most one
+          * j particle is left, which is then handled by the block after the loop.
+          */
+         for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+         {
+             /* Get j neighbor index, and coordinate index */
+             jnrA             = jjnr[jidx];
+             jnrB             = jjnr[jidx+1];
+             j_coord_offsetA  = DIM*jnrA;
+             j_coord_offsetB  = DIM*jnrB;
+             /* load j atom coordinates */
+             gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                               &jx0,&jy0,&jz0);
+             /* Calculate displacement vector */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             dx10             = _fjsp_sub_v2r8(ix1,jx0);
+             dy10             = _fjsp_sub_v2r8(iy1,jy0);
+             dz10             = _fjsp_sub_v2r8(iz1,jz0);
+             dx20             = _fjsp_sub_v2r8(ix2,jx0);
+             dy20             = _fjsp_sub_v2r8(iy2,jy0);
+             dz20             = _fjsp_sub_v2r8(iz2,jz0);
+             /* Calculate squared distance and things based on it */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+             rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+             rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+             rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+             rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+             rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+             rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+             rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+             /* Load parameters for j particles */
+             jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+             vdwjidx0A        = 2*vdwtype[jnrA+0];
+             vdwjidx0B        = 2*vdwtype[jnrB+0];
+             fjx0             = _fjsp_setzero_v2r8(); /* j force accumulators, summed over the three i sites below */
+             fjy0             = _fjsp_setzero_v2r8();
+             fjz0             = _fjsp_setzero_v2r8();
+             /**************************
+              * CALCULATE INTERACTIONS *
+              *
+              * i site 0 with j: Ewald electrostatics plus Lennard-Jones.
+              * The whole block is skipped when the gmx_fjsp_any_lt_v2r8() test fails
+              * (presumably: no lane has rsq00 < rcutoff2).
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+             {
+             r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+             /* Compute parameters for interactions between i and j atoms */
+             qq00             = _fjsp_mul_v2r8(iq0,jq0);
+             gmx_fjsp_load_2pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,
+                                          vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer
+              * eweps is the fractional remainder; felec is then built from ewtabF and ewtabFn
+              * weighted by eweps (presumably a linear interpolation between the two table entries).
+              */
+             ewrt             = _fjsp_mul_v2r8(r00,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                          &ewtabF,&ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,rinv00),_fjsp_sub_v2r8(rinvsq00,felec));
+             /* LENNARD-JONES DISPERSION/REPULSION */
+             rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+             fvdw             = _fjsp_mul_v2r8(_fjsp_msub_v2r8(c12_00,rinvsix,c6_00),_fjsp_mul_v2r8(rinvsix,rinvsq00));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+             fscal            = _fjsp_add_v2r8(felec,fvdw);
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask); /* presumably zeroes lanes with rsq00 >= rcutoff2 */
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             
+             fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              *
+              * i site 1 with j: Ewald electrostatics only (no Lennard-Jones term).
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq10,rcutoff2))
+             {
+             r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+             /* Compute parameters for interactions between i and j atoms */
+             qq10             = _fjsp_mul_v2r8(iq1,jq0);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r10,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                          &ewtabF,&ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,rinv10),_fjsp_sub_v2r8(rinvsq10,felec));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq10,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+             
+             fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              *
+              * i site 2 with j: Ewald electrostatics only (no Lennard-Jones term).
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq20,rcutoff2))
+             {
+             r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+             /* Compute parameters for interactions between i and j atoms */
+             qq20             = _fjsp_mul_v2r8(iq2,jq0);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r20,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                          &ewtabF,&ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,rinv20),_fjsp_sub_v2r8(rinvsq20,felec));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq20,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+             
+             fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+             }
+             gmx_fjsp_decrement_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0); /* apply the summed j forces to both j particles in f (helper name suggests subtraction) */
+             /* Inner loop uses 136 flops */
+         }
+         if(jidx<j_index_end) /* one j particle is left over (odd j count): handle it alone using the 1ptr/1pair helpers */
+         {
+             jnrA             = jjnr[jidx];
+             j_coord_offsetA  = DIM*jnrA;
+             /* load j atom coordinates */
+             gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                               &jx0,&jy0,&jz0);
+             /* Calculate displacement vector */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             dx10             = _fjsp_sub_v2r8(ix1,jx0);
+             dy10             = _fjsp_sub_v2r8(iy1,jy0);
+             dz10             = _fjsp_sub_v2r8(iz1,jz0);
+             dx20             = _fjsp_sub_v2r8(ix2,jx0);
+             dy20             = _fjsp_sub_v2r8(iy2,jy0);
+             dz20             = _fjsp_sub_v2r8(iz2,jz0);
+             /* Calculate squared distance and things based on it */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+             rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+             rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+             rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+             rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+             rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+             rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+             rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+             /* Load parameters for j particles */
+             jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
+             vdwjidx0A        = 2*vdwtype[jnrA+0];
+             fjx0             = _fjsp_setzero_v2r8();
+             fjy0             = _fjsp_setzero_v2r8();
+             fjz0             = _fjsp_setzero_v2r8();
+             /**************************
+              * CALCULATE INTERACTIONS *
+              *
+              * i site 0 with the single remaining j: Ewald electrostatics plus Lennard-Jones.
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+             {
+             r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+             /* Compute parameters for interactions between i and j atoms */
+             qq00             = _fjsp_mul_v2r8(iq0,jq0);
+             gmx_fjsp_load_1pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer
+              * Only ewconv.i[0] is used as a table index in this single-particle pass.
+              */
+             ewrt             = _fjsp_mul_v2r8(r00,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,rinv00),_fjsp_sub_v2r8(rinvsq00,felec));
+             /* LENNARD-JONES DISPERSION/REPULSION */
+             rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+             fvdw             = _fjsp_mul_v2r8(_fjsp_msub_v2r8(c12_00,rinvsix,c6_00),_fjsp_mul_v2r8(rinvsix,rinvsq00));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+             fscal            = _fjsp_add_v2r8(felec,fvdw);
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8()); /* presumably keeps the lower lane and zeroes the unused upper lane */
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             
+             fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              *
+              * i site 1 with the single remaining j: Ewald electrostatics only.
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq10,rcutoff2))
+             {
+             r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+             /* Compute parameters for interactions between i and j atoms */
+             qq10             = _fjsp_mul_v2r8(iq1,jq0);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r10,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,rinv10),_fjsp_sub_v2r8(rinvsq10,felec));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq10,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+             
+             fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              *
+              * i site 2 with the single remaining j: Ewald electrostatics only.
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq20,rcutoff2))
+             {
+             r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+             /* Compute parameters for interactions between i and j atoms */
+             qq20             = _fjsp_mul_v2r8(iq2,jq0);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r20,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,rinv20),_fjsp_sub_v2r8(rinvsq20,felec));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq20,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+             
+             fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+             }
+             gmx_fjsp_decrement_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0);
+             /* Inner loop uses 136 flops */
+         }
+         /* End of innermost loop
+          * The accumulated forces of the three i sites are handed to the helper below together
+          * with the i force location and the shift-force location (presumably it reduces the
+          * SIMD lanes and adds the result to both).
+          */
+         gmx_fjsp_update_iforce_3atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                               f+i_coord_offset,fshift+i_shift_offset);
+         /* Increment number of inner iterations */
+         inneriter                  += j_index_end - j_index_start;
+         /* Outer loop uses 18 flops */
+     }
+     /* Increment number of outer iterations */
+     outeriter        += nri;
+     /* Update outer/inner flops
+      * The per-iteration costs (18 outer, 136 inner) match the flop counts quoted in the loop comments above.
+      */
+     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3_F,outeriter*18 + inneriter*136);
+ }
index 0000000000000000000000000000000000000000,7f31d057d220542191d7035298daed4dfaed78ad..7f31d057d220542191d7035298daed4dfaed78ad
mode 000000,100644..100644
--- /dev/null
@@@ -1,0 -1,2408 +1,2408 @@@
+ /*
+  * This file is part of the GROMACS molecular simulation package.
+  *
+  * Copyright (c) 2012, by the GROMACS development team, led by
+  * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+  * others, as listed in the AUTHORS file in the top-level source
+  * directory and at http://www.gromacs.org.
+  *
+  * GROMACS is free software; you can redistribute it and/or
+  * modify it under the terms of the GNU Lesser General Public License
+  * as published by the Free Software Foundation; either version 2.1
+  * of the License, or (at your option) any later version.
+  *
+  * GROMACS is distributed in the hope that it will be useful,
+  * but WITHOUT ANY WARRANTY; without even the implied warranty of
+  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  * Lesser General Public License for more details.
+  *
+  * You should have received a copy of the GNU Lesser General Public
+  * License along with GROMACS; if not, see
+  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+  *
+  * If you want to redistribute modifications to GROMACS, please
+  * consider that scientific software is very special. Version
+  * control is crucial - bugs must be traceable. We will be happy to
+  * consider code for inclusion in the official distribution, but
+  * derived work must not be called official GROMACS. Details are found
+  * in the README & COPYING files - if they are missing, get the
+  * official version at http://www.gromacs.org.
+  *
+  * To help us fund GROMACS development, we humbly ask that you cite
+  * the research papers on the package. Check out http://www.gromacs.org.
+  */
+ /*
+  * Note: this file was generated by the GROMACS sparc64_hpc_ace_double kernel generator.
+  */
+ #ifdef HAVE_CONFIG_H
+ #include <config.h>
+ #endif
+ #include <math.h>
+ #include "../nb_kernel.h"
+ #include "types/simple.h"
+ #include "vec.h"
+ #include "nrnb.h"
+ #include "kernelutil_sparc64_hpc_ace_double.h"
+ /*
+  * Gromacs nonbonded kernel:   nb_kernel_ElecEwSh_VdwLJSh_GeomW3W3_VF_sparc64_hpc_ace_double
+  * Electrostatics interaction: Ewald
+  * VdW interaction:            LennardJones
+  * Geometry:                   Water3-Water3
+  * Calculate force/pot:        PotentialAndForce
+  */
+ void
+ nb_kernel_ElecEwSh_VdwLJSh_GeomW3W3_VF_sparc64_hpc_ace_double
+                     (t_nblist * gmx_restrict                nlist,
+                      rvec * gmx_restrict                    xx,
+                      rvec * gmx_restrict                    ff,
+                      t_forcerec * gmx_restrict              fr,
+                      t_mdatoms * gmx_restrict               mdatoms,
+                      nb_kernel_data_t * gmx_restrict        kernel_data,
+                      t_nrnb * gmx_restrict                  nrnb)
+ {
+     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+      * just 0 for non-waters.
+      * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+      * jnr indices corresponding to data put in the four positions in the SIMD register.
+      */
+     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+     int              jnrA,jnrB;
+     int              j_coord_offsetA,j_coord_offsetB;
+     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+     real             rcutoff_scalar;
+     real             *shiftvec,*fshift,*x,*f;
+     _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+     int              vdwioffset0;
+     _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+     int              vdwioffset1;
+     _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+     int              vdwioffset2;
+     _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+     int              vdwjidx0A,vdwjidx0B;
+     _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+     int              vdwjidx1A,vdwjidx1B;
+     _fjsp_v2r8       jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+     int              vdwjidx2A,vdwjidx2B;
+     _fjsp_v2r8       jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+     _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+     _fjsp_v2r8       dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
+     _fjsp_v2r8       dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
+     _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+     _fjsp_v2r8       dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+     _fjsp_v2r8       dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+     _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+     _fjsp_v2r8       dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+     _fjsp_v2r8       dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+     _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+     real             *charge;
+     int              nvdwtype;
+     _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+     int              *vdwtype;
+     real             *vdwparam;
+     _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+     _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+     _fjsp_v2r8       ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+     real             *ewtab;
+     _fjsp_v2r8       itab_tmp;
+     _fjsp_v2r8       dummy_mask,cutoff_mask;
+     _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+     _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+     union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+     x                = xx[0];
+     f                = ff[0];
+     nri              = nlist->nri;
+     iinr             = nlist->iinr;
+     jindex           = nlist->jindex;
+     jjnr             = nlist->jjnr;
+     shiftidx         = nlist->shift;
+     gid              = nlist->gid;
+     shiftvec         = fr->shift_vec[0];
+     fshift           = fr->fshift[0];
+     facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+     charge           = mdatoms->chargeA;
+     nvdwtype         = fr->ntype;
+     vdwparam         = fr->nbfp;
+     vdwtype          = mdatoms->typeA;
+     sh_ewald         = gmx_fjsp_set1_v2r8(fr->ic->sh_ewald);
+     ewtab            = fr->ic->tabq_coul_FDV0;
+     ewtabscale       = gmx_fjsp_set1_v2r8(fr->ic->tabq_scale);
+     ewtabhalfspace   = gmx_fjsp_set1_v2r8(0.5/fr->ic->tabq_scale);
+     /* Setup water-specific parameters */
+     inr              = nlist->iinr[0];
+     iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+0]));
+     iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+     iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+     vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+     jq0              = gmx_fjsp_set1_v2r8(charge[inr+0]);
+     jq1              = gmx_fjsp_set1_v2r8(charge[inr+1]);
+     jq2              = gmx_fjsp_set1_v2r8(charge[inr+2]);
+     vdwjidx0A        = 2*vdwtype[inr+0];
+     qq00             = _fjsp_mul_v2r8(iq0,jq0);
+     c6_00            = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset0+vdwjidx0A]);
+     c12_00           = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset0+vdwjidx0A+1]);
+     qq01             = _fjsp_mul_v2r8(iq0,jq1);
+     qq02             = _fjsp_mul_v2r8(iq0,jq2);
+     qq10             = _fjsp_mul_v2r8(iq1,jq0);
+     qq11             = _fjsp_mul_v2r8(iq1,jq1);
+     qq12             = _fjsp_mul_v2r8(iq1,jq2);
+     qq20             = _fjsp_mul_v2r8(iq2,jq0);
+     qq21             = _fjsp_mul_v2r8(iq2,jq1);
+     qq22             = _fjsp_mul_v2r8(iq2,jq2);
+     /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+     rcutoff_scalar   = fr->rcoulomb;
+     rcutoff          = gmx_fjsp_set1_v2r8(rcutoff_scalar);
+     rcutoff2         = _fjsp_mul_v2r8(rcutoff,rcutoff);
+     sh_vdw_invrcut6  = gmx_fjsp_set1_v2r8(fr->ic->sh_invrc6);
+     rvdw             = gmx_fjsp_set1_v2r8(fr->rvdw);
+     /* Avoid stupid compiler warnings */
+     jnrA = jnrB = 0;
+     j_coord_offsetA = 0;
+     j_coord_offsetB = 0;
+     outeriter        = 0;
+     inneriter        = 0;
+     /* Start outer loop over neighborlists */
+     for(iidx=0; iidx<nri; iidx++)
+     {
+         /* Load shift vector for this list */
+         i_shift_offset   = DIM*shiftidx[iidx];
+         /* Load limits for loop over neighbors */
+         j_index_start    = jindex[iidx];
+         j_index_end      = jindex[iidx+1];
+         /* Get outer coordinate index */
+         inr              = iinr[iidx];
+         i_coord_offset   = DIM*inr;
+         /* Load i particle coords and add shift vector */
+         gmx_fjsp_load_shift_and_3rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                  &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+         fix0             = _fjsp_setzero_v2r8();
+         fiy0             = _fjsp_setzero_v2r8();
+         fiz0             = _fjsp_setzero_v2r8();
+         fix1             = _fjsp_setzero_v2r8();
+         fiy1             = _fjsp_setzero_v2r8();
+         fiz1             = _fjsp_setzero_v2r8();
+         fix2             = _fjsp_setzero_v2r8();
+         fiy2             = _fjsp_setzero_v2r8();
+         fiz2             = _fjsp_setzero_v2r8();
+         /* Reset potential sums */
+         velecsum         = _fjsp_setzero_v2r8();
+         vvdwsum          = _fjsp_setzero_v2r8();
+         /* Start inner kernel loop */
+         for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+         {
+             /* Get j neighbor index, and coordinate index */
+             jnrA             = jjnr[jidx];
+             jnrB             = jjnr[jidx+1];
+             j_coord_offsetA  = DIM*jnrA;
+             j_coord_offsetB  = DIM*jnrB;
+             /* load j atom coordinates */
+             gmx_fjsp_load_3rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+             /* Calculate displacement vector */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             dx01             = _fjsp_sub_v2r8(ix0,jx1);
+             dy01             = _fjsp_sub_v2r8(iy0,jy1);
+             dz01             = _fjsp_sub_v2r8(iz0,jz1);
+             dx02             = _fjsp_sub_v2r8(ix0,jx2);
+             dy02             = _fjsp_sub_v2r8(iy0,jy2);
+             dz02             = _fjsp_sub_v2r8(iz0,jz2);
+             dx10             = _fjsp_sub_v2r8(ix1,jx0);
+             dy10             = _fjsp_sub_v2r8(iy1,jy0);
+             dz10             = _fjsp_sub_v2r8(iz1,jz0);
+             dx11             = _fjsp_sub_v2r8(ix1,jx1);
+             dy11             = _fjsp_sub_v2r8(iy1,jy1);
+             dz11             = _fjsp_sub_v2r8(iz1,jz1);
+             dx12             = _fjsp_sub_v2r8(ix1,jx2);
+             dy12             = _fjsp_sub_v2r8(iy1,jy2);
+             dz12             = _fjsp_sub_v2r8(iz1,jz2);
+             dx20             = _fjsp_sub_v2r8(ix2,jx0);
+             dy20             = _fjsp_sub_v2r8(iy2,jy0);
+             dz20             = _fjsp_sub_v2r8(iz2,jz0);
+             dx21             = _fjsp_sub_v2r8(ix2,jx1);
+             dy21             = _fjsp_sub_v2r8(iy2,jy1);
+             dz21             = _fjsp_sub_v2r8(iz2,jz1);
+             dx22             = _fjsp_sub_v2r8(ix2,jx2);
+             dy22             = _fjsp_sub_v2r8(iy2,jy2);
+             dz22             = _fjsp_sub_v2r8(iz2,jz2);
+             /* Calculate squared distance and things based on it */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rsq01            = gmx_fjsp_calc_rsq_v2r8(dx01,dy01,dz01);
+             rsq02            = gmx_fjsp_calc_rsq_v2r8(dx02,dy02,dz02);
+             rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+             rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+             rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+             rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+             rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+             rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+             rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+             rinv01           = gmx_fjsp_invsqrt_v2r8(rsq01);
+             rinv02           = gmx_fjsp_invsqrt_v2r8(rsq02);
+             rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+             rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+             rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+             rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+             rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+             rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+             rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+             rinvsq01         = _fjsp_mul_v2r8(rinv01,rinv01);
+             rinvsq02         = _fjsp_mul_v2r8(rinv02,rinv02);
+             rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+             rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+             rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+             rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+             rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+             rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+             fjx0             = _fjsp_setzero_v2r8();
+             fjy0             = _fjsp_setzero_v2r8();
+             fjz0             = _fjsp_setzero_v2r8();
+             fjx1             = _fjsp_setzero_v2r8();
+             fjy1             = _fjsp_setzero_v2r8();
+             fjz1             = _fjsp_setzero_v2r8();
+             fjx2             = _fjsp_setzero_v2r8();
+             fjy2             = _fjsp_setzero_v2r8();
+             fjz2             = _fjsp_setzero_v2r8();
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+             {
+             r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r00,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq00,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv00,sh_ewald),velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,rinv00),_fjsp_sub_v2r8(rinvsq00,felec));
+             /* LENNARD-JONES DISPERSION/REPULSION */
+             rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+             vvdw6            = _fjsp_mul_v2r8(c6_00,rinvsix);
+             vvdw12           = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
+             vvdw             = _fjsp_msub_v2r8(_fjsp_nmsub_v2r8(c12_00,_fjsp_mul_v2r8(sh_vdw_invrcut6,sh_vdw_invrcut6),vvdw12),one_twelfth,
+                                            _fjsp_mul_v2r8(_fjsp_nmsub_v2r8( c6_00,sh_vdw_invrcut6,vvdw6),one_sixth));
+             fvdw             = _fjsp_mul_v2r8(_fjsp_sub_v2r8(vvdw12,vvdw6),rinvsq00);
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             vvdw             = _fjsp_and_v2r8(vvdw,cutoff_mask);
+             vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+             fscal            = _fjsp_add_v2r8(felec,fvdw);
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             
+             fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq01,rcutoff2))
+             {
+             r01              = _fjsp_mul_v2r8(rsq01,rinv01);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r01,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq01,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv01,sh_ewald),velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq01,rinv01),_fjsp_sub_v2r8(rinvsq01,felec));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq01,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx01,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy01,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz01,fscal,fiz0);
+             
+             fjx1             = _fjsp_madd_v2r8(dx01,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy01,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz01,fscal,fjz1);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq02,rcutoff2))
+             {
+             r02              = _fjsp_mul_v2r8(rsq02,rinv02);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r02,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq02,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv02,sh_ewald),velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq02,rinv02),_fjsp_sub_v2r8(rinvsq02,felec));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq02,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx02,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy02,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz02,fscal,fiz0);
+             
+             fjx2             = _fjsp_madd_v2r8(dx02,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy02,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz02,fscal,fjz2);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq10,rcutoff2))
+             {
+             r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r10,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq10,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv10,sh_ewald),velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,rinv10),_fjsp_sub_v2r8(rinvsq10,felec));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq10,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+             
+             fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq11,rcutoff2))
+             {
+             r11              = _fjsp_mul_v2r8(rsq11,rinv11);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r11,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq11,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv11,sh_ewald),velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq11,rinv11),_fjsp_sub_v2r8(rinvsq11,felec));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq11,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+             
+             fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq12,rcutoff2))
+             {
+             r12              = _fjsp_mul_v2r8(rsq12,rinv12);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r12,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq12,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv12,sh_ewald),velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq12,rinv12),_fjsp_sub_v2r8(rinvsq12,felec));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq12,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+             
+             fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq20,rcutoff2))
+             {
+             r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r20,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq20,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv20,sh_ewald),velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,rinv20),_fjsp_sub_v2r8(rinvsq20,felec));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq20,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+             
+             fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq21,rcutoff2))
+             {
+             r21              = _fjsp_mul_v2r8(rsq21,rinv21);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r21,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq21,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv21,sh_ewald),velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq21,rinv21),_fjsp_sub_v2r8(rinvsq21,felec));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq21,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+             
+             fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq22,rcutoff2))
+             {
+             r22              = _fjsp_mul_v2r8(rsq22,rinv22);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r22,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq22,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv22,sh_ewald),velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq22,rinv22),_fjsp_sub_v2r8(rinvsq22,felec));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq22,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+             
+             fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+             }
+             gmx_fjsp_decrement_3rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+             /* Inner loop uses 459 flops */
+         }
+         if(jidx<j_index_end)
+         {
+             jnrA             = jjnr[jidx];
+             j_coord_offsetA  = DIM*jnrA;
+             /* load j atom coordinates */
+             gmx_fjsp_load_3rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+             /* Calculate displacement vector */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             dx01             = _fjsp_sub_v2r8(ix0,jx1);
+             dy01             = _fjsp_sub_v2r8(iy0,jy1);
+             dz01             = _fjsp_sub_v2r8(iz0,jz1);
+             dx02             = _fjsp_sub_v2r8(ix0,jx2);
+             dy02             = _fjsp_sub_v2r8(iy0,jy2);
+             dz02             = _fjsp_sub_v2r8(iz0,jz2);
+             dx10             = _fjsp_sub_v2r8(ix1,jx0);
+             dy10             = _fjsp_sub_v2r8(iy1,jy0);
+             dz10             = _fjsp_sub_v2r8(iz1,jz0);
+             dx11             = _fjsp_sub_v2r8(ix1,jx1);
+             dy11             = _fjsp_sub_v2r8(iy1,jy1);
+             dz11             = _fjsp_sub_v2r8(iz1,jz1);
+             dx12             = _fjsp_sub_v2r8(ix1,jx2);
+             dy12             = _fjsp_sub_v2r8(iy1,jy2);
+             dz12             = _fjsp_sub_v2r8(iz1,jz2);
+             dx20             = _fjsp_sub_v2r8(ix2,jx0);
+             dy20             = _fjsp_sub_v2r8(iy2,jy0);
+             dz20             = _fjsp_sub_v2r8(iz2,jz0);
+             dx21             = _fjsp_sub_v2r8(ix2,jx1);
+             dy21             = _fjsp_sub_v2r8(iy2,jy1);
+             dz21             = _fjsp_sub_v2r8(iz2,jz1);
+             dx22             = _fjsp_sub_v2r8(ix2,jx2);
+             dy22             = _fjsp_sub_v2r8(iy2,jy2);
+             dz22             = _fjsp_sub_v2r8(iz2,jz2);
+             /* Calculate squared distance and things based on it */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rsq01            = gmx_fjsp_calc_rsq_v2r8(dx01,dy01,dz01);
+             rsq02            = gmx_fjsp_calc_rsq_v2r8(dx02,dy02,dz02);
+             rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+             rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+             rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+             rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+             rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+             rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+             rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+             rinv01           = gmx_fjsp_invsqrt_v2r8(rsq01);
+             rinv02           = gmx_fjsp_invsqrt_v2r8(rsq02);
+             rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+             rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+             rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+             rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+             rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+             rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+             rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+             rinvsq01         = _fjsp_mul_v2r8(rinv01,rinv01);
+             rinvsq02         = _fjsp_mul_v2r8(rinv02,rinv02);
+             rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+             rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+             rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+             rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+             rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+             rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+             fjx0             = _fjsp_setzero_v2r8();
+             fjy0             = _fjsp_setzero_v2r8();
+             fjz0             = _fjsp_setzero_v2r8();
+             fjx1             = _fjsp_setzero_v2r8();
+             fjy1             = _fjsp_setzero_v2r8();
+             fjz1             = _fjsp_setzero_v2r8();
+             fjx2             = _fjsp_setzero_v2r8();
+             fjy2             = _fjsp_setzero_v2r8();
+             fjz2             = _fjsp_setzero_v2r8();
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+             {
+             r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r00,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq00,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv00,sh_ewald),velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,rinv00),_fjsp_sub_v2r8(rinvsq00,felec));
+             /* LENNARD-JONES DISPERSION/REPULSION */
+             rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+             vvdw6            = _fjsp_mul_v2r8(c6_00,rinvsix);
+             vvdw12           = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
+             vvdw             = _fjsp_msub_v2r8(_fjsp_nmsub_v2r8(c12_00,_fjsp_mul_v2r8(sh_vdw_invrcut6,sh_vdw_invrcut6),vvdw12),one_twelfth,
+                                            _fjsp_mul_v2r8(_fjsp_nmsub_v2r8( c6_00,sh_vdw_invrcut6,vvdw6),one_sixth));
+             fvdw             = _fjsp_mul_v2r8(_fjsp_sub_v2r8(vvdw12,vvdw6),rinvsq00);
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             vvdw             = _fjsp_and_v2r8(vvdw,cutoff_mask);
+             vvdw             = _fjsp_unpacklo_v2r8(vvdw,_fjsp_setzero_v2r8());
+             vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+             fscal            = _fjsp_add_v2r8(felec,fvdw);
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             
+             fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq01,rcutoff2))
+             {
+             r01              = _fjsp_mul_v2r8(rsq01,rinv01);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r01,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq01,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv01,sh_ewald),velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq01,rinv01),_fjsp_sub_v2r8(rinvsq01,felec));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq01,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx01,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy01,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz01,fscal,fiz0);
+             
+             fjx1             = _fjsp_madd_v2r8(dx01,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy01,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz01,fscal,fjz1);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq02,rcutoff2))
+             {
+             r02              = _fjsp_mul_v2r8(rsq02,rinv02);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r02,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq02,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv02,sh_ewald),velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq02,rinv02),_fjsp_sub_v2r8(rinvsq02,felec));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq02,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx02,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy02,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz02,fscal,fiz0);
+             
+             fjx2             = _fjsp_madd_v2r8(dx02,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy02,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz02,fscal,fjz2);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq10,rcutoff2))
+             {
+             r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r10,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq10,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv10,sh_ewald),velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,rinv10),_fjsp_sub_v2r8(rinvsq10,felec));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq10,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+             
+             fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq11,rcutoff2))
+             {
+             r11              = _fjsp_mul_v2r8(rsq11,rinv11);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r11,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq11,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv11,sh_ewald),velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq11,rinv11),_fjsp_sub_v2r8(rinvsq11,felec));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq11,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+             
+             fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq12,rcutoff2))
+             {
+             r12              = _fjsp_mul_v2r8(rsq12,rinv12);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r12,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq12,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv12,sh_ewald),velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq12,rinv12),_fjsp_sub_v2r8(rinvsq12,felec));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq12,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+             
+             fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq20,rcutoff2))
+             {
+             r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r20,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq20,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv20,sh_ewald),velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,rinv20),_fjsp_sub_v2r8(rinvsq20,felec));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq20,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+             
+             fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq21,rcutoff2))
+             {
+             r21              = _fjsp_mul_v2r8(rsq21,rinv21);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r21,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq21,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv21,sh_ewald),velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq21,rinv21),_fjsp_sub_v2r8(rinvsq21,felec));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq21,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+             
+             fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq22,rcutoff2))
+             {
+             r22              = _fjsp_mul_v2r8(rsq22,rinv22);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r22,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq22,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv22,sh_ewald),velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq22,rinv22),_fjsp_sub_v2r8(rinvsq22,felec));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq22,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+             
+             fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+             }
+             gmx_fjsp_decrement_3rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+             /* Inner loop uses 459 flops */
+         }
+         /* End of innermost loop */
+         gmx_fjsp_update_iforce_3atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                               f+i_coord_offset,fshift+i_shift_offset);
+         ggid                        = gid[iidx];
+         /* Update potential energies */
+         gmx_fjsp_update_1pot_v2r8(velecsum,kernel_data->energygrp_elec+ggid);
+         gmx_fjsp_update_1pot_v2r8(vvdwsum,kernel_data->energygrp_vdw+ggid);
+         /* Increment number of inner iterations */
+         inneriter                  += j_index_end - j_index_start;
+         /* Outer loop uses 20 flops */
+     }
+     /* Increment number of outer iterations */
+     outeriter        += nri;
+     /* Update outer/inner flops */
+     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_VF,outeriter*20 + inneriter*459);
+ }
+ /*
+  * Gromacs nonbonded kernel:   nb_kernel_ElecEwSh_VdwLJSh_GeomW3W3_F_sparc64_hpc_ace_double
+  * Electrostatics interaction: Ewald
+  * VdW interaction:            LennardJones
+  * Geometry:                   Water3-Water3
+  * Calculate force/pot:        Force
+  */
+ void
+ nb_kernel_ElecEwSh_VdwLJSh_GeomW3W3_F_sparc64_hpc_ace_double
+                     (t_nblist * gmx_restrict                nlist,
+                      rvec * gmx_restrict                    xx,
+                      rvec * gmx_restrict                    ff,
+                      t_forcerec * gmx_restrict              fr,
+                      t_mdatoms * gmx_restrict               mdatoms,
+                      nb_kernel_data_t * gmx_restrict        kernel_data,
+                      t_nrnb * gmx_restrict                  nrnb)
+ {
+     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+      * just 0 for non-waters.
+      * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+      * jnr indices corresponding to data put in the two positions in the SIMD register.
+      */
+     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+     int              jnrA,jnrB;
+     int              j_coord_offsetA,j_coord_offsetB;
+     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+     real             rcutoff_scalar;
+     real             *shiftvec,*fshift,*x,*f;
+     _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+     int              vdwioffset0;
+     _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+     int              vdwioffset1;
+     _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+     int              vdwioffset2;
+     _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+     int              vdwjidx0A,vdwjidx0B;
+     _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+     int              vdwjidx1A,vdwjidx1B;
+     _fjsp_v2r8       jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+     int              vdwjidx2A,vdwjidx2B;
+     _fjsp_v2r8       jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+     _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+     _fjsp_v2r8       dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
+     _fjsp_v2r8       dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
+     _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+     _fjsp_v2r8       dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+     _fjsp_v2r8       dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+     _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+     _fjsp_v2r8       dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+     _fjsp_v2r8       dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+     _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+     real             *charge;
+     int              nvdwtype;
+     _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+     int              *vdwtype;
+     real             *vdwparam;
+     _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+     _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+     _fjsp_v2r8       ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+     real             *ewtab;
+     _fjsp_v2r8       itab_tmp;
+     _fjsp_v2r8       dummy_mask,cutoff_mask;
+     _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+     _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+     union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+     x                = xx[0];
+     f                = ff[0];
+     nri              = nlist->nri;
+     iinr             = nlist->iinr;
+     jindex           = nlist->jindex;
+     jjnr             = nlist->jjnr;
+     shiftidx         = nlist->shift;
+     gid              = nlist->gid;
+     shiftvec         = fr->shift_vec[0];
+     fshift           = fr->fshift[0];
+     facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+     charge           = mdatoms->chargeA;
+     nvdwtype         = fr->ntype;
+     vdwparam         = fr->nbfp;
+     vdwtype          = mdatoms->typeA;
+     sh_ewald         = gmx_fjsp_set1_v2r8(fr->ic->sh_ewald);
+     ewtab            = fr->ic->tabq_coul_F;
+     ewtabscale       = gmx_fjsp_set1_v2r8(fr->ic->tabq_scale);
+     ewtabhalfspace   = gmx_fjsp_set1_v2r8(0.5/fr->ic->tabq_scale);
+     /* Setup water-specific parameters */
+     inr              = nlist->iinr[0];
+     iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+0]));
+     iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+     iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+     vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+     jq0              = gmx_fjsp_set1_v2r8(charge[inr+0]);
+     jq1              = gmx_fjsp_set1_v2r8(charge[inr+1]);
+     jq2              = gmx_fjsp_set1_v2r8(charge[inr+2]);
+     vdwjidx0A        = 2*vdwtype[inr+0];
+     qq00             = _fjsp_mul_v2r8(iq0,jq0);
+     c6_00            = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset0+vdwjidx0A]);
+     c12_00           = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset0+vdwjidx0A+1]);
+     qq01             = _fjsp_mul_v2r8(iq0,jq1);
+     qq02             = _fjsp_mul_v2r8(iq0,jq2);
+     qq10             = _fjsp_mul_v2r8(iq1,jq0);
+     qq11             = _fjsp_mul_v2r8(iq1,jq1);
+     qq12             = _fjsp_mul_v2r8(iq1,jq2);
+     qq20             = _fjsp_mul_v2r8(iq2,jq0);
+     qq21             = _fjsp_mul_v2r8(iq2,jq1);
+     qq22             = _fjsp_mul_v2r8(iq2,jq2);
+     /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+     rcutoff_scalar   = fr->rcoulomb;
+     rcutoff          = gmx_fjsp_set1_v2r8(rcutoff_scalar);
+     rcutoff2         = _fjsp_mul_v2r8(rcutoff,rcutoff);
+     sh_vdw_invrcut6  = gmx_fjsp_set1_v2r8(fr->ic->sh_invrc6);
+     rvdw             = gmx_fjsp_set1_v2r8(fr->rvdw);
+     /* Avoid stupid compiler warnings */
+     jnrA = jnrB = 0;
+     j_coord_offsetA = 0;
+     j_coord_offsetB = 0;
+     outeriter        = 0;
+     inneriter        = 0;
+     /* Start outer loop over neighborlists */
+     for(iidx=0; iidx<nri; iidx++)
+     {
+         /* Load shift vector for this list */
+         i_shift_offset   = DIM*shiftidx[iidx];
+         /* Load limits for loop over neighbors */
+         j_index_start    = jindex[iidx];
+         j_index_end      = jindex[iidx+1];
+         /* Get outer coordinate index */
+         inr              = iinr[iidx];
+         i_coord_offset   = DIM*inr;
+         /* Load i particle coords and add shift vector */
+         gmx_fjsp_load_shift_and_3rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                  &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+         fix0             = _fjsp_setzero_v2r8();
+         fiy0             = _fjsp_setzero_v2r8();
+         fiz0             = _fjsp_setzero_v2r8();
+         fix1             = _fjsp_setzero_v2r8();
+         fiy1             = _fjsp_setzero_v2r8();
+         fiz1             = _fjsp_setzero_v2r8();
+         fix2             = _fjsp_setzero_v2r8();
+         fiy2             = _fjsp_setzero_v2r8();
+         fiz2             = _fjsp_setzero_v2r8();
+         /* Start inner kernel loop */
+         for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+         {
+             /* Get j neighbor index, and coordinate index */
+             jnrA             = jjnr[jidx];
+             jnrB             = jjnr[jidx+1];
+             j_coord_offsetA  = DIM*jnrA;
+             j_coord_offsetB  = DIM*jnrB;
+             /* load j atom coordinates */
+             gmx_fjsp_load_3rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+             /* Calculate displacement vector */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             dx01             = _fjsp_sub_v2r8(ix0,jx1);
+             dy01             = _fjsp_sub_v2r8(iy0,jy1);
+             dz01             = _fjsp_sub_v2r8(iz0,jz1);
+             dx02             = _fjsp_sub_v2r8(ix0,jx2);
+             dy02             = _fjsp_sub_v2r8(iy0,jy2);
+             dz02             = _fjsp_sub_v2r8(iz0,jz2);
+             dx10             = _fjsp_sub_v2r8(ix1,jx0);
+             dy10             = _fjsp_sub_v2r8(iy1,jy0);
+             dz10             = _fjsp_sub_v2r8(iz1,jz0);
+             dx11             = _fjsp_sub_v2r8(ix1,jx1);
+             dy11             = _fjsp_sub_v2r8(iy1,jy1);
+             dz11             = _fjsp_sub_v2r8(iz1,jz1);
+             dx12             = _fjsp_sub_v2r8(ix1,jx2);
+             dy12             = _fjsp_sub_v2r8(iy1,jy2);
+             dz12             = _fjsp_sub_v2r8(iz1,jz2);
+             dx20             = _fjsp_sub_v2r8(ix2,jx0);
+             dy20             = _fjsp_sub_v2r8(iy2,jy0);
+             dz20             = _fjsp_sub_v2r8(iz2,jz0);
+             dx21             = _fjsp_sub_v2r8(ix2,jx1);
+             dy21             = _fjsp_sub_v2r8(iy2,jy1);
+             dz21             = _fjsp_sub_v2r8(iz2,jz1);
+             dx22             = _fjsp_sub_v2r8(ix2,jx2);
+             dy22             = _fjsp_sub_v2r8(iy2,jy2);
+             dz22             = _fjsp_sub_v2r8(iz2,jz2);
+             /* Calculate squared distance and things based on it */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rsq01            = gmx_fjsp_calc_rsq_v2r8(dx01,dy01,dz01);
+             rsq02            = gmx_fjsp_calc_rsq_v2r8(dx02,dy02,dz02);
+             rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+             rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+             rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+             rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+             rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+             rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+             rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+             rinv01           = gmx_fjsp_invsqrt_v2r8(rsq01);
+             rinv02           = gmx_fjsp_invsqrt_v2r8(rsq02);
+             rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+             rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+             rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+             rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+             rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+             rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+             rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+             rinvsq01         = _fjsp_mul_v2r8(rinv01,rinv01);
+             rinvsq02         = _fjsp_mul_v2r8(rinv02,rinv02);
+             rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+             rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+             rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+             rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+             rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+             rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+             fjx0             = _fjsp_setzero_v2r8();
+             fjy0             = _fjsp_setzero_v2r8();
+             fjz0             = _fjsp_setzero_v2r8();
+             fjx1             = _fjsp_setzero_v2r8();
+             fjy1             = _fjsp_setzero_v2r8();
+             fjz1             = _fjsp_setzero_v2r8();
+             fjx2             = _fjsp_setzero_v2r8();
+             fjy2             = _fjsp_setzero_v2r8();
+             fjz2             = _fjsp_setzero_v2r8();
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+             {
+             r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r00,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                          &ewtabF,&ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,rinv00),_fjsp_sub_v2r8(rinvsq00,felec));
+             /* LENNARD-JONES DISPERSION/REPULSION */
+             rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+             fvdw             = _fjsp_mul_v2r8(_fjsp_msub_v2r8(c12_00,rinvsix,c6_00),_fjsp_mul_v2r8(rinvsix,rinvsq00));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+             fscal            = _fjsp_add_v2r8(felec,fvdw);
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             
+             fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq01,rcutoff2))
+             {
+             r01              = _fjsp_mul_v2r8(rsq01,rinv01);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r01,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                          &ewtabF,&ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq01,rinv01),_fjsp_sub_v2r8(rinvsq01,felec));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq01,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx01,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy01,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz01,fscal,fiz0);
+             
+             fjx1             = _fjsp_madd_v2r8(dx01,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy01,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz01,fscal,fjz1);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq02,rcutoff2))
+             {
+             r02              = _fjsp_mul_v2r8(rsq02,rinv02);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r02,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                          &ewtabF,&ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq02,rinv02),_fjsp_sub_v2r8(rinvsq02,felec));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq02,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx02,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy02,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz02,fscal,fiz0);
+             
+             fjx2             = _fjsp_madd_v2r8(dx02,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy02,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz02,fscal,fjz2);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq10,rcutoff2))
+             {
+             r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r10,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                          &ewtabF,&ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,rinv10),_fjsp_sub_v2r8(rinvsq10,felec));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq10,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+             
+             fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq11,rcutoff2))
+             {
+             r11              = _fjsp_mul_v2r8(rsq11,rinv11);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r11,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                          &ewtabF,&ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq11,rinv11),_fjsp_sub_v2r8(rinvsq11,felec));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq11,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+             
+             fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq12,rcutoff2))
+             {
+             r12              = _fjsp_mul_v2r8(rsq12,rinv12);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r12,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                          &ewtabF,&ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq12,rinv12),_fjsp_sub_v2r8(rinvsq12,felec));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq12,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+             
+             fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq20,rcutoff2))
+             {
+             r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r20,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                          &ewtabF,&ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,rinv20),_fjsp_sub_v2r8(rinvsq20,felec));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq20,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+             
+             fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq21,rcutoff2))
+             {
+             r21              = _fjsp_mul_v2r8(rsq21,rinv21);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r21,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                          &ewtabF,&ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq21,rinv21),_fjsp_sub_v2r8(rinvsq21,felec));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq21,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+             
+             fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq22,rcutoff2))
+             {
+             r22              = _fjsp_mul_v2r8(rsq22,rinv22);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r22,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                          &ewtabF,&ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq22,rinv22),_fjsp_sub_v2r8(rinvsq22,felec));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq22,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+             
+             fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+             }
+             gmx_fjsp_decrement_3rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+             /* Inner loop uses 385 flops */
+         }
+         if(jidx<j_index_end)
+         {
+             jnrA             = jjnr[jidx];
+             j_coord_offsetA  = DIM*jnrA;
+             /* load j atom coordinates */
+             gmx_fjsp_load_3rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+             /* Calculate displacement vector */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             dx01             = _fjsp_sub_v2r8(ix0,jx1);
+             dy01             = _fjsp_sub_v2r8(iy0,jy1);
+             dz01             = _fjsp_sub_v2r8(iz0,jz1);
+             dx02             = _fjsp_sub_v2r8(ix0,jx2);
+             dy02             = _fjsp_sub_v2r8(iy0,jy2);
+             dz02             = _fjsp_sub_v2r8(iz0,jz2);
+             dx10             = _fjsp_sub_v2r8(ix1,jx0);
+             dy10             = _fjsp_sub_v2r8(iy1,jy0);
+             dz10             = _fjsp_sub_v2r8(iz1,jz0);
+             dx11             = _fjsp_sub_v2r8(ix1,jx1);
+             dy11             = _fjsp_sub_v2r8(iy1,jy1);
+             dz11             = _fjsp_sub_v2r8(iz1,jz1);
+             dx12             = _fjsp_sub_v2r8(ix1,jx2);
+             dy12             = _fjsp_sub_v2r8(iy1,jy2);
+             dz12             = _fjsp_sub_v2r8(iz1,jz2);
+             dx20             = _fjsp_sub_v2r8(ix2,jx0);
+             dy20             = _fjsp_sub_v2r8(iy2,jy0);
+             dz20             = _fjsp_sub_v2r8(iz2,jz0);
+             dx21             = _fjsp_sub_v2r8(ix2,jx1);
+             dy21             = _fjsp_sub_v2r8(iy2,jy1);
+             dz21             = _fjsp_sub_v2r8(iz2,jz1);
+             dx22             = _fjsp_sub_v2r8(ix2,jx2);
+             dy22             = _fjsp_sub_v2r8(iy2,jy2);
+             dz22             = _fjsp_sub_v2r8(iz2,jz2);
+             /* Calculate squared distance and things based on it */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rsq01            = gmx_fjsp_calc_rsq_v2r8(dx01,dy01,dz01);
+             rsq02            = gmx_fjsp_calc_rsq_v2r8(dx02,dy02,dz02);
+             rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+             rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+             rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+             rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+             rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+             rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+             rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+             rinv01           = gmx_fjsp_invsqrt_v2r8(rsq01);
+             rinv02           = gmx_fjsp_invsqrt_v2r8(rsq02);
+             rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+             rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+             rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+             rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+             rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+             rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+             rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+             rinvsq01         = _fjsp_mul_v2r8(rinv01,rinv01);
+             rinvsq02         = _fjsp_mul_v2r8(rinv02,rinv02);
+             rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+             rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+             rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+             rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+             rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+             rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+             fjx0             = _fjsp_setzero_v2r8();
+             fjy0             = _fjsp_setzero_v2r8();
+             fjz0             = _fjsp_setzero_v2r8();
+             fjx1             = _fjsp_setzero_v2r8();
+             fjy1             = _fjsp_setzero_v2r8();
+             fjz1             = _fjsp_setzero_v2r8();
+             fjx2             = _fjsp_setzero_v2r8();
+             fjy2             = _fjsp_setzero_v2r8();
+             fjz2             = _fjsp_setzero_v2r8();
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+             {
+             r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r00,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,rinv00),_fjsp_sub_v2r8(rinvsq00,felec));
+             /* LENNARD-JONES DISPERSION/REPULSION */
+             rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+             fvdw             = _fjsp_mul_v2r8(_fjsp_msub_v2r8(c12_00,rinvsix,c6_00),_fjsp_mul_v2r8(rinvsix,rinvsq00));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+             fscal            = _fjsp_add_v2r8(felec,fvdw);
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             
+             fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq01,rcutoff2))
+             {
+             r01              = _fjsp_mul_v2r8(rsq01,rinv01);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r01,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq01,rinv01),_fjsp_sub_v2r8(rinvsq01,felec));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq01,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx01,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy01,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz01,fscal,fiz0);
+             
+             fjx1             = _fjsp_madd_v2r8(dx01,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy01,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz01,fscal,fjz1);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq02,rcutoff2))
+             {
+             r02              = _fjsp_mul_v2r8(rsq02,rinv02);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r02,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq02,rinv02),_fjsp_sub_v2r8(rinvsq02,felec));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq02,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx02,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy02,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz02,fscal,fiz0);
+             
+             fjx2             = _fjsp_madd_v2r8(dx02,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy02,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz02,fscal,fjz2);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq10,rcutoff2))
+             {
+             r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r10,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,rinv10),_fjsp_sub_v2r8(rinvsq10,felec));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq10,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+             
+             fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq11,rcutoff2))
+             {
+             r11              = _fjsp_mul_v2r8(rsq11,rinv11);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r11,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq11,rinv11),_fjsp_sub_v2r8(rinvsq11,felec));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq11,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+             
+             fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq12,rcutoff2))
+             {
+             r12              = _fjsp_mul_v2r8(rsq12,rinv12);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r12,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq12,rinv12),_fjsp_sub_v2r8(rinvsq12,felec));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq12,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+             
+             fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq20,rcutoff2))
+             {
+             r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r20,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,rinv20),_fjsp_sub_v2r8(rinvsq20,felec));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq20,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+             
+             fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq21,rcutoff2))
+             {
+             r21              = _fjsp_mul_v2r8(rsq21,rinv21);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r21,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq21,rinv21),_fjsp_sub_v2r8(rinvsq21,felec));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq21,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+             
+             fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq22,rcutoff2))
+             {
+             r22              = _fjsp_mul_v2r8(rsq22,rinv22);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r22,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq22,rinv22),_fjsp_sub_v2r8(rinvsq22,felec));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq22,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+             
+             fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+             }
+             gmx_fjsp_decrement_3rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+             /* Inner loop uses 385 flops */
+         }
+         /* End of innermost loop */
+         gmx_fjsp_update_iforce_3atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                               f+i_coord_offset,fshift+i_shift_offset);
+         /* Increment number of inner iterations */
+         inneriter                  += j_index_end - j_index_start;
+         /* Outer loop uses 18 flops */
+     }
+     /* Increment number of outer iterations */
+     outeriter        += nri;
+     /* Update outer/inner flops */
+     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_F,outeriter*18 + inneriter*385);
+ }
index 0000000000000000000000000000000000000000,0e61a060b76043ab2cc540969668fe75a69ae8b9..0e61a060b76043ab2cc540969668fe75a69ae8b9
mode 000000,100644..100644
--- /dev/null
@@@ -1,0 -1,1312 +1,1312 @@@
+ /*
+  * This file is part of the GROMACS molecular simulation package.
+  *
+  * Copyright (c) 2012, by the GROMACS development team, led by
+  * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+  * others, as listed in the AUTHORS file in the top-level source
+  * directory and at http://www.gromacs.org.
+  *
+  * GROMACS is free software; you can redistribute it and/or
+  * modify it under the terms of the GNU Lesser General Public License
+  * as published by the Free Software Foundation; either version 2.1
+  * of the License, or (at your option) any later version.
+  *
+  * GROMACS is distributed in the hope that it will be useful,
+  * but WITHOUT ANY WARRANTY; without even the implied warranty of
+  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  * Lesser General Public License for more details.
+  *
+  * You should have received a copy of the GNU Lesser General Public
+  * License along with GROMACS; if not, see
+  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+  *
+  * If you want to redistribute modifications to GROMACS, please
+  * consider that scientific software is very special. Version
+  * control is crucial - bugs must be traceable. We will be happy to
+  * consider code for inclusion in the official distribution, but
+  * derived work must not be called official GROMACS. Details are found
+  * in the README & COPYING files - if they are missing, get the
+  * official version at http://www.gromacs.org.
+  *
+  * To help us fund GROMACS development, we humbly ask that you cite
+  * the research papers on the package. Check out http://www.gromacs.org.
+  */
+ /*
+  * Note: this file was generated by the GROMACS sparc64_hpc_ace_double kernel generator.
+  */
+ #ifdef HAVE_CONFIG_H
+ #include <config.h>
+ #endif
+ #include <math.h>
+ #include "../nb_kernel.h"
+ #include "types/simple.h"
+ #include "vec.h"
+ #include "nrnb.h"
+ #include "kernelutil_sparc64_hpc_ace_double.h"
+ /*
+  * Gromacs nonbonded kernel:   nb_kernel_ElecEwSh_VdwLJSh_GeomW4P1_VF_sparc64_hpc_ace_double
+  * Electrostatics interaction: Ewald
+  * VdW interaction:            LennardJones
+  * Geometry:                   Water4-Particle
+  * Calculate force/pot:        PotentialAndForce
+  */
+ void
+ nb_kernel_ElecEwSh_VdwLJSh_GeomW4P1_VF_sparc64_hpc_ace_double
+                     (t_nblist * gmx_restrict                nlist,
+                      rvec * gmx_restrict                    xx,
+                      rvec * gmx_restrict                    ff,
+                      t_forcerec * gmx_restrict              fr,
+                      t_mdatoms * gmx_restrict               mdatoms,
+                      nb_kernel_data_t * gmx_restrict        kernel_data,
+                      t_nrnb * gmx_restrict                  nrnb)
+ {
+     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+      * just 0 for non-waters.
+      * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+      * jnr indices corresponding to data put in the two positions in the SIMD register.
+      */
+     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+     int              jnrA,jnrB;
+     int              j_coord_offsetA,j_coord_offsetB;
+     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+     real             rcutoff_scalar;
+     real             *shiftvec,*fshift,*x,*f;
+     _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+     int              vdwioffset0;
+     _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+     int              vdwioffset1;
+     _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+     int              vdwioffset2;
+     _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+     int              vdwioffset3;
+     _fjsp_v2r8       ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+     int              vdwjidx0A,vdwjidx0B;
+     _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+     _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+     _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+     _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+     _fjsp_v2r8       dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30;
+     _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+     real             *charge;
+     int              nvdwtype;
+     _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+     int              *vdwtype;
+     real             *vdwparam;
+     _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+     _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+     _fjsp_v2r8       ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+     real             *ewtab;
+     _fjsp_v2r8       itab_tmp;
+     _fjsp_v2r8       dummy_mask,cutoff_mask;
+     _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+     _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+     union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+     x                = xx[0];
+     f                = ff[0];
+     nri              = nlist->nri;
+     iinr             = nlist->iinr;
+     jindex           = nlist->jindex;
+     jjnr             = nlist->jjnr;
+     shiftidx         = nlist->shift;
+     gid              = nlist->gid;
+     shiftvec         = fr->shift_vec[0];
+     fshift           = fr->fshift[0];
+     facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+     charge           = mdatoms->chargeA;
+     nvdwtype         = fr->ntype;
+     vdwparam         = fr->nbfp;
+     vdwtype          = mdatoms->typeA;
+     sh_ewald         = gmx_fjsp_set1_v2r8(fr->ic->sh_ewald);
+     ewtab            = fr->ic->tabq_coul_FDV0;
+     ewtabscale       = gmx_fjsp_set1_v2r8(fr->ic->tabq_scale);
+     ewtabhalfspace   = gmx_fjsp_set1_v2r8(0.5/fr->ic->tabq_scale);
+     /* Setup water-specific parameters */
+     inr              = nlist->iinr[0];
+     iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+     iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+     iq3              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+3]));
+     vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+     /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+     rcutoff_scalar   = fr->rcoulomb;
+     rcutoff          = gmx_fjsp_set1_v2r8(rcutoff_scalar);
+     rcutoff2         = _fjsp_mul_v2r8(rcutoff,rcutoff);
+     sh_vdw_invrcut6  = gmx_fjsp_set1_v2r8(fr->ic->sh_invrc6);
+     rvdw             = gmx_fjsp_set1_v2r8(fr->rvdw);
+     /* Avoid stupid compiler warnings */
+     jnrA = jnrB = 0;
+     j_coord_offsetA = 0;
+     j_coord_offsetB = 0;
+     outeriter        = 0;
+     inneriter        = 0;
+     /* Start outer loop over neighborlists */
+     for(iidx=0; iidx<nri; iidx++)
+     {
+         /* Load shift vector for this list */
+         i_shift_offset   = DIM*shiftidx[iidx];
+         /* Load limits for loop over neighbors */
+         j_index_start    = jindex[iidx];
+         j_index_end      = jindex[iidx+1];
+         /* Get outer coordinate index */
+         inr              = iinr[iidx];
+         i_coord_offset   = DIM*inr;
+         /* Load i particle coords and add shift vector */
+         gmx_fjsp_load_shift_and_4rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                  &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+         fix0             = _fjsp_setzero_v2r8();
+         fiy0             = _fjsp_setzero_v2r8();
+         fiz0             = _fjsp_setzero_v2r8();
+         fix1             = _fjsp_setzero_v2r8();
+         fiy1             = _fjsp_setzero_v2r8();
+         fiz1             = _fjsp_setzero_v2r8();
+         fix2             = _fjsp_setzero_v2r8();
+         fiy2             = _fjsp_setzero_v2r8();
+         fiz2             = _fjsp_setzero_v2r8();
+         fix3             = _fjsp_setzero_v2r8();
+         fiy3             = _fjsp_setzero_v2r8();
+         fiz3             = _fjsp_setzero_v2r8();
+         /* Reset potential sums */
+         velecsum         = _fjsp_setzero_v2r8();
+         vvdwsum          = _fjsp_setzero_v2r8();
+         /* Start inner kernel loop */
+         for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+         {
+             /* Get j neighbor index, and coordinate index */
+             jnrA             = jjnr[jidx];
+             jnrB             = jjnr[jidx+1];
+             j_coord_offsetA  = DIM*jnrA;
+             j_coord_offsetB  = DIM*jnrB;
+             /* load j atom coordinates */
+             gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                               &jx0,&jy0,&jz0);
+             /* Calculate displacement vector */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             dx10             = _fjsp_sub_v2r8(ix1,jx0);
+             dy10             = _fjsp_sub_v2r8(iy1,jy0);
+             dz10             = _fjsp_sub_v2r8(iz1,jz0);
+             dx20             = _fjsp_sub_v2r8(ix2,jx0);
+             dy20             = _fjsp_sub_v2r8(iy2,jy0);
+             dz20             = _fjsp_sub_v2r8(iz2,jz0);
+             dx30             = _fjsp_sub_v2r8(ix3,jx0);
+             dy30             = _fjsp_sub_v2r8(iy3,jy0);
+             dz30             = _fjsp_sub_v2r8(iz3,jz0);
+             /* Calculate squared distance and things based on it */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+             rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+             rsq30            = gmx_fjsp_calc_rsq_v2r8(dx30,dy30,dz30);
+             rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+             rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+             rinv30           = gmx_fjsp_invsqrt_v2r8(rsq30);
+             rinvsq00         = gmx_fjsp_inv_v2r8(rsq00);
+             rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+             rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+             rinvsq30         = _fjsp_mul_v2r8(rinv30,rinv30);
+             /* Load parameters for j particles */
+             jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+             vdwjidx0A        = 2*vdwtype[jnrA+0];
+             vdwjidx0B        = 2*vdwtype[jnrB+0];
+             fjx0             = _fjsp_setzero_v2r8();
+             fjy0             = _fjsp_setzero_v2r8();
+             fjz0             = _fjsp_setzero_v2r8();
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+             {
+             /* Compute parameters for interactions between i and j atoms */
+             gmx_fjsp_load_2pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,
+                                          vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
+             /* LENNARD-JONES DISPERSION/REPULSION */
+             rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+             vvdw6            = _fjsp_mul_v2r8(c6_00,rinvsix);
+             vvdw12           = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
+             vvdw             = _fjsp_msub_v2r8(_fjsp_nmsub_v2r8(c12_00,_fjsp_mul_v2r8(sh_vdw_invrcut6,sh_vdw_invrcut6),vvdw12),one_twelfth,
+                                            _fjsp_mul_v2r8(_fjsp_nmsub_v2r8( c6_00,sh_vdw_invrcut6,vvdw6),one_sixth));
+             fvdw             = _fjsp_mul_v2r8(_fjsp_sub_v2r8(vvdw12,vvdw6),rinvsq00);
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             vvdw             = _fjsp_and_v2r8(vvdw,cutoff_mask);
+             vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+             fscal            = fvdw;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             
+             fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq10,rcutoff2))
+             {
+             r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+             /* Compute parameters for interactions between i and j atoms */
+             qq10             = _fjsp_mul_v2r8(iq1,jq0);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r10,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq10,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv10,sh_ewald),velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,rinv10),_fjsp_sub_v2r8(rinvsq10,felec));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq10,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+             
+             fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq20,rcutoff2))
+             {
+             r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+             /* Compute parameters for interactions between i and j atoms */
+             qq20             = _fjsp_mul_v2r8(iq2,jq0);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r20,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq20,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv20,sh_ewald),velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,rinv20),_fjsp_sub_v2r8(rinvsq20,felec));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq20,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+             
+             fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq30,rcutoff2))
+             {
+             r30              = _fjsp_mul_v2r8(rsq30,rinv30);
+             /* Compute parameters for interactions between i and j atoms */
+             qq30             = _fjsp_mul_v2r8(iq3,jq0);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r30,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq30,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv30,sh_ewald),velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq30,rinv30),_fjsp_sub_v2r8(rinvsq30,felec));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq30,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix3             = _fjsp_madd_v2r8(dx30,fscal,fix3);
+             fiy3             = _fjsp_madd_v2r8(dy30,fscal,fiy3);
+             fiz3             = _fjsp_madd_v2r8(dz30,fscal,fiz3);
+             
+             fjx0             = _fjsp_madd_v2r8(dx30,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy30,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz30,fscal,fjz0);
+             }
+             gmx_fjsp_decrement_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0);
+             /* Inner loop uses 194 flops */
+         }
+         if(jidx<j_index_end)
+         {
+             jnrA             = jjnr[jidx];
+             j_coord_offsetA  = DIM*jnrA;
+             /* load j atom coordinates */
+             gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                               &jx0,&jy0,&jz0);
+             /* Calculate displacement vector */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             dx10             = _fjsp_sub_v2r8(ix1,jx0);
+             dy10             = _fjsp_sub_v2r8(iy1,jy0);
+             dz10             = _fjsp_sub_v2r8(iz1,jz0);
+             dx20             = _fjsp_sub_v2r8(ix2,jx0);
+             dy20             = _fjsp_sub_v2r8(iy2,jy0);
+             dz20             = _fjsp_sub_v2r8(iz2,jz0);
+             dx30             = _fjsp_sub_v2r8(ix3,jx0);
+             dy30             = _fjsp_sub_v2r8(iy3,jy0);
+             dz30             = _fjsp_sub_v2r8(iz3,jz0);
+             /* Calculate squared distance and things based on it */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+             rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+             rsq30            = gmx_fjsp_calc_rsq_v2r8(dx30,dy30,dz30);
+             rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+             rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+             rinv30           = gmx_fjsp_invsqrt_v2r8(rsq30);
+             rinvsq00         = gmx_fjsp_inv_v2r8(rsq00);
+             rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+             rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+             rinvsq30         = _fjsp_mul_v2r8(rinv30,rinv30);
+             /* Load parameters for j particles */
+             jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
+             vdwjidx0A        = 2*vdwtype[jnrA+0];
+             fjx0             = _fjsp_setzero_v2r8();
+             fjy0             = _fjsp_setzero_v2r8();
+             fjz0             = _fjsp_setzero_v2r8();
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+             {
+             /* Compute parameters for interactions between i and j atoms */
+             gmx_fjsp_load_1pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
+             /* LENNARD-JONES DISPERSION/REPULSION */
+             rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+             vvdw6            = _fjsp_mul_v2r8(c6_00,rinvsix);
+             vvdw12           = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
+             vvdw             = _fjsp_msub_v2r8(_fjsp_nmsub_v2r8(c12_00,_fjsp_mul_v2r8(sh_vdw_invrcut6,sh_vdw_invrcut6),vvdw12),one_twelfth,
+                                            _fjsp_mul_v2r8(_fjsp_nmsub_v2r8( c6_00,sh_vdw_invrcut6,vvdw6),one_sixth));
+             fvdw             = _fjsp_mul_v2r8(_fjsp_sub_v2r8(vvdw12,vvdw6),rinvsq00);
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             vvdw             = _fjsp_and_v2r8(vvdw,cutoff_mask);
+             vvdw             = _fjsp_unpacklo_v2r8(vvdw,_fjsp_setzero_v2r8());
+             vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+             fscal            = fvdw;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             
+             fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq10,rcutoff2))
+             {
+             r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+             /* Compute parameters for interactions between i and j atoms */
+             qq10             = _fjsp_mul_v2r8(iq1,jq0);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r10,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq10,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv10,sh_ewald),velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,rinv10),_fjsp_sub_v2r8(rinvsq10,felec));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq10,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+             
+             fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq20,rcutoff2))
+             {
+             r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+             /* Compute parameters for interactions between i and j atoms */
+             qq20             = _fjsp_mul_v2r8(iq2,jq0);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r20,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq20,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv20,sh_ewald),velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,rinv20),_fjsp_sub_v2r8(rinvsq20,felec));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq20,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+             
+             fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq30,rcutoff2))
+             {
+             r30              = _fjsp_mul_v2r8(rsq30,rinv30);
+             /* Compute parameters for interactions between i and j atoms */
+             qq30             = _fjsp_mul_v2r8(iq3,jq0);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r30,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq30,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv30,sh_ewald),velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq30,rinv30),_fjsp_sub_v2r8(rinvsq30,felec));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq30,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix3             = _fjsp_madd_v2r8(dx30,fscal,fix3);
+             fiy3             = _fjsp_madd_v2r8(dy30,fscal,fiy3);
+             fiz3             = _fjsp_madd_v2r8(dz30,fscal,fiz3);
+             
+             fjx0             = _fjsp_madd_v2r8(dx30,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy30,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz30,fscal,fjz0);
+             }
+             gmx_fjsp_decrement_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0);
+             /* Inner loop uses 194 flops */
+         }
+         /* End of innermost loop */
+         gmx_fjsp_update_iforce_4atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                               f+i_coord_offset,fshift+i_shift_offset);
+         ggid                        = gid[iidx];
+         /* Update potential energies */
+         gmx_fjsp_update_1pot_v2r8(velecsum,kernel_data->energygrp_elec+ggid);
+         gmx_fjsp_update_1pot_v2r8(vvdwsum,kernel_data->energygrp_vdw+ggid);
+         /* Increment number of inner iterations */
+         inneriter                  += j_index_end - j_index_start;
+         /* Outer loop uses 26 flops */
+     }
+     /* Increment number of outer iterations */
+     outeriter        += nri;
+     /* Update outer/inner flops */
+     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_VF,outeriter*26 + inneriter*194);
+ }
+ /*
+  * Gromacs nonbonded kernel:   nb_kernel_ElecEwSh_VdwLJSh_GeomW4P1_F_sparc64_hpc_ace_double
+  * Electrostatics interaction: Ewald
+  * VdW interaction:            LennardJones
+  * Geometry:                   Water4-Particle
+  * Calculate force/pot:        Force
+  */
+ void
+ nb_kernel_ElecEwSh_VdwLJSh_GeomW4P1_F_sparc64_hpc_ace_double
+                     (t_nblist * gmx_restrict                nlist,
+                      rvec * gmx_restrict                    xx,
+                      rvec * gmx_restrict                    ff,
+                      t_forcerec * gmx_restrict              fr,
+                      t_mdatoms * gmx_restrict               mdatoms,
+                      nb_kernel_data_t * gmx_restrict        kernel_data,
+                      t_nrnb * gmx_restrict                  nrnb)
+ {
+     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+      * just 0 for non-waters.
+      * Suffixes A,B refer to j loop unrolling done with double precision SIMD, i.e. the two different
+      * jnr indices corresponding to data put in the two positions of the SIMD register.
+      */
+     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+     int              jnrA,jnrB;
+     int              j_coord_offsetA,j_coord_offsetB;
+     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+     real             rcutoff_scalar;
+     real             *shiftvec,*fshift,*x,*f;
+     _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+     int              vdwioffset0;
+     _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+     int              vdwioffset1;
+     _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+     int              vdwioffset2;
+     _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+     int              vdwioffset3;
+     _fjsp_v2r8       ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+     int              vdwjidx0A,vdwjidx0B;
+     _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+     _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+     _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+     _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+     _fjsp_v2r8       dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30;
+     _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+     real             *charge;
+     int              nvdwtype;
+     _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+     int              *vdwtype;
+     real             *vdwparam;
+     _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+     _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+     _fjsp_v2r8       ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+     real             *ewtab;
+     _fjsp_v2r8       itab_tmp;
+     _fjsp_v2r8       dummy_mask,cutoff_mask;
+     _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+     _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+     union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+     x                = xx[0];
+     f                = ff[0];
+     nri              = nlist->nri;
+     iinr             = nlist->iinr;
+     jindex           = nlist->jindex;
+     jjnr             = nlist->jjnr;
+     shiftidx         = nlist->shift;
+     gid              = nlist->gid;
+     shiftvec         = fr->shift_vec[0];
+     fshift           = fr->fshift[0];
+     facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+     charge           = mdatoms->chargeA;
+     nvdwtype         = fr->ntype;
+     vdwparam         = fr->nbfp;
+     vdwtype          = mdatoms->typeA;
+     sh_ewald         = gmx_fjsp_set1_v2r8(fr->ic->sh_ewald);
+     ewtab            = fr->ic->tabq_coul_F;
+     ewtabscale       = gmx_fjsp_set1_v2r8(fr->ic->tabq_scale);
+     ewtabhalfspace   = gmx_fjsp_set1_v2r8(0.5/fr->ic->tabq_scale);
+     /* Setup water-specific parameters */
+     inr              = nlist->iinr[0];
+     iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+     iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+     iq3              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+3]));
+     vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+     /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+     rcutoff_scalar   = fr->rcoulomb;
+     rcutoff          = gmx_fjsp_set1_v2r8(rcutoff_scalar);
+     rcutoff2         = _fjsp_mul_v2r8(rcutoff,rcutoff);
+     sh_vdw_invrcut6  = gmx_fjsp_set1_v2r8(fr->ic->sh_invrc6);
+     rvdw             = gmx_fjsp_set1_v2r8(fr->rvdw);
+     /* Avoid stupid compiler warnings */
+     jnrA = jnrB = 0;
+     j_coord_offsetA = 0;
+     j_coord_offsetB = 0;
+     outeriter        = 0;
+     inneriter        = 0;
+     /* Start outer loop over neighborlists */
+     for(iidx=0; iidx<nri; iidx++)
+     {
+         /* Load shift vector for this list */
+         i_shift_offset   = DIM*shiftidx[iidx];
+         /* Load limits for loop over neighbors */
+         j_index_start    = jindex[iidx];
+         j_index_end      = jindex[iidx+1];
+         /* Get outer coordinate index */
+         inr              = iinr[iidx];
+         i_coord_offset   = DIM*inr;
+         /* Load i particle coords and add shift vector */
+         gmx_fjsp_load_shift_and_4rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                  &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+         fix0             = _fjsp_setzero_v2r8();
+         fiy0             = _fjsp_setzero_v2r8();
+         fiz0             = _fjsp_setzero_v2r8();
+         fix1             = _fjsp_setzero_v2r8();
+         fiy1             = _fjsp_setzero_v2r8();
+         fiz1             = _fjsp_setzero_v2r8();
+         fix2             = _fjsp_setzero_v2r8();
+         fiy2             = _fjsp_setzero_v2r8();
+         fiz2             = _fjsp_setzero_v2r8();
+         fix3             = _fjsp_setzero_v2r8();
+         fiy3             = _fjsp_setzero_v2r8();
+         fiz3             = _fjsp_setzero_v2r8();
+         /* Start inner kernel loop */
+         for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+         {
+             /* Get j neighbor index, and coordinate index */
+             jnrA             = jjnr[jidx];
+             jnrB             = jjnr[jidx+1];
+             j_coord_offsetA  = DIM*jnrA;
+             j_coord_offsetB  = DIM*jnrB;
+             /* load j atom coordinates */
+             gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                               &jx0,&jy0,&jz0);
+             /* Calculate displacement vector */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             dx10             = _fjsp_sub_v2r8(ix1,jx0);
+             dy10             = _fjsp_sub_v2r8(iy1,jy0);
+             dz10             = _fjsp_sub_v2r8(iz1,jz0);
+             dx20             = _fjsp_sub_v2r8(ix2,jx0);
+             dy20             = _fjsp_sub_v2r8(iy2,jy0);
+             dz20             = _fjsp_sub_v2r8(iz2,jz0);
+             dx30             = _fjsp_sub_v2r8(ix3,jx0);
+             dy30             = _fjsp_sub_v2r8(iy3,jy0);
+             dz30             = _fjsp_sub_v2r8(iz3,jz0);
+             /* Calculate squared distance and things based on it */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+             rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+             rsq30            = gmx_fjsp_calc_rsq_v2r8(dx30,dy30,dz30);
+             rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+             rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+             rinv30           = gmx_fjsp_invsqrt_v2r8(rsq30);
+             rinvsq00         = gmx_fjsp_inv_v2r8(rsq00);
+             rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+             rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+             rinvsq30         = _fjsp_mul_v2r8(rinv30,rinv30);
+             /* Load parameters for j particles */
+             jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+             vdwjidx0A        = 2*vdwtype[jnrA+0];
+             vdwjidx0B        = 2*vdwtype[jnrB+0];
+             fjx0             = _fjsp_setzero_v2r8();
+             fjy0             = _fjsp_setzero_v2r8();
+             fjz0             = _fjsp_setzero_v2r8();
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+             {
+             /* Compute parameters for interactions between i and j atoms */
+             gmx_fjsp_load_2pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,
+                                          vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
+             /* LENNARD-JONES DISPERSION/REPULSION */
+             rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+             fvdw             = _fjsp_mul_v2r8(_fjsp_msub_v2r8(c12_00,rinvsix,c6_00),_fjsp_mul_v2r8(rinvsix,rinvsq00));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+             fscal            = fvdw;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             
+             fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq10,rcutoff2))
+             {
+             r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+             /* Compute parameters for interactions between i and j atoms */
+             qq10             = _fjsp_mul_v2r8(iq1,jq0);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r10,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                          &ewtabF,&ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,rinv10),_fjsp_sub_v2r8(rinvsq10,felec));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq10,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+             
+             fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq20,rcutoff2))
+             {
+             r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+             /* Compute parameters for interactions between i and j atoms */
+             qq20             = _fjsp_mul_v2r8(iq2,jq0);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r20,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                          &ewtabF,&ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,rinv20),_fjsp_sub_v2r8(rinvsq20,felec));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq20,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+             
+             fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq30,rcutoff2))
+             {
+             r30              = _fjsp_mul_v2r8(rsq30,rinv30);
+             /* Compute parameters for interactions between i and j atoms */
+             qq30             = _fjsp_mul_v2r8(iq3,jq0);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r30,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                          &ewtabF,&ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq30,rinv30),_fjsp_sub_v2r8(rinvsq30,felec));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq30,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix3             = _fjsp_madd_v2r8(dx30,fscal,fix3);
+             fiy3             = _fjsp_madd_v2r8(dy30,fscal,fiy3);
+             fiz3             = _fjsp_madd_v2r8(dz30,fscal,fiz3);
+             
+             fjx0             = _fjsp_madd_v2r8(dx30,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy30,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz30,fscal,fjz0);
+             }
+             gmx_fjsp_decrement_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0);
+             /* Inner loop uses 162 flops */
+         }
+         if(jidx<j_index_end)
+         {
+             jnrA             = jjnr[jidx];
+             j_coord_offsetA  = DIM*jnrA;
+             /* load j atom coordinates */
+             gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                               &jx0,&jy0,&jz0);
+             /* Calculate displacement vector */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             dx10             = _fjsp_sub_v2r8(ix1,jx0);
+             dy10             = _fjsp_sub_v2r8(iy1,jy0);
+             dz10             = _fjsp_sub_v2r8(iz1,jz0);
+             dx20             = _fjsp_sub_v2r8(ix2,jx0);
+             dy20             = _fjsp_sub_v2r8(iy2,jy0);
+             dz20             = _fjsp_sub_v2r8(iz2,jz0);
+             dx30             = _fjsp_sub_v2r8(ix3,jx0);
+             dy30             = _fjsp_sub_v2r8(iy3,jy0);
+             dz30             = _fjsp_sub_v2r8(iz3,jz0);
+             /* Calculate squared distance and things based on it */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+             rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+             rsq30            = gmx_fjsp_calc_rsq_v2r8(dx30,dy30,dz30);
+             rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+             rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+             rinv30           = gmx_fjsp_invsqrt_v2r8(rsq30);
+             rinvsq00         = gmx_fjsp_inv_v2r8(rsq00);
+             rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+             rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+             rinvsq30         = _fjsp_mul_v2r8(rinv30,rinv30);
+             /* Load parameters for j particles */
+             jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
+             vdwjidx0A        = 2*vdwtype[jnrA+0];
+             fjx0             = _fjsp_setzero_v2r8();
+             fjy0             = _fjsp_setzero_v2r8();
+             fjz0             = _fjsp_setzero_v2r8();
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+             {
+             /* Compute parameters for interactions between i and j atoms */
+             gmx_fjsp_load_1pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
+             /* LENNARD-JONES DISPERSION/REPULSION */
+             rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+             fvdw             = _fjsp_mul_v2r8(_fjsp_msub_v2r8(c12_00,rinvsix,c6_00),_fjsp_mul_v2r8(rinvsix,rinvsq00));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+             fscal            = fvdw;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             
+             fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq10,rcutoff2))
+             {
+             r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+             /* Compute parameters for interactions between i and j atoms */
+             qq10             = _fjsp_mul_v2r8(iq1,jq0);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r10,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,rinv10),_fjsp_sub_v2r8(rinvsq10,felec));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq10,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+             
+             fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq20,rcutoff2))
+             {
+             r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+             /* Compute parameters for interactions between i and j atoms */
+             qq20             = _fjsp_mul_v2r8(iq2,jq0);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r20,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,rinv20),_fjsp_sub_v2r8(rinvsq20,felec));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq20,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+             
+             fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq30,rcutoff2))
+             {
+             r30              = _fjsp_mul_v2r8(rsq30,rinv30);
+             /* Compute parameters for interactions between i and j atoms */
+             qq30             = _fjsp_mul_v2r8(iq3,jq0);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r30,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq30,rinv30),_fjsp_sub_v2r8(rinvsq30,felec));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq30,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix3             = _fjsp_madd_v2r8(dx30,fscal,fix3);
+             fiy3             = _fjsp_madd_v2r8(dy30,fscal,fiy3);
+             fiz3             = _fjsp_madd_v2r8(dz30,fscal,fiz3);
+             
+             fjx0             = _fjsp_madd_v2r8(dx30,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy30,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz30,fscal,fjz0);
+             }
+             gmx_fjsp_decrement_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0);
+             /* Inner loop uses 162 flops */
+         }
+         /* End of innermost loop */
+         gmx_fjsp_update_iforce_4atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                               f+i_coord_offset,fshift+i_shift_offset);
+         /* Increment number of inner iterations */
+         inneriter                  += j_index_end - j_index_start;
+         /* Outer loop uses 24 flops */
+     }
+     /* Increment number of outer iterations */
+     outeriter        += nri;
+     /* Update outer/inner flops */
+     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_F,outeriter*24 + inneriter*162);
+ }
index 0000000000000000000000000000000000000000,54aa1f9b0378a0caff66f423ccd5d5eecb6d907d..54aa1f9b0378a0caff66f423ccd5d5eecb6d907d
mode 000000,100644..100644
--- /dev/null
@@@ -1,0 -1,2564 +1,2564 @@@
+ /*
+  * This file is part of the GROMACS molecular simulation package.
+  *
+  * Copyright (c) 2012, by the GROMACS development team, led by
+  * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+  * others, as listed in the AUTHORS file in the top-level source
+  * directory and at http://www.gromacs.org.
+  *
+  * GROMACS is free software; you can redistribute it and/or
+  * modify it under the terms of the GNU Lesser General Public License
+  * as published by the Free Software Foundation; either version 2.1
+  * of the License, or (at your option) any later version.
+  *
+  * GROMACS is distributed in the hope that it will be useful,
+  * but WITHOUT ANY WARRANTY; without even the implied warranty of
+  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  * Lesser General Public License for more details.
+  *
+  * You should have received a copy of the GNU Lesser General Public
+  * License along with GROMACS; if not, see
+  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+  *
+  * If you want to redistribute modifications to GROMACS, please
+  * consider that scientific software is very special. Version
+  * control is crucial - bugs must be traceable. We will be happy to
+  * consider code for inclusion in the official distribution, but
+  * derived work must not be called official GROMACS. Details are found
+  * in the README & COPYING files - if they are missing, get the
+  * official version at http://www.gromacs.org.
+  *
+  * To help us fund GROMACS development, we humbly ask that you cite
+  * the research papers on the package. Check out http://www.gromacs.org.
+  */
+ /*
+  * Note: this file was generated by the GROMACS sparc64_hpc_ace_double kernel generator.
+  */
+ #ifdef HAVE_CONFIG_H
+ #include <config.h>
+ #endif
+ #include <math.h>
+ #include "../nb_kernel.h"
+ #include "types/simple.h"
+ #include "vec.h"
+ #include "nrnb.h"
+ #include "kernelutil_sparc64_hpc_ace_double.h"
+ /*
+  * Gromacs nonbonded kernel:   nb_kernel_ElecEwSh_VdwLJSh_GeomW4W4_VF_sparc64_hpc_ace_double
+  * Electrostatics interaction: Ewald
+  * VdW interaction:            LennardJones
+  * Geometry:                   Water4-Water4
+  * Calculate force/pot:        PotentialAndForce
+  */
+ void
+ nb_kernel_ElecEwSh_VdwLJSh_GeomW4W4_VF_sparc64_hpc_ace_double
+                     (t_nblist * gmx_restrict                nlist,
+                      rvec * gmx_restrict                    xx,
+                      rvec * gmx_restrict                    ff,
+                      t_forcerec * gmx_restrict              fr,
+                      t_mdatoms * gmx_restrict               mdatoms,
+                      nb_kernel_data_t * gmx_restrict        kernel_data,
+                      t_nrnb * gmx_restrict                  nrnb)
+ {
+     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+      * just 0 for non-waters.
+      * Suffixes A,B refer to j loop unrolling done with double precision SIMD, i.e. for the two different
+      * jnr indices corresponding to data put in the two positions in the SIMD register.
+      */
+     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+     int              jnrA,jnrB;
+     int              j_coord_offsetA,j_coord_offsetB;
+     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+     real             rcutoff_scalar;
+     real             *shiftvec,*fshift,*x,*f;
+     _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+     int              vdwioffset0;
+     _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+     int              vdwioffset1;
+     _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+     int              vdwioffset2;
+     _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+     int              vdwioffset3;
+     _fjsp_v2r8       ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+     int              vdwjidx0A,vdwjidx0B;
+     _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+     int              vdwjidx1A,vdwjidx1B;
+     _fjsp_v2r8       jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+     int              vdwjidx2A,vdwjidx2B;
+     _fjsp_v2r8       jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+     int              vdwjidx3A,vdwjidx3B;
+     _fjsp_v2r8       jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
+     _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+     _fjsp_v2r8       dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+     _fjsp_v2r8       dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+     _fjsp_v2r8       dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
+     _fjsp_v2r8       dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+     _fjsp_v2r8       dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+     _fjsp_v2r8       dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
+     _fjsp_v2r8       dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
+     _fjsp_v2r8       dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
+     _fjsp_v2r8       dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
+     _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+     real             *charge;
+     int              nvdwtype;
+     _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+     int              *vdwtype;
+     real             *vdwparam;
+     _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+     _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+     _fjsp_v2r8       ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+     real             *ewtab;
+     _fjsp_v2r8       itab_tmp;
+     _fjsp_v2r8       dummy_mask,cutoff_mask;
+     _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+     _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+     union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+     x                = xx[0];
+     f                = ff[0];
+     nri              = nlist->nri;
+     iinr             = nlist->iinr;
+     jindex           = nlist->jindex;
+     jjnr             = nlist->jjnr;
+     shiftidx         = nlist->shift;
+     gid              = nlist->gid;
+     shiftvec         = fr->shift_vec[0];
+     fshift           = fr->fshift[0];
+     facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+     charge           = mdatoms->chargeA;
+     nvdwtype         = fr->ntype;
+     vdwparam         = fr->nbfp;
+     vdwtype          = mdatoms->typeA;
+     sh_ewald         = gmx_fjsp_set1_v2r8(fr->ic->sh_ewald);
+     ewtab            = fr->ic->tabq_coul_FDV0;
+     ewtabscale       = gmx_fjsp_set1_v2r8(fr->ic->tabq_scale);
+     ewtabhalfspace   = gmx_fjsp_set1_v2r8(0.5/fr->ic->tabq_scale);
+     /* Setup water-specific parameters */
+     inr              = nlist->iinr[0];
+     iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+     iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+     iq3              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+3]));
+     vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+     jq1              = gmx_fjsp_set1_v2r8(charge[inr+1]);
+     jq2              = gmx_fjsp_set1_v2r8(charge[inr+2]);
+     jq3              = gmx_fjsp_set1_v2r8(charge[inr+3]);
+     vdwjidx0A        = 2*vdwtype[inr+0];
+     c6_00            = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset0+vdwjidx0A]);
+     c12_00           = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset0+vdwjidx0A+1]);
+     qq11             = _fjsp_mul_v2r8(iq1,jq1);
+     qq12             = _fjsp_mul_v2r8(iq1,jq2);
+     qq13             = _fjsp_mul_v2r8(iq1,jq3);
+     qq21             = _fjsp_mul_v2r8(iq2,jq1);
+     qq22             = _fjsp_mul_v2r8(iq2,jq2);
+     qq23             = _fjsp_mul_v2r8(iq2,jq3);
+     qq31             = _fjsp_mul_v2r8(iq3,jq1);
+     qq32             = _fjsp_mul_v2r8(iq3,jq2);
+     qq33             = _fjsp_mul_v2r8(iq3,jq3);
+     /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+     rcutoff_scalar   = fr->rcoulomb;
+     rcutoff          = gmx_fjsp_set1_v2r8(rcutoff_scalar);
+     rcutoff2         = _fjsp_mul_v2r8(rcutoff,rcutoff);
+     sh_vdw_invrcut6  = gmx_fjsp_set1_v2r8(fr->ic->sh_invrc6);
+     rvdw             = gmx_fjsp_set1_v2r8(fr->rvdw);
+     /* Avoid stupid compiler warnings */
+     jnrA = jnrB = 0;
+     j_coord_offsetA = 0;
+     j_coord_offsetB = 0;
+     outeriter        = 0;
+     inneriter        = 0;
+     /* Start outer loop over neighborlists */
+     for(iidx=0; iidx<nri; iidx++)
+     {
+         /* Load shift vector for this list */
+         i_shift_offset   = DIM*shiftidx[iidx];
+         /* Load limits for loop over neighbors */
+         j_index_start    = jindex[iidx];
+         j_index_end      = jindex[iidx+1];
+         /* Get outer coordinate index */
+         inr              = iinr[iidx];
+         i_coord_offset   = DIM*inr;
+         /* Load i particle coords and add shift vector */
+         gmx_fjsp_load_shift_and_4rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                  &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+         fix0             = _fjsp_setzero_v2r8();
+         fiy0             = _fjsp_setzero_v2r8();
+         fiz0             = _fjsp_setzero_v2r8();
+         fix1             = _fjsp_setzero_v2r8();
+         fiy1             = _fjsp_setzero_v2r8();
+         fiz1             = _fjsp_setzero_v2r8();
+         fix2             = _fjsp_setzero_v2r8();
+         fiy2             = _fjsp_setzero_v2r8();
+         fiz2             = _fjsp_setzero_v2r8();
+         fix3             = _fjsp_setzero_v2r8();
+         fiy3             = _fjsp_setzero_v2r8();
+         fiz3             = _fjsp_setzero_v2r8();
+         /* Reset potential sums */
+         velecsum         = _fjsp_setzero_v2r8();
+         vvdwsum          = _fjsp_setzero_v2r8();
+         /* Start inner kernel loop */
+         for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+         {
+             /* Get j neighbor index, and coordinate index */
+             jnrA             = jjnr[jidx];
+             jnrB             = jjnr[jidx+1];
+             j_coord_offsetA  = DIM*jnrA;
+             j_coord_offsetB  = DIM*jnrB;
+             /* load j atom coordinates */
+             gmx_fjsp_load_4rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
+                                               &jy2,&jz2,&jx3,&jy3,&jz3);
+             /* Calculate displacement vector */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             dx11             = _fjsp_sub_v2r8(ix1,jx1);
+             dy11             = _fjsp_sub_v2r8(iy1,jy1);
+             dz11             = _fjsp_sub_v2r8(iz1,jz1);
+             dx12             = _fjsp_sub_v2r8(ix1,jx2);
+             dy12             = _fjsp_sub_v2r8(iy1,jy2);
+             dz12             = _fjsp_sub_v2r8(iz1,jz2);
+             dx13             = _fjsp_sub_v2r8(ix1,jx3);
+             dy13             = _fjsp_sub_v2r8(iy1,jy3);
+             dz13             = _fjsp_sub_v2r8(iz1,jz3);
+             dx21             = _fjsp_sub_v2r8(ix2,jx1);
+             dy21             = _fjsp_sub_v2r8(iy2,jy1);
+             dz21             = _fjsp_sub_v2r8(iz2,jz1);
+             dx22             = _fjsp_sub_v2r8(ix2,jx2);
+             dy22             = _fjsp_sub_v2r8(iy2,jy2);
+             dz22             = _fjsp_sub_v2r8(iz2,jz2);
+             dx23             = _fjsp_sub_v2r8(ix2,jx3);
+             dy23             = _fjsp_sub_v2r8(iy2,jy3);
+             dz23             = _fjsp_sub_v2r8(iz2,jz3);
+             dx31             = _fjsp_sub_v2r8(ix3,jx1);
+             dy31             = _fjsp_sub_v2r8(iy3,jy1);
+             dz31             = _fjsp_sub_v2r8(iz3,jz1);
+             dx32             = _fjsp_sub_v2r8(ix3,jx2);
+             dy32             = _fjsp_sub_v2r8(iy3,jy2);
+             dz32             = _fjsp_sub_v2r8(iz3,jz2);
+             dx33             = _fjsp_sub_v2r8(ix3,jx3);
+             dy33             = _fjsp_sub_v2r8(iy3,jy3);
+             dz33             = _fjsp_sub_v2r8(iz3,jz3);
+             /* Calculate squared distance and things based on it */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+             rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+             rsq13            = gmx_fjsp_calc_rsq_v2r8(dx13,dy13,dz13);
+             rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+             rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+             rsq23            = gmx_fjsp_calc_rsq_v2r8(dx23,dy23,dz23);
+             rsq31            = gmx_fjsp_calc_rsq_v2r8(dx31,dy31,dz31);
+             rsq32            = gmx_fjsp_calc_rsq_v2r8(dx32,dy32,dz32);
+             rsq33            = gmx_fjsp_calc_rsq_v2r8(dx33,dy33,dz33);
+             rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+             rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+             rinv13           = gmx_fjsp_invsqrt_v2r8(rsq13);
+             rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+             rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+             rinv23           = gmx_fjsp_invsqrt_v2r8(rsq23);
+             rinv31           = gmx_fjsp_invsqrt_v2r8(rsq31);
+             rinv32           = gmx_fjsp_invsqrt_v2r8(rsq32);
+             rinv33           = gmx_fjsp_invsqrt_v2r8(rsq33);
+             rinvsq00         = gmx_fjsp_inv_v2r8(rsq00);
+             rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+             rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+             rinvsq13         = _fjsp_mul_v2r8(rinv13,rinv13);
+             rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+             rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+             rinvsq23         = _fjsp_mul_v2r8(rinv23,rinv23);
+             rinvsq31         = _fjsp_mul_v2r8(rinv31,rinv31);
+             rinvsq32         = _fjsp_mul_v2r8(rinv32,rinv32);
+             rinvsq33         = _fjsp_mul_v2r8(rinv33,rinv33);
+             fjx0             = _fjsp_setzero_v2r8();
+             fjy0             = _fjsp_setzero_v2r8();
+             fjz0             = _fjsp_setzero_v2r8();
+             fjx1             = _fjsp_setzero_v2r8();
+             fjy1             = _fjsp_setzero_v2r8();
+             fjz1             = _fjsp_setzero_v2r8();
+             fjx2             = _fjsp_setzero_v2r8();
+             fjy2             = _fjsp_setzero_v2r8();
+             fjz2             = _fjsp_setzero_v2r8();
+             fjx3             = _fjsp_setzero_v2r8();
+             fjy3             = _fjsp_setzero_v2r8();
+             fjz3             = _fjsp_setzero_v2r8();
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+             {
+             /* LENNARD-JONES DISPERSION/REPULSION */
+             rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+             vvdw6            = _fjsp_mul_v2r8(c6_00,rinvsix);
+             vvdw12           = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
+             vvdw             = _fjsp_msub_v2r8(_fjsp_nmsub_v2r8(c12_00,_fjsp_mul_v2r8(sh_vdw_invrcut6,sh_vdw_invrcut6),vvdw12),one_twelfth,
+                                            _fjsp_mul_v2r8(_fjsp_nmsub_v2r8( c6_00,sh_vdw_invrcut6,vvdw6),one_sixth));
+             fvdw             = _fjsp_mul_v2r8(_fjsp_sub_v2r8(vvdw12,vvdw6),rinvsq00);
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             vvdw             = _fjsp_and_v2r8(vvdw,cutoff_mask);
+             vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+             fscal            = fvdw;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             
+             fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq11,rcutoff2))
+             {
+             r11              = _fjsp_mul_v2r8(rsq11,rinv11);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r11,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq11,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv11,sh_ewald),velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq11,rinv11),_fjsp_sub_v2r8(rinvsq11,felec));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq11,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+             
+             fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq12,rcutoff2))
+             {
+             r12              = _fjsp_mul_v2r8(rsq12,rinv12);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r12,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq12,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv12,sh_ewald),velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq12,rinv12),_fjsp_sub_v2r8(rinvsq12,felec));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq12,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+             
+             fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq13,rcutoff2))
+             {
+             r13              = _fjsp_mul_v2r8(rsq13,rinv13);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r13,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq13,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv13,sh_ewald),velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq13,rinv13),_fjsp_sub_v2r8(rinvsq13,felec));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq13,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx13,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy13,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz13,fscal,fiz1);
+             
+             fjx3             = _fjsp_madd_v2r8(dx13,fscal,fjx3);
+             fjy3             = _fjsp_madd_v2r8(dy13,fscal,fjy3);
+             fjz3             = _fjsp_madd_v2r8(dz13,fscal,fjz3);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq21,rcutoff2))
+             {
+             r21              = _fjsp_mul_v2r8(rsq21,rinv21);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r21,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq21,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv21,sh_ewald),velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq21,rinv21),_fjsp_sub_v2r8(rinvsq21,felec));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq21,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+             
+             fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq22,rcutoff2))
+             {
+             r22              = _fjsp_mul_v2r8(rsq22,rinv22);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r22,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq22,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv22,sh_ewald),velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq22,rinv22),_fjsp_sub_v2r8(rinvsq22,felec));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq22,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+             
+             fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq23,rcutoff2))
+             {
+             r23              = _fjsp_mul_v2r8(rsq23,rinv23);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r23,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq23,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv23,sh_ewald),velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq23,rinv23),_fjsp_sub_v2r8(rinvsq23,felec));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq23,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx23,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy23,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz23,fscal,fiz2);
+             
+             fjx3             = _fjsp_madd_v2r8(dx23,fscal,fjx3);
+             fjy3             = _fjsp_madd_v2r8(dy23,fscal,fjy3);
+             fjz3             = _fjsp_madd_v2r8(dz23,fscal,fjz3);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq31,rcutoff2))
+             {
+             r31              = _fjsp_mul_v2r8(rsq31,rinv31);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r31,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq31,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv31,sh_ewald),velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq31,rinv31),_fjsp_sub_v2r8(rinvsq31,felec));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq31,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix3             = _fjsp_madd_v2r8(dx31,fscal,fix3);
+             fiy3             = _fjsp_madd_v2r8(dy31,fscal,fiy3);
+             fiz3             = _fjsp_madd_v2r8(dz31,fscal,fiz3);
+             
+             fjx1             = _fjsp_madd_v2r8(dx31,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy31,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz31,fscal,fjz1);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq32,rcutoff2))
+             {
+             r32              = _fjsp_mul_v2r8(rsq32,rinv32);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r32,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq32,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv32,sh_ewald),velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq32,rinv32),_fjsp_sub_v2r8(rinvsq32,felec));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq32,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix3             = _fjsp_madd_v2r8(dx32,fscal,fix3);
+             fiy3             = _fjsp_madd_v2r8(dy32,fscal,fiy3);
+             fiz3             = _fjsp_madd_v2r8(dz32,fscal,fiz3);
+             
+             fjx2             = _fjsp_madd_v2r8(dx32,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy32,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz32,fscal,fjz2);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq33,rcutoff2))
+             {
+             r33              = _fjsp_mul_v2r8(rsq33,rinv33);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r33,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq33,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv33,sh_ewald),velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq33,rinv33),_fjsp_sub_v2r8(rinvsq33,felec));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq33,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix3             = _fjsp_madd_v2r8(dx33,fscal,fix3);
+             fiy3             = _fjsp_madd_v2r8(dy33,fscal,fiy3);
+             fiz3             = _fjsp_madd_v2r8(dz33,fscal,fiz3);
+             
+             fjx3             = _fjsp_madd_v2r8(dx33,fscal,fjx3);
+             fjy3             = _fjsp_madd_v2r8(dy33,fscal,fjy3);
+             fjz3             = _fjsp_madd_v2r8(dz33,fscal,fjz3);
+             }
+             gmx_fjsp_decrement_4rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+             /* Inner loop uses 488 flops */
+         }
+         if(jidx<j_index_end)
+         {
+             jnrA             = jjnr[jidx];
+             j_coord_offsetA  = DIM*jnrA;
+             /* load j atom coordinates */
+             gmx_fjsp_load_4rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
+                                               &jy2,&jz2,&jx3,&jy3,&jz3);
+             /* Calculate displacement vector */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             dx11             = _fjsp_sub_v2r8(ix1,jx1);
+             dy11             = _fjsp_sub_v2r8(iy1,jy1);
+             dz11             = _fjsp_sub_v2r8(iz1,jz1);
+             dx12             = _fjsp_sub_v2r8(ix1,jx2);
+             dy12             = _fjsp_sub_v2r8(iy1,jy2);
+             dz12             = _fjsp_sub_v2r8(iz1,jz2);
+             dx13             = _fjsp_sub_v2r8(ix1,jx3);
+             dy13             = _fjsp_sub_v2r8(iy1,jy3);
+             dz13             = _fjsp_sub_v2r8(iz1,jz3);
+             dx21             = _fjsp_sub_v2r8(ix2,jx1);
+             dy21             = _fjsp_sub_v2r8(iy2,jy1);
+             dz21             = _fjsp_sub_v2r8(iz2,jz1);
+             dx22             = _fjsp_sub_v2r8(ix2,jx2);
+             dy22             = _fjsp_sub_v2r8(iy2,jy2);
+             dz22             = _fjsp_sub_v2r8(iz2,jz2);
+             dx23             = _fjsp_sub_v2r8(ix2,jx3);
+             dy23             = _fjsp_sub_v2r8(iy2,jy3);
+             dz23             = _fjsp_sub_v2r8(iz2,jz3);
+             dx31             = _fjsp_sub_v2r8(ix3,jx1);
+             dy31             = _fjsp_sub_v2r8(iy3,jy1);
+             dz31             = _fjsp_sub_v2r8(iz3,jz1);
+             dx32             = _fjsp_sub_v2r8(ix3,jx2);
+             dy32             = _fjsp_sub_v2r8(iy3,jy2);
+             dz32             = _fjsp_sub_v2r8(iz3,jz2);
+             dx33             = _fjsp_sub_v2r8(ix3,jx3);
+             dy33             = _fjsp_sub_v2r8(iy3,jy3);
+             dz33             = _fjsp_sub_v2r8(iz3,jz3);
+             /* Calculate squared distance and things based on it */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+             rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+             rsq13            = gmx_fjsp_calc_rsq_v2r8(dx13,dy13,dz13);
+             rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+             rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+             rsq23            = gmx_fjsp_calc_rsq_v2r8(dx23,dy23,dz23);
+             rsq31            = gmx_fjsp_calc_rsq_v2r8(dx31,dy31,dz31);
+             rsq32            = gmx_fjsp_calc_rsq_v2r8(dx32,dy32,dz32);
+             rsq33            = gmx_fjsp_calc_rsq_v2r8(dx33,dy33,dz33);
+             rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+             rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+             rinv13           = gmx_fjsp_invsqrt_v2r8(rsq13);
+             rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+             rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+             rinv23           = gmx_fjsp_invsqrt_v2r8(rsq23);
+             rinv31           = gmx_fjsp_invsqrt_v2r8(rsq31);
+             rinv32           = gmx_fjsp_invsqrt_v2r8(rsq32);
+             rinv33           = gmx_fjsp_invsqrt_v2r8(rsq33);
+             rinvsq00         = gmx_fjsp_inv_v2r8(rsq00);
+             rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+             rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+             rinvsq13         = _fjsp_mul_v2r8(rinv13,rinv13);
+             rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+             rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+             rinvsq23         = _fjsp_mul_v2r8(rinv23,rinv23);
+             rinvsq31         = _fjsp_mul_v2r8(rinv31,rinv31);
+             rinvsq32         = _fjsp_mul_v2r8(rinv32,rinv32);
+             rinvsq33         = _fjsp_mul_v2r8(rinv33,rinv33);
+             fjx0             = _fjsp_setzero_v2r8();
+             fjy0             = _fjsp_setzero_v2r8();
+             fjz0             = _fjsp_setzero_v2r8();
+             fjx1             = _fjsp_setzero_v2r8();
+             fjy1             = _fjsp_setzero_v2r8();
+             fjz1             = _fjsp_setzero_v2r8();
+             fjx2             = _fjsp_setzero_v2r8();
+             fjy2             = _fjsp_setzero_v2r8();
+             fjz2             = _fjsp_setzero_v2r8();
+             fjx3             = _fjsp_setzero_v2r8();
+             fjy3             = _fjsp_setzero_v2r8();
+             fjz3             = _fjsp_setzero_v2r8();
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+             {
+             /* LENNARD-JONES DISPERSION/REPULSION */
+             rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+             vvdw6            = _fjsp_mul_v2r8(c6_00,rinvsix);
+             vvdw12           = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
+             vvdw             = _fjsp_msub_v2r8(_fjsp_nmsub_v2r8(c12_00,_fjsp_mul_v2r8(sh_vdw_invrcut6,sh_vdw_invrcut6),vvdw12),one_twelfth,
+                                            _fjsp_mul_v2r8(_fjsp_nmsub_v2r8( c6_00,sh_vdw_invrcut6,vvdw6),one_sixth));
+             fvdw             = _fjsp_mul_v2r8(_fjsp_sub_v2r8(vvdw12,vvdw6),rinvsq00);
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             vvdw             = _fjsp_and_v2r8(vvdw,cutoff_mask);
+             vvdw             = _fjsp_unpacklo_v2r8(vvdw,_fjsp_setzero_v2r8());
+             vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+             fscal            = fvdw;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             
+             fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq11,rcutoff2))
+             {
+             r11              = _fjsp_mul_v2r8(rsq11,rinv11);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r11,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq11,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv11,sh_ewald),velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq11,rinv11),_fjsp_sub_v2r8(rinvsq11,felec));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq11,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+             
+             fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq12,rcutoff2))
+             {
+             r12              = _fjsp_mul_v2r8(rsq12,rinv12);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r12,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq12,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv12,sh_ewald),velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq12,rinv12),_fjsp_sub_v2r8(rinvsq12,felec));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq12,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+             
+             fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq13,rcutoff2))
+             {
+             r13              = _fjsp_mul_v2r8(rsq13,rinv13);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r13,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq13,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv13,sh_ewald),velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq13,rinv13),_fjsp_sub_v2r8(rinvsq13,felec));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq13,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx13,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy13,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz13,fscal,fiz1);
+             
+             fjx3             = _fjsp_madd_v2r8(dx13,fscal,fjx3);
+             fjy3             = _fjsp_madd_v2r8(dy13,fscal,fjy3);
+             fjz3             = _fjsp_madd_v2r8(dz13,fscal,fjz3);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq21,rcutoff2))
+             {
+             r21              = _fjsp_mul_v2r8(rsq21,rinv21);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r21,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq21,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv21,sh_ewald),velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq21,rinv21),_fjsp_sub_v2r8(rinvsq21,felec));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq21,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+             
+             fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq22,rcutoff2))
+             {
+             r22              = _fjsp_mul_v2r8(rsq22,rinv22);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r22,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq22,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv22,sh_ewald),velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq22,rinv22),_fjsp_sub_v2r8(rinvsq22,felec));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq22,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+             
+             fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq23,rcutoff2))
+             {
+             r23              = _fjsp_mul_v2r8(rsq23,rinv23);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r23,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq23,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv23,sh_ewald),velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq23,rinv23),_fjsp_sub_v2r8(rinvsq23,felec));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq23,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx23,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy23,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz23,fscal,fiz2);
+             
+             fjx3             = _fjsp_madd_v2r8(dx23,fscal,fjx3);
+             fjy3             = _fjsp_madd_v2r8(dy23,fscal,fjy3);
+             fjz3             = _fjsp_madd_v2r8(dz23,fscal,fjz3);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq31,rcutoff2))
+             {
+             r31              = _fjsp_mul_v2r8(rsq31,rinv31);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r31,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq31,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv31,sh_ewald),velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq31,rinv31),_fjsp_sub_v2r8(rinvsq31,felec));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq31,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix3             = _fjsp_madd_v2r8(dx31,fscal,fix3);
+             fiy3             = _fjsp_madd_v2r8(dy31,fscal,fiy3);
+             fiz3             = _fjsp_madd_v2r8(dz31,fscal,fiz3);
+             
+             fjx1             = _fjsp_madd_v2r8(dx31,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy31,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz31,fscal,fjz1);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq32,rcutoff2))
+             {
+             r32              = _fjsp_mul_v2r8(rsq32,rinv32);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r32,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq32,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv32,sh_ewald),velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq32,rinv32),_fjsp_sub_v2r8(rinvsq32,felec));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq32,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix3             = _fjsp_madd_v2r8(dx32,fscal,fix3);
+             fiy3             = _fjsp_madd_v2r8(dy32,fscal,fiy3);
+             fiz3             = _fjsp_madd_v2r8(dz32,fscal,fiz3);
+             
+             fjx2             = _fjsp_madd_v2r8(dx32,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy32,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz32,fscal,fjz2);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq33,rcutoff2))
+             {
+             r33              = _fjsp_mul_v2r8(rsq33,rinv33);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r33,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq33,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv33,sh_ewald),velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq33,rinv33),_fjsp_sub_v2r8(rinvsq33,felec));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq33,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix3             = _fjsp_madd_v2r8(dx33,fscal,fix3);
+             fiy3             = _fjsp_madd_v2r8(dy33,fscal,fiy3);
+             fiz3             = _fjsp_madd_v2r8(dz33,fscal,fiz3);
+             
+             fjx3             = _fjsp_madd_v2r8(dx33,fscal,fjx3);
+             fjy3             = _fjsp_madd_v2r8(dy33,fscal,fjy3);
+             fjz3             = _fjsp_madd_v2r8(dz33,fscal,fjz3);
+             }
+             gmx_fjsp_decrement_4rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+             /* Inner loop uses 488 flops */
+         }
+         /* End of innermost loop */
+         gmx_fjsp_update_iforce_4atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                               f+i_coord_offset,fshift+i_shift_offset);
+         ggid                        = gid[iidx];
+         /* Update potential energies */
+         gmx_fjsp_update_1pot_v2r8(velecsum,kernel_data->energygrp_elec+ggid);
+         gmx_fjsp_update_1pot_v2r8(vvdwsum,kernel_data->energygrp_vdw+ggid);
+         /* Increment number of inner iterations */
+         inneriter                  += j_index_end - j_index_start;
+         /* Outer loop uses 26 flops */
+     }
+     /* Increment number of outer iterations */
+     outeriter        += nri;
+     /* Update outer/inner flops */
+     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_VF,outeriter*26 + inneriter*488);
+ }
+ /*
+  * Gromacs nonbonded kernel:   nb_kernel_ElecEwSh_VdwLJSh_GeomW4W4_F_sparc64_hpc_ace_double
+  * Electrostatics interaction: Ewald
+  * VdW interaction:            LennardJones
+  * Geometry:                   Water4-Water4
+  * Calculate force/pot:        Force
+  */
+ void
+ nb_kernel_ElecEwSh_VdwLJSh_GeomW4W4_F_sparc64_hpc_ace_double
+                     (t_nblist * gmx_restrict                nlist,
+                      rvec * gmx_restrict                    xx,
+                      rvec * gmx_restrict                    ff,
+                      t_forcerec * gmx_restrict              fr,
+                      t_mdatoms * gmx_restrict               mdatoms,
+                      nb_kernel_data_t * gmx_restrict        kernel_data,
+                      t_nrnb * gmx_restrict                  nrnb)
+ {
+     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+      * just 0 for non-waters.
+      * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+      * jnr indices corresponding to data put in the two positions in the SIMD register.
+      */
+     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+     int              jnrA,jnrB;
+     int              j_coord_offsetA,j_coord_offsetB;
+     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+     real             rcutoff_scalar;
+     real             *shiftvec,*fshift,*x,*f;
+     _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+     int              vdwioffset0;
+     _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+     int              vdwioffset1;
+     _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+     int              vdwioffset2;
+     _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+     int              vdwioffset3;
+     _fjsp_v2r8       ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+     int              vdwjidx0A,vdwjidx0B;
+     _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+     int              vdwjidx1A,vdwjidx1B;
+     _fjsp_v2r8       jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+     int              vdwjidx2A,vdwjidx2B;
+     _fjsp_v2r8       jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+     int              vdwjidx3A,vdwjidx3B;
+     _fjsp_v2r8       jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
+     _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+     _fjsp_v2r8       dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+     _fjsp_v2r8       dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+     _fjsp_v2r8       dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
+     _fjsp_v2r8       dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+     _fjsp_v2r8       dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+     _fjsp_v2r8       dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
+     _fjsp_v2r8       dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
+     _fjsp_v2r8       dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
+     _fjsp_v2r8       dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
+     _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+     real             *charge;
+     int              nvdwtype;
+     _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+     int              *vdwtype;
+     real             *vdwparam;
+     _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+     _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+     _fjsp_v2r8       ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+     real             *ewtab;
+     _fjsp_v2r8       itab_tmp;
+     _fjsp_v2r8       dummy_mask,cutoff_mask;
+     _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+     _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+     union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+     x                = xx[0];
+     f                = ff[0];
+     nri              = nlist->nri;
+     iinr             = nlist->iinr;
+     jindex           = nlist->jindex;
+     jjnr             = nlist->jjnr;
+     shiftidx         = nlist->shift;
+     gid              = nlist->gid;
+     shiftvec         = fr->shift_vec[0];
+     fshift           = fr->fshift[0];
+     facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+     charge           = mdatoms->chargeA;
+     nvdwtype         = fr->ntype;
+     vdwparam         = fr->nbfp;
+     vdwtype          = mdatoms->typeA;
+     sh_ewald         = gmx_fjsp_set1_v2r8(fr->ic->sh_ewald);
+     ewtab            = fr->ic->tabq_coul_F;
+     ewtabscale       = gmx_fjsp_set1_v2r8(fr->ic->tabq_scale);
+     ewtabhalfspace   = gmx_fjsp_set1_v2r8(0.5/fr->ic->tabq_scale);
+     /* Setup water-specific parameters */
+     inr              = nlist->iinr[0];
+     iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+     iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+     iq3              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+3]));
+     vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+     jq1              = gmx_fjsp_set1_v2r8(charge[inr+1]);
+     jq2              = gmx_fjsp_set1_v2r8(charge[inr+2]);
+     jq3              = gmx_fjsp_set1_v2r8(charge[inr+3]);
+     vdwjidx0A        = 2*vdwtype[inr+0];
+     c6_00            = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset0+vdwjidx0A]);
+     c12_00           = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset0+vdwjidx0A+1]);
+     qq11             = _fjsp_mul_v2r8(iq1,jq1);
+     qq12             = _fjsp_mul_v2r8(iq1,jq2);
+     qq13             = _fjsp_mul_v2r8(iq1,jq3);
+     qq21             = _fjsp_mul_v2r8(iq2,jq1);
+     qq22             = _fjsp_mul_v2r8(iq2,jq2);
+     qq23             = _fjsp_mul_v2r8(iq2,jq3);
+     qq31             = _fjsp_mul_v2r8(iq3,jq1);
+     qq32             = _fjsp_mul_v2r8(iq3,jq2);
+     qq33             = _fjsp_mul_v2r8(iq3,jq3);
+     /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+     rcutoff_scalar   = fr->rcoulomb;
+     rcutoff          = gmx_fjsp_set1_v2r8(rcutoff_scalar);
+     rcutoff2         = _fjsp_mul_v2r8(rcutoff,rcutoff);
+     sh_vdw_invrcut6  = gmx_fjsp_set1_v2r8(fr->ic->sh_invrc6);
+     rvdw             = gmx_fjsp_set1_v2r8(fr->rvdw);
+     /* Avoid stupid compiler warnings */
+     jnrA = jnrB = 0;
+     j_coord_offsetA = 0;
+     j_coord_offsetB = 0;
+     outeriter        = 0;
+     inneriter        = 0;
+     /* Start outer loop over neighborlists */
+     for(iidx=0; iidx<nri; iidx++)
+     {
+         /* Load shift vector for this list */
+         i_shift_offset   = DIM*shiftidx[iidx];
+         /* Load limits for loop over neighbors */
+         j_index_start    = jindex[iidx];
+         j_index_end      = jindex[iidx+1];
+         /* Get outer coordinate index */
+         inr              = iinr[iidx];
+         i_coord_offset   = DIM*inr;
+         /* Load i particle coords and add shift vector */
+         gmx_fjsp_load_shift_and_4rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                  &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+         fix0             = _fjsp_setzero_v2r8();
+         fiy0             = _fjsp_setzero_v2r8();
+         fiz0             = _fjsp_setzero_v2r8();
+         fix1             = _fjsp_setzero_v2r8();
+         fiy1             = _fjsp_setzero_v2r8();
+         fiz1             = _fjsp_setzero_v2r8();
+         fix2             = _fjsp_setzero_v2r8();
+         fiy2             = _fjsp_setzero_v2r8();
+         fiz2             = _fjsp_setzero_v2r8();
+         fix3             = _fjsp_setzero_v2r8();
+         fiy3             = _fjsp_setzero_v2r8();
+         fiz3             = _fjsp_setzero_v2r8();
+         /* Start inner kernel loop */
+         for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+         {
+             /* Get j neighbor index, and coordinate index */
+             jnrA             = jjnr[jidx];
+             jnrB             = jjnr[jidx+1];
+             j_coord_offsetA  = DIM*jnrA;
+             j_coord_offsetB  = DIM*jnrB;
+             /* load j atom coordinates */
+             gmx_fjsp_load_4rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
+                                               &jy2,&jz2,&jx3,&jy3,&jz3);
+             /* Calculate displacement vector */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             dx11             = _fjsp_sub_v2r8(ix1,jx1);
+             dy11             = _fjsp_sub_v2r8(iy1,jy1);
+             dz11             = _fjsp_sub_v2r8(iz1,jz1);
+             dx12             = _fjsp_sub_v2r8(ix1,jx2);
+             dy12             = _fjsp_sub_v2r8(iy1,jy2);
+             dz12             = _fjsp_sub_v2r8(iz1,jz2);
+             dx13             = _fjsp_sub_v2r8(ix1,jx3);
+             dy13             = _fjsp_sub_v2r8(iy1,jy3);
+             dz13             = _fjsp_sub_v2r8(iz1,jz3);
+             dx21             = _fjsp_sub_v2r8(ix2,jx1);
+             dy21             = _fjsp_sub_v2r8(iy2,jy1);
+             dz21             = _fjsp_sub_v2r8(iz2,jz1);
+             dx22             = _fjsp_sub_v2r8(ix2,jx2);
+             dy22             = _fjsp_sub_v2r8(iy2,jy2);
+             dz22             = _fjsp_sub_v2r8(iz2,jz2);
+             dx23             = _fjsp_sub_v2r8(ix2,jx3);
+             dy23             = _fjsp_sub_v2r8(iy2,jy3);
+             dz23             = _fjsp_sub_v2r8(iz2,jz3);
+             dx31             = _fjsp_sub_v2r8(ix3,jx1);
+             dy31             = _fjsp_sub_v2r8(iy3,jy1);
+             dz31             = _fjsp_sub_v2r8(iz3,jz1);
+             dx32             = _fjsp_sub_v2r8(ix3,jx2);
+             dy32             = _fjsp_sub_v2r8(iy3,jy2);
+             dz32             = _fjsp_sub_v2r8(iz3,jz2);
+             dx33             = _fjsp_sub_v2r8(ix3,jx3);
+             dy33             = _fjsp_sub_v2r8(iy3,jy3);
+             dz33             = _fjsp_sub_v2r8(iz3,jz3);
+             /* Calculate squared distance and things based on it */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+             rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+             rsq13            = gmx_fjsp_calc_rsq_v2r8(dx13,dy13,dz13);
+             rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+             rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+             rsq23            = gmx_fjsp_calc_rsq_v2r8(dx23,dy23,dz23);
+             rsq31            = gmx_fjsp_calc_rsq_v2r8(dx31,dy31,dz31);
+             rsq32            = gmx_fjsp_calc_rsq_v2r8(dx32,dy32,dz32);
+             rsq33            = gmx_fjsp_calc_rsq_v2r8(dx33,dy33,dz33);
+             rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+             rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+             rinv13           = gmx_fjsp_invsqrt_v2r8(rsq13);
+             rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+             rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+             rinv23           = gmx_fjsp_invsqrt_v2r8(rsq23);
+             rinv31           = gmx_fjsp_invsqrt_v2r8(rsq31);
+             rinv32           = gmx_fjsp_invsqrt_v2r8(rsq32);
+             rinv33           = gmx_fjsp_invsqrt_v2r8(rsq33);
+             rinvsq00         = gmx_fjsp_inv_v2r8(rsq00);
+             rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+             rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+             rinvsq13         = _fjsp_mul_v2r8(rinv13,rinv13);
+             rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+             rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+             rinvsq23         = _fjsp_mul_v2r8(rinv23,rinv23);
+             rinvsq31         = _fjsp_mul_v2r8(rinv31,rinv31);
+             rinvsq32         = _fjsp_mul_v2r8(rinv32,rinv32);
+             rinvsq33         = _fjsp_mul_v2r8(rinv33,rinv33);
+             fjx0             = _fjsp_setzero_v2r8();
+             fjy0             = _fjsp_setzero_v2r8();
+             fjz0             = _fjsp_setzero_v2r8();
+             fjx1             = _fjsp_setzero_v2r8();
+             fjy1             = _fjsp_setzero_v2r8();
+             fjz1             = _fjsp_setzero_v2r8();
+             fjx2             = _fjsp_setzero_v2r8();
+             fjy2             = _fjsp_setzero_v2r8();
+             fjz2             = _fjsp_setzero_v2r8();
+             fjx3             = _fjsp_setzero_v2r8();
+             fjy3             = _fjsp_setzero_v2r8();
+             fjz3             = _fjsp_setzero_v2r8();
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+             {
+             /* LENNARD-JONES DISPERSION/REPULSION */
+             rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+             fvdw             = _fjsp_mul_v2r8(_fjsp_msub_v2r8(c12_00,rinvsix,c6_00),_fjsp_mul_v2r8(rinvsix,rinvsq00));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+             fscal            = fvdw;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             
+             fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq11,rcutoff2))
+             {
+             r11              = _fjsp_mul_v2r8(rsq11,rinv11);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r11,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                          &ewtabF,&ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq11,rinv11),_fjsp_sub_v2r8(rinvsq11,felec));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq11,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+             
+             fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq12,rcutoff2))
+             {
+             r12              = _fjsp_mul_v2r8(rsq12,rinv12);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r12,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                          &ewtabF,&ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq12,rinv12),_fjsp_sub_v2r8(rinvsq12,felec));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq12,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+             
+             fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq13,rcutoff2))
+             {
+             r13              = _fjsp_mul_v2r8(rsq13,rinv13);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r13,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                          &ewtabF,&ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq13,rinv13),_fjsp_sub_v2r8(rinvsq13,felec));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq13,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx13,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy13,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz13,fscal,fiz1);
+             
+             fjx3             = _fjsp_madd_v2r8(dx13,fscal,fjx3);
+             fjy3             = _fjsp_madd_v2r8(dy13,fscal,fjy3);
+             fjz3             = _fjsp_madd_v2r8(dz13,fscal,fjz3);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq21,rcutoff2))
+             {
+             r21              = _fjsp_mul_v2r8(rsq21,rinv21);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r21,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                          &ewtabF,&ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq21,rinv21),_fjsp_sub_v2r8(rinvsq21,felec));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq21,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+             
+             fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq22,rcutoff2))
+             {
+             r22              = _fjsp_mul_v2r8(rsq22,rinv22);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r22,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                          &ewtabF,&ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq22,rinv22),_fjsp_sub_v2r8(rinvsq22,felec));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq22,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+             
+             fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq23,rcutoff2))
+             {
+             r23              = _fjsp_mul_v2r8(rsq23,rinv23);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r23,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                          &ewtabF,&ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq23,rinv23),_fjsp_sub_v2r8(rinvsq23,felec));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq23,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx23,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy23,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz23,fscal,fiz2);
+             
+             fjx3             = _fjsp_madd_v2r8(dx23,fscal,fjx3);
+             fjy3             = _fjsp_madd_v2r8(dy23,fscal,fjy3);
+             fjz3             = _fjsp_madd_v2r8(dz23,fscal,fjz3);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq31,rcutoff2))
+             {
+             r31              = _fjsp_mul_v2r8(rsq31,rinv31);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r31,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                          &ewtabF,&ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq31,rinv31),_fjsp_sub_v2r8(rinvsq31,felec));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq31,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix3             = _fjsp_madd_v2r8(dx31,fscal,fix3);
+             fiy3             = _fjsp_madd_v2r8(dy31,fscal,fiy3);
+             fiz3             = _fjsp_madd_v2r8(dz31,fscal,fiz3);
+             
+             fjx1             = _fjsp_madd_v2r8(dx31,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy31,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz31,fscal,fjz1);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq32,rcutoff2))
+             {
+             r32              = _fjsp_mul_v2r8(rsq32,rinv32);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r32,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                          &ewtabF,&ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq32,rinv32),_fjsp_sub_v2r8(rinvsq32,felec));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq32,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix3             = _fjsp_madd_v2r8(dx32,fscal,fix3);
+             fiy3             = _fjsp_madd_v2r8(dy32,fscal,fiy3);
+             fiz3             = _fjsp_madd_v2r8(dz32,fscal,fiz3);
+             
+             fjx2             = _fjsp_madd_v2r8(dx32,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy32,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz32,fscal,fjz2);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq33,rcutoff2))
+             {
+             r33              = _fjsp_mul_v2r8(rsq33,rinv33);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r33,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                          &ewtabF,&ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq33,rinv33),_fjsp_sub_v2r8(rinvsq33,felec));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq33,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix3             = _fjsp_madd_v2r8(dx33,fscal,fix3);
+             fiy3             = _fjsp_madd_v2r8(dy33,fscal,fiy3);
+             fiz3             = _fjsp_madd_v2r8(dz33,fscal,fiz3);
+             
+             fjx3             = _fjsp_madd_v2r8(dx33,fscal,fjx3);
+             fjy3             = _fjsp_madd_v2r8(dy33,fscal,fjy3);
+             fjz3             = _fjsp_madd_v2r8(dz33,fscal,fjz3);
+             }
+             gmx_fjsp_decrement_4rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+             /* Inner loop uses 414 flops */
+         }
+         if(jidx<j_index_end)
+         {
+             jnrA             = jjnr[jidx];
+             j_coord_offsetA  = DIM*jnrA;
+             /* load j atom coordinates */
+             gmx_fjsp_load_4rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
+                                               &jy2,&jz2,&jx3,&jy3,&jz3);
+             /* Calculate displacement vector */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             dx11             = _fjsp_sub_v2r8(ix1,jx1);
+             dy11             = _fjsp_sub_v2r8(iy1,jy1);
+             dz11             = _fjsp_sub_v2r8(iz1,jz1);
+             dx12             = _fjsp_sub_v2r8(ix1,jx2);
+             dy12             = _fjsp_sub_v2r8(iy1,jy2);
+             dz12             = _fjsp_sub_v2r8(iz1,jz2);
+             dx13             = _fjsp_sub_v2r8(ix1,jx3);
+             dy13             = _fjsp_sub_v2r8(iy1,jy3);
+             dz13             = _fjsp_sub_v2r8(iz1,jz3);
+             dx21             = _fjsp_sub_v2r8(ix2,jx1);
+             dy21             = _fjsp_sub_v2r8(iy2,jy1);
+             dz21             = _fjsp_sub_v2r8(iz2,jz1);
+             dx22             = _fjsp_sub_v2r8(ix2,jx2);
+             dy22             = _fjsp_sub_v2r8(iy2,jy2);
+             dz22             = _fjsp_sub_v2r8(iz2,jz2);
+             dx23             = _fjsp_sub_v2r8(ix2,jx3);
+             dy23             = _fjsp_sub_v2r8(iy2,jy3);
+             dz23             = _fjsp_sub_v2r8(iz2,jz3);
+             dx31             = _fjsp_sub_v2r8(ix3,jx1);
+             dy31             = _fjsp_sub_v2r8(iy3,jy1);
+             dz31             = _fjsp_sub_v2r8(iz3,jz1);
+             dx32             = _fjsp_sub_v2r8(ix3,jx2);
+             dy32             = _fjsp_sub_v2r8(iy3,jy2);
+             dz32             = _fjsp_sub_v2r8(iz3,jz2);
+             dx33             = _fjsp_sub_v2r8(ix3,jx3);
+             dy33             = _fjsp_sub_v2r8(iy3,jy3);
+             dz33             = _fjsp_sub_v2r8(iz3,jz3);
+             /* Calculate squared distance and things based on it */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+             rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+             rsq13            = gmx_fjsp_calc_rsq_v2r8(dx13,dy13,dz13);
+             rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+             rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+             rsq23            = gmx_fjsp_calc_rsq_v2r8(dx23,dy23,dz23);
+             rsq31            = gmx_fjsp_calc_rsq_v2r8(dx31,dy31,dz31);
+             rsq32            = gmx_fjsp_calc_rsq_v2r8(dx32,dy32,dz32);
+             rsq33            = gmx_fjsp_calc_rsq_v2r8(dx33,dy33,dz33);
+             rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+             rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+             rinv13           = gmx_fjsp_invsqrt_v2r8(rsq13);
+             rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+             rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+             rinv23           = gmx_fjsp_invsqrt_v2r8(rsq23);
+             rinv31           = gmx_fjsp_invsqrt_v2r8(rsq31);
+             rinv32           = gmx_fjsp_invsqrt_v2r8(rsq32);
+             rinv33           = gmx_fjsp_invsqrt_v2r8(rsq33);
+             rinvsq00         = gmx_fjsp_inv_v2r8(rsq00);
+             rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+             rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+             rinvsq13         = _fjsp_mul_v2r8(rinv13,rinv13);
+             rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+             rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+             rinvsq23         = _fjsp_mul_v2r8(rinv23,rinv23);
+             rinvsq31         = _fjsp_mul_v2r8(rinv31,rinv31);
+             rinvsq32         = _fjsp_mul_v2r8(rinv32,rinv32);
+             rinvsq33         = _fjsp_mul_v2r8(rinv33,rinv33);
+             fjx0             = _fjsp_setzero_v2r8();
+             fjy0             = _fjsp_setzero_v2r8();
+             fjz0             = _fjsp_setzero_v2r8();
+             fjx1             = _fjsp_setzero_v2r8();
+             fjy1             = _fjsp_setzero_v2r8();
+             fjz1             = _fjsp_setzero_v2r8();
+             fjx2             = _fjsp_setzero_v2r8();
+             fjy2             = _fjsp_setzero_v2r8();
+             fjz2             = _fjsp_setzero_v2r8();
+             fjx3             = _fjsp_setzero_v2r8();
+             fjy3             = _fjsp_setzero_v2r8();
+             fjz3             = _fjsp_setzero_v2r8();
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+             {
+             /* LENNARD-JONES DISPERSION/REPULSION */
+             rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+             fvdw             = _fjsp_mul_v2r8(_fjsp_msub_v2r8(c12_00,rinvsix,c6_00),_fjsp_mul_v2r8(rinvsix,rinvsq00));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+             fscal            = fvdw;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             
+             fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq11,rcutoff2))
+             {
+             r11              = _fjsp_mul_v2r8(rsq11,rinv11);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r11,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq11,rinv11),_fjsp_sub_v2r8(rinvsq11,felec));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq11,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+             
+             fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq12,rcutoff2))
+             {
+             r12              = _fjsp_mul_v2r8(rsq12,rinv12);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r12,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq12,rinv12),_fjsp_sub_v2r8(rinvsq12,felec));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq12,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+             
+             fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq13,rcutoff2))
+             {
+             r13              = _fjsp_mul_v2r8(rsq13,rinv13);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r13,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq13,rinv13),_fjsp_sub_v2r8(rinvsq13,felec));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq13,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx13,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy13,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz13,fscal,fiz1);
+             
+             fjx3             = _fjsp_madd_v2r8(dx13,fscal,fjx3);
+             fjy3             = _fjsp_madd_v2r8(dy13,fscal,fjy3);
+             fjz3             = _fjsp_madd_v2r8(dz13,fscal,fjz3);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq21,rcutoff2))
+             {
+             r21              = _fjsp_mul_v2r8(rsq21,rinv21);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r21,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq21,rinv21),_fjsp_sub_v2r8(rinvsq21,felec));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq21,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+             
+             fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq22,rcutoff2))
+             {
+             r22              = _fjsp_mul_v2r8(rsq22,rinv22);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r22,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq22,rinv22),_fjsp_sub_v2r8(rinvsq22,felec));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq22,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+             
+             fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq23,rcutoff2))
+             {
+             r23              = _fjsp_mul_v2r8(rsq23,rinv23);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r23,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq23,rinv23),_fjsp_sub_v2r8(rinvsq23,felec));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq23,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx23,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy23,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz23,fscal,fiz2);
+             
+             fjx3             = _fjsp_madd_v2r8(dx23,fscal,fjx3);
+             fjy3             = _fjsp_madd_v2r8(dy23,fscal,fjy3);
+             fjz3             = _fjsp_madd_v2r8(dz23,fscal,fjz3);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq31,rcutoff2))
+             {
+             r31              = _fjsp_mul_v2r8(rsq31,rinv31);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r31,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq31,rinv31),_fjsp_sub_v2r8(rinvsq31,felec));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq31,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix3             = _fjsp_madd_v2r8(dx31,fscal,fix3);
+             fiy3             = _fjsp_madd_v2r8(dy31,fscal,fiy3);
+             fiz3             = _fjsp_madd_v2r8(dz31,fscal,fiz3);
+             
+             fjx1             = _fjsp_madd_v2r8(dx31,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy31,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz31,fscal,fjz1);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq32,rcutoff2))
+             {
+             r32              = _fjsp_mul_v2r8(rsq32,rinv32);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r32,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq32,rinv32),_fjsp_sub_v2r8(rinvsq32,felec));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq32,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix3             = _fjsp_madd_v2r8(dx32,fscal,fix3);
+             fiy3             = _fjsp_madd_v2r8(dy32,fscal,fiy3);
+             fiz3             = _fjsp_madd_v2r8(dz32,fscal,fiz3);
+             
+             fjx2             = _fjsp_madd_v2r8(dx32,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy32,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz32,fscal,fjz2);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq33,rcutoff2))
+             {
+             r33              = _fjsp_mul_v2r8(rsq33,rinv33);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r33,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq33,rinv33),_fjsp_sub_v2r8(rinvsq33,felec));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq33,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix3             = _fjsp_madd_v2r8(dx33,fscal,fix3);
+             fiy3             = _fjsp_madd_v2r8(dy33,fscal,fiy3);
+             fiz3             = _fjsp_madd_v2r8(dz33,fscal,fiz3);
+             
+             fjx3             = _fjsp_madd_v2r8(dx33,fscal,fjx3);
+             fjy3             = _fjsp_madd_v2r8(dy33,fscal,fjy3);
+             fjz3             = _fjsp_madd_v2r8(dz33,fscal,fjz3);
+             }
+             gmx_fjsp_decrement_4rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+             /* Inner loop uses 414 flops */
+         }
+         /* End of innermost loop */
+         gmx_fjsp_update_iforce_4atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                               f+i_coord_offset,fshift+i_shift_offset);
+         /* Increment number of inner iterations */
+         inneriter                  += j_index_end - j_index_start;
+         /* Outer loop uses 24 flops */
+     }
+     /* Increment number of outer iterations */
+     outeriter        += nri;
+     /* Update outer/inner flops */
+     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_F,outeriter*24 + inneriter*414);
+ }
index 0000000000000000000000000000000000000000,29c0330567f9e0c98d8bff561f152201d1295ef9..29c0330567f9e0c98d8bff561f152201d1295ef9
mode 000000,100644..100644
--- /dev/null
@@@ -1,0 -1,599 +1,599 @@@
+ /*
+  * This file is part of the GROMACS molecular simulation package.
+  *
+  * Copyright (c) 2012, by the GROMACS development team, led by
+  * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+  * others, as listed in the AUTHORS file in the top-level source
+  * directory and at http://www.gromacs.org.
+  *
+  * GROMACS is free software; you can redistribute it and/or
+  * modify it under the terms of the GNU Lesser General Public License
+  * as published by the Free Software Foundation; either version 2.1
+  * of the License, or (at your option) any later version.
+  *
+  * GROMACS is distributed in the hope that it will be useful,
+  * but WITHOUT ANY WARRANTY; without even the implied warranty of
+  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  * Lesser General Public License for more details.
+  *
+  * You should have received a copy of the GNU Lesser General Public
+  * License along with GROMACS; if not, see
+  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+  *
+  * If you want to redistribute modifications to GROMACS, please
+  * consider that scientific software is very special. Version
+  * control is crucial - bugs must be traceable. We will be happy to
+  * consider code for inclusion in the official distribution, but
+  * derived work must not be called official GROMACS. Details are found
+  * in the README & COPYING files - if they are missing, get the
+  * official version at http://www.gromacs.org.
+  *
+  * To help us fund GROMACS development, we humbly ask that you cite
+  * the research papers on the package. Check out http://www.gromacs.org.
+  */
+ /*
+  * Note: this file was generated by the GROMACS sparc64_hpc_ace_double kernel generator.
+  */
+ #ifdef HAVE_CONFIG_H
+ #include <config.h>
+ #endif
+ #include <math.h>
+ #include "../nb_kernel.h"
+ #include "types/simple.h"
+ #include "vec.h"
+ #include "nrnb.h"
+ #include "kernelutil_sparc64_hpc_ace_double.h"
+ /*
+  * Gromacs nonbonded kernel:   nb_kernel_ElecEwSh_VdwNone_GeomP1P1_VF_sparc64_hpc_ace_double
+  * Electrostatics interaction: Ewald (table lookup in fr->ic->tabq_coul_FDV0; results masked where rsq < rcoulomb^2; potential offset by sh_ewald)
+  * VdW interaction:            None
+  * Geometry:                   Particle-Particle (one i atom per list entry; j atoms taken two at a time, plus a single-j remainder step)
+  * Calculate force/pot:        PotentialAndForce (the per-i potential sum is handed to kernel_data->energygrp_elec+gid[iidx])
+  */
+ void
+ nb_kernel_ElecEwSh_VdwNone_GeomP1P1_VF_sparc64_hpc_ace_double
+                     (t_nblist * gmx_restrict                nlist,        /* neighbor list; nri, iinr, jindex, jjnr, shift and gid are read */
+                      rvec * gmx_restrict                    xx,           /* coordinates; read as a flat real array through xx[0] */
+                      rvec * gmx_restrict                    ff,           /* forces; ff[0] is passed to the force-update helpers */
+                      t_forcerec * gmx_restrict              fr,           /* shift_vec, epsfac, rcoulomb and the ic-> table data are read; fshift goes to the i-force helper */
+                      t_mdatoms * gmx_restrict               mdatoms,      /* only chargeA is read */
+                      nb_kernel_data_t * gmx_restrict        kernel_data,  /* energygrp_elec is passed to the potential-update helper */
+                      t_nrnb * gmx_restrict                  nrnb)         /* flop counter, passed to inc_nrnb() at the end */
+ {
+     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+      * just 0 for non-waters (only suffix 0 is used in this particle-particle kernel).
+      * Suffixes A,B refer to the j loop unrolling done with double precision SIMD, i.e. the two different
+      * jnr indices (jnrA,jnrB) whose data are put in the two positions of the SIMD register.
+      */
+     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+     int              jnrA,jnrB;
+     int              j_coord_offsetA,j_coord_offsetB;
+     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+     real             rcutoff_scalar;
+     real             *shiftvec,*fshift,*x,*f;
+     _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall; /* tx,ty,tz,jidxall are not referenced in this body */
+     int              vdwioffset0; /* not referenced in this body (no VdW interaction) */
+     _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+     int              vdwjidx0A,vdwjidx0B; /* not referenced in this body (no VdW interaction) */
+     _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+     _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+     _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+     real             *charge;
+     _fjsp_v2r8       ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+     real             *ewtab;
+     _fjsp_v2r8       itab_tmp;
+     _fjsp_v2r8       dummy_mask,cutoff_mask;
+     _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0); /* not referenced below */
+     _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0); /* not referenced below */
+     union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv; /* only ewconv is used: the SIMD table index is stored to .simd and read back as .i[0]/.i[1] */
+     x                = xx[0]; /* atoms are addressed below as x+DIM*index */
+     f                = ff[0]; /* same flat addressing as x */
+     nri              = nlist->nri;
+     iinr             = nlist->iinr;
+     jindex           = nlist->jindex;
+     jjnr             = nlist->jjnr;
+     shiftidx         = nlist->shift;
+     gid              = nlist->gid;
+     shiftvec         = fr->shift_vec[0];
+     fshift           = fr->fshift[0];
+     facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+     charge           = mdatoms->chargeA;
+     sh_ewald         = gmx_fjsp_set1_v2r8(fr->ic->sh_ewald);
+     ewtab            = fr->ic->tabq_coul_FDV0; /* indexed below with stride 4; offsets +0 and +2 within an entry are loaded */
+     ewtabscale       = gmx_fjsp_set1_v2r8(fr->ic->tabq_scale);
+     ewtabhalfspace   = gmx_fjsp_set1_v2r8(0.5/fr->ic->tabq_scale);
+     /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+     rcutoff_scalar   = fr->rcoulomb;
+     rcutoff          = gmx_fjsp_set1_v2r8(rcutoff_scalar);
+     rcutoff2         = _fjsp_mul_v2r8(rcutoff,rcutoff); /* compared against rsq00 below */
+     /* Give the j indices and offsets defined values up front to avoid compiler warnings */
+     jnrA = jnrB = 0;
+     j_coord_offsetA = 0;
+     j_coord_offsetB = 0;
+     outeriter        = 0;
+     inneriter        = 0;
+     /* Start outer loop over neighborlists */
+     for(iidx=0; iidx<nri; iidx++)
+     {
+         /* Load shift vector for this list */
+         i_shift_offset   = DIM*shiftidx[iidx];
+         /* Load limits for loop over neighbors */
+         j_index_start    = jindex[iidx];
+         j_index_end      = jindex[iidx+1];
+         /* Get outer coordinate index */
+         inr              = iinr[iidx];
+         i_coord_offset   = DIM*inr;
+         /* Load i particle coords and add shift vector */
+         gmx_fjsp_load_shift_and_1rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
+         fix0             = _fjsp_setzero_v2r8();
+         fiy0             = _fjsp_setzero_v2r8();
+         fiz0             = _fjsp_setzero_v2r8();
+         /* Load parameters for i particles */
+         iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_load1_v2r8(charge+inr+0)); /* i charge pre-multiplied by facel (fr->epsfac) */
+         /* Reset potential sums */
+         velecsum         = _fjsp_setzero_v2r8();
+         /* Start inner kernel loop */
+         for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2) /* two j atoms per iteration; an odd leftover is handled after this loop */
+         {
+             /* Get j neighbor index, and coordinate index */
+             jnrA             = jjnr[jidx];
+             jnrB             = jjnr[jidx+1];
+             j_coord_offsetA  = DIM*jnrA;
+             j_coord_offsetB  = DIM*jnrB;
+             /* load j atom coordinates */
+             gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                               &jx0,&jy0,&jz0);
+             /* Calculate displacement vector */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             /* Calculate squared distance and things based on it */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+             rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+             /* Load parameters for j particles */
+             jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2)) /* NOTE(review): by its name, true when any lane is below the squared cutoff - confirm in the kernelutil header */
+             {
+             r00              = _fjsp_mul_v2r8(rsq00,rinv00); /* rsq*rinv */
+             /* Compute parameters for interactions between i and j atoms */
+             qq00             = _fjsp_mul_v2r8(iq0,jq0);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r00,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp)); /* scaled distance minus its integer table index */
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] ); /* first two reals of the table entry for index i[0] */
+             ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] ); /* first two reals of the table entry for index i[1] */
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD); /* presumably leaves F of both lanes in ewtabF and D of both lanes in ewtabD - macro not visible here */
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2); /* third real of the entry for index i[0] */
+             ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2); /* third real of the entry for index i[1] */
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn); /* presumably gathers the two V values into ewtabV; ewtabFn is not used afterwards */
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF); /* assuming madd(a,b,c)=a*b+c: F + eps*D */
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV); /* assuming nmsub(a,b,c)=c-a*b: V - halfspace*eps*(F+felec) */
+             velec            = _fjsp_mul_v2r8(qq00,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv00,sh_ewald),velec)); /* qq*((rinv - sh_ewald) - velec) */
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,rinv00),_fjsp_sub_v2r8(rinvsq00,felec)); /* qq*rinv*(rinvsq - felec) */
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask); /* per-lane mask: the block is entered if any lane passes the test above */
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             
+             gmx_fjsp_decrement_fma_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fscal,dx00,dy00,dz00); /* j-atom force update for both j atoms (helper defined elsewhere) */
+             }
+             /* Inner loop uses 49 flops */
+         }
+         if(jidx<j_index_end) /* one j atom left over when the j count is odd; same computation using the single-pointer helpers */
+         {
+             jnrA             = jjnr[jidx];
+             j_coord_offsetA  = DIM*jnrA;
+             /* load j atom coordinates */
+             gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                               &jx0,&jy0,&jz0);
+             /* Calculate displacement vector */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             /* Calculate squared distance and things based on it */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+             rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+             /* Load parameters for j particles */
+             jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+             {
+             r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+             /* Compute parameters for interactions between i and j atoms */
+             qq00             = _fjsp_mul_v2r8(iq0,jq0);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r00,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] ); /* only index i[0] is looked up in the remainder step */
+             ewtabD           = _fjsp_setzero_v2r8(); /* zero stands in for the missing second j atom */
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq00,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv00,sh_ewald),velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,rinv00),_fjsp_sub_v2r8(rinvsq00,felec));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8()); /* presumably keeps the first lane and zeroes the second, so the unused lane adds nothing - confirm */
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8()); /* same lane clearing as for velec above */
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             
+             gmx_fjsp_decrement_fma_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fscal,dx00,dy00,dz00);
+             }
+             /* Inner loop uses 49 flops */
+         }
+         /* End of innermost loop */
+         gmx_fjsp_update_iforce_1atom_swizzle_v2r8(fix0,fiy0,fiz0,
+                                               f+i_coord_offset,fshift+i_shift_offset);
+         ggid                        = gid[iidx];
+         /* Update potential energies */
+         gmx_fjsp_update_1pot_v2r8(velecsum,kernel_data->energygrp_elec+ggid);
+         /* Increment number of inner iterations */
+         inneriter                  += j_index_end - j_index_start; /* counts j atoms, not SIMD iterations */
+         /* Outer loop uses 8 flops */
+     }
+     /* Increment number of outer iterations */
+     outeriter        += nri;
+     /* Update outer/inner flops */
+     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VF,outeriter*8 + inneriter*49); /* 8 and 49 match the per-loop flop counts quoted in the comments above */
+ }
+ /*
+  * Gromacs nonbonded kernel:   nb_kernel_ElecEwSh_VdwNone_GeomP1P1_F_sparc64_hpc_ace_double
+  * Electrostatics interaction: Ewald (table lookup in fr->ic->tabq_coul_F; force masked where rsq < rcoulomb^2)
+  * VdW interaction:            None
+  * Geometry:                   Particle-Particle (one i atom per list entry; j atoms taken two at a time, plus a single-j remainder step)
+  * Calculate force/pot:        Force (no potential is accumulated; kernel_data is not referenced)
+  */
+ void
+ nb_kernel_ElecEwSh_VdwNone_GeomP1P1_F_sparc64_hpc_ace_double
+                     (t_nblist * gmx_restrict                nlist,        /* neighbor list; nri, iinr, jindex, jjnr, shift and gid are read */
+                      rvec * gmx_restrict                    xx,           /* coordinates; read as a flat real array through xx[0] */
+                      rvec * gmx_restrict                    ff,           /* forces; ff[0] is passed to the force-update helpers */
+                      t_forcerec * gmx_restrict              fr,           /* shift_vec, epsfac, rcoulomb and the ic-> table data are read; fshift goes to the i-force helper */
+                      t_mdatoms * gmx_restrict               mdatoms,      /* only chargeA is read */
+                      nb_kernel_data_t * gmx_restrict        kernel_data,  /* not referenced in this force-only kernel */
+                      t_nrnb * gmx_restrict                  nrnb)         /* flop counter, passed to inc_nrnb() at the end */
+ {
+     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+      * just 0 for non-waters (only suffix 0 is used in this particle-particle kernel).
+      * Suffixes A,B refer to the j loop unrolling done with double precision SIMD, i.e. the two different
+      * jnr indices (jnrA,jnrB) whose data are put in the two positions of the SIMD register.
+      */
+     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx; /* ggid is not referenced in this body */
+     int              jnrA,jnrB;
+     int              j_coord_offsetA,j_coord_offsetB;
+     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+     real             rcutoff_scalar;
+     real             *shiftvec,*fshift,*x,*f;
+     _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall; /* tx,ty,tz,jidxall are not referenced in this body */
+     int              vdwioffset0; /* not referenced in this body (no VdW interaction) */
+     _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+     int              vdwjidx0A,vdwjidx0B; /* not referenced in this body (no VdW interaction) */
+     _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+     _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+     _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2; /* of these only felec and facel are used here */
+     real             *charge;
+     _fjsp_v2r8       ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV; /* ewtabD and ewtabV are not referenced in this body */
+     real             *ewtab;
+     _fjsp_v2r8       itab_tmp;
+     _fjsp_v2r8       dummy_mask,cutoff_mask;
+     _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0); /* not referenced below */
+     _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0); /* not referenced below */
+     union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv; /* only ewconv is used: the SIMD table index is stored to .simd and read back as .i[0]/.i[1] */
+     x                = xx[0]; /* atoms are addressed below as x+DIM*index */
+     f                = ff[0]; /* same flat addressing as x */
+     nri              = nlist->nri;
+     iinr             = nlist->iinr;
+     jindex           = nlist->jindex;
+     jjnr             = nlist->jjnr;
+     shiftidx         = nlist->shift;
+     gid              = nlist->gid; /* loaded but never indexed in this body */
+     shiftvec         = fr->shift_vec[0];
+     fshift           = fr->fshift[0];
+     facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+     charge           = mdatoms->chargeA;
+     sh_ewald         = gmx_fjsp_set1_v2r8(fr->ic->sh_ewald); /* set but not used afterwards in this body */
+     ewtab            = fr->ic->tabq_coul_F; /* indexed directly by the table index (no stride factor), unlike the FDV0 table */
+     ewtabscale       = gmx_fjsp_set1_v2r8(fr->ic->tabq_scale);
+     ewtabhalfspace   = gmx_fjsp_set1_v2r8(0.5/fr->ic->tabq_scale); /* set but not used afterwards in this body */
+     /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+     rcutoff_scalar   = fr->rcoulomb;
+     rcutoff          = gmx_fjsp_set1_v2r8(rcutoff_scalar);
+     rcutoff2         = _fjsp_mul_v2r8(rcutoff,rcutoff); /* compared against rsq00 below */
+     /* Give the j indices and offsets defined values up front to avoid compiler warnings */
+     jnrA = jnrB = 0;
+     j_coord_offsetA = 0;
+     j_coord_offsetB = 0;
+     outeriter        = 0;
+     inneriter        = 0;
+     /* Start outer loop over neighborlists */
+     for(iidx=0; iidx<nri; iidx++)
+     {
+         /* Load shift vector for this list */
+         i_shift_offset   = DIM*shiftidx[iidx];
+         /* Load limits for loop over neighbors */
+         j_index_start    = jindex[iidx];
+         j_index_end      = jindex[iidx+1];
+         /* Get outer coordinate index */
+         inr              = iinr[iidx];
+         i_coord_offset   = DIM*inr;
+         /* Load i particle coords and add shift vector */
+         gmx_fjsp_load_shift_and_1rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
+         fix0             = _fjsp_setzero_v2r8();
+         fiy0             = _fjsp_setzero_v2r8();
+         fiz0             = _fjsp_setzero_v2r8();
+         /* Load parameters for i particles */
+         iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_load1_v2r8(charge+inr+0)); /* i charge pre-multiplied by facel (fr->epsfac) */
+         /* Start inner kernel loop */
+         for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2) /* two j atoms per iteration; an odd leftover is handled after this loop */
+         {
+             /* Get j neighbor index, and coordinate index */
+             jnrA             = jjnr[jidx];
+             jnrB             = jjnr[jidx+1];
+             j_coord_offsetA  = DIM*jnrA;
+             j_coord_offsetB  = DIM*jnrB;
+             /* load j atom coordinates */
+             gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                               &jx0,&jy0,&jz0);
+             /* Calculate displacement vector */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             /* Calculate squared distance and things based on it */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+             rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+             /* Load parameters for j particles */
+             jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2)) /* NOTE(review): by its name, true when any lane is below the squared cutoff - confirm in the kernelutil header */
+             {
+             r00              = _fjsp_mul_v2r8(rsq00,rinv00); /* rsq*rinv */
+             /* Compute parameters for interactions between i and j atoms */
+             qq00             = _fjsp_mul_v2r8(iq0,jq0);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r00,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp)); /* scaled distance minus its integer table index */
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1], /* presumably loads table[i] into ewtabF and table[i+1] into ewtabFn per lane - helper not visible here */
+                                          &ewtabF,&ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF)); /* assuming madd(a,b,c)=a*b+c and nmsub(a,b,c)=c-a*b: (1-eps)*F + eps*Fn */
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,rinv00),_fjsp_sub_v2r8(rinvsq00,felec)); /* qq*rinv*(rinvsq - felec) */
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask); /* per-lane mask: the block is entered if any lane passes the test above */
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             
+             gmx_fjsp_decrement_fma_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fscal,dx00,dy00,dz00); /* j-atom force update for both j atoms (helper defined elsewhere) */
+             }
+             /* Inner loop uses 42 flops */
+         }
+         if(jidx<j_index_end) /* one j atom left over when the j count is odd; same computation using the single-pointer helpers */
+         {
+             jnrA             = jjnr[jidx];
+             j_coord_offsetA  = DIM*jnrA;
+             /* load j atom coordinates */
+             gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                               &jx0,&jy0,&jz0);
+             /* Calculate displacement vector */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             /* Calculate squared distance and things based on it */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+             rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+             /* Load parameters for j particles */
+             jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+             {
+             r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+             /* Compute parameters for interactions between i and j atoms */
+             qq00             = _fjsp_mul_v2r8(iq0,jq0);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r00,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn); /* only index i[0] is looked up in the remainder step */
+             felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,rinv00),_fjsp_sub_v2r8(rinvsq00,felec));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8()); /* presumably keeps the first lane and zeroes the second, so the unused lane adds no force - confirm */
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             
+             gmx_fjsp_decrement_fma_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fscal,dx00,dy00,dz00);
+             }
+             /* Inner loop uses 42 flops */
+         }
+         /* End of innermost loop */
+         gmx_fjsp_update_iforce_1atom_swizzle_v2r8(fix0,fiy0,fiz0,
+                                               f+i_coord_offset,fshift+i_shift_offset);
+         /* Increment number of inner iterations */
+         inneriter                  += j_index_end - j_index_start; /* counts j atoms, not SIMD iterations */
+         /* Outer loop uses 7 flops */
+     }
+     /* Increment number of outer iterations */
+     outeriter        += nri;
+     /* Update outer/inner flops */
+     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_F,outeriter*7 + inneriter*42); /* 7 and 42 match the per-loop flop counts quoted in the comments above */
+ }
index 0000000000000000000000000000000000000000,2e684635fa00be90209de6a3de880a6ac8fed2e1..2e684635fa00be90209de6a3de880a6ac8fed2e1
mode 000000,100644..100644
--- /dev/null
@@@ -1,0 -1,1095 +1,1095 @@@
+ /*
+  * This file is part of the GROMACS molecular simulation package.
+  *
+  * Copyright (c) 2012, by the GROMACS development team, led by
+  * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+  * others, as listed in the AUTHORS file in the top-level source
+  * directory and at http://www.gromacs.org.
+  *
+  * GROMACS is free software; you can redistribute it and/or
+  * modify it under the terms of the GNU Lesser General Public License
+  * as published by the Free Software Foundation; either version 2.1
+  * of the License, or (at your option) any later version.
+  *
+  * GROMACS is distributed in the hope that it will be useful,
+  * but WITHOUT ANY WARRANTY; without even the implied warranty of
+  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  * Lesser General Public License for more details.
+  *
+  * You should have received a copy of the GNU Lesser General Public
+  * License along with GROMACS; if not, see
+  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+  *
+  * If you want to redistribute modifications to GROMACS, please
+  * consider that scientific software is very special. Version
+  * control is crucial - bugs must be traceable. We will be happy to
+  * consider code for inclusion in the official distribution, but
+  * derived work must not be called official GROMACS. Details are found
+  * in the README & COPYING files - if they are missing, get the
+  * official version at http://www.gromacs.org.
+  *
+  * To help us fund GROMACS development, we humbly ask that you cite
+  * the research papers on the package. Check out http://www.gromacs.org.
+  */
+ /*
+  * Note: this file was generated by the GROMACS sparc64_hpc_ace_double kernel generator.
+  */
+ #ifdef HAVE_CONFIG_H
+ #include <config.h>
+ #endif
+ #include <math.h>
+ #include "../nb_kernel.h"
+ #include "types/simple.h"
+ #include "vec.h"
+ #include "nrnb.h"
+ #include "kernelutil_sparc64_hpc_ace_double.h"
+ /*
+  * Gromacs nonbonded kernel:   nb_kernel_ElecEwSh_VdwNone_GeomW3P1_VF_sparc64_hpc_ace_double
+  * Electrostatics interaction: Ewald
+  * VdW interaction:            None
+  * Geometry:                   Water3-Particle
+  * Calculate force/pot:        PotentialAndForce
+  */
+ void
+ nb_kernel_ElecEwSh_VdwNone_GeomW3P1_VF_sparc64_hpc_ace_double
+                     (t_nblist * gmx_restrict                nlist,
+                      rvec * gmx_restrict                    xx,
+                      rvec * gmx_restrict                    ff,
+                      t_forcerec * gmx_restrict              fr,
+                      t_mdatoms * gmx_restrict               mdatoms,
+                      nb_kernel_data_t * gmx_restrict        kernel_data,
+                      t_nrnb * gmx_restrict                  nrnb)
+ {
+     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+      * just 0 for non-waters.
+      * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+      * jnr indices corresponding to data put in the four positions in the SIMD register.
+      */
+     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+     int              jnrA,jnrB;
+     int              j_coord_offsetA,j_coord_offsetB;
+     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+     real             rcutoff_scalar;
+     real             *shiftvec,*fshift,*x,*f;
+     _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+     int              vdwioffset0;
+     _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+     int              vdwioffset1;
+     _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+     int              vdwioffset2;
+     _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+     int              vdwjidx0A,vdwjidx0B;
+     _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+     _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+     _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+     _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+     _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+     real             *charge;
+     _fjsp_v2r8       ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+     real             *ewtab;
+     _fjsp_v2r8       itab_tmp;
+     _fjsp_v2r8       dummy_mask,cutoff_mask;
+     _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+     _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+     union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+     x                = xx[0];
+     f                = ff[0];
+     nri              = nlist->nri;
+     iinr             = nlist->iinr;
+     jindex           = nlist->jindex;
+     jjnr             = nlist->jjnr;
+     shiftidx         = nlist->shift;
+     gid              = nlist->gid;
+     shiftvec         = fr->shift_vec[0];
+     fshift           = fr->fshift[0];
+     facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+     charge           = mdatoms->chargeA;
+     sh_ewald         = gmx_fjsp_set1_v2r8(fr->ic->sh_ewald);
+     ewtab            = fr->ic->tabq_coul_FDV0;
+     ewtabscale       = gmx_fjsp_set1_v2r8(fr->ic->tabq_scale);
+     ewtabhalfspace   = gmx_fjsp_set1_v2r8(0.5/fr->ic->tabq_scale);
+     /* Setup water-specific parameters */
+     inr              = nlist->iinr[0];
+     iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+0]));
+     iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+     iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+     /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+     rcutoff_scalar   = fr->rcoulomb;
+     rcutoff          = gmx_fjsp_set1_v2r8(rcutoff_scalar);
+     rcutoff2         = _fjsp_mul_v2r8(rcutoff,rcutoff);
+     /* Avoid stupid compiler warnings */
+     jnrA = jnrB = 0;
+     j_coord_offsetA = 0;
+     j_coord_offsetB = 0;
+     outeriter        = 0;
+     inneriter        = 0;
+     /* Start outer loop over neighborlists */
+     for(iidx=0; iidx<nri; iidx++)
+     {
+         /* Load shift vector for this list */
+         i_shift_offset   = DIM*shiftidx[iidx];
+         /* Load limits for loop over neighbors */
+         j_index_start    = jindex[iidx];
+         j_index_end      = jindex[iidx+1];
+         /* Get outer coordinate index */
+         inr              = iinr[iidx];
+         i_coord_offset   = DIM*inr;
+         /* Load i particle coords and add shift vector */
+         gmx_fjsp_load_shift_and_3rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                  &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+         fix0             = _fjsp_setzero_v2r8();
+         fiy0             = _fjsp_setzero_v2r8();
+         fiz0             = _fjsp_setzero_v2r8();
+         fix1             = _fjsp_setzero_v2r8();
+         fiy1             = _fjsp_setzero_v2r8();
+         fiz1             = _fjsp_setzero_v2r8();
+         fix2             = _fjsp_setzero_v2r8();
+         fiy2             = _fjsp_setzero_v2r8();
+         fiz2             = _fjsp_setzero_v2r8();
+         /* Reset potential sums */
+         velecsum         = _fjsp_setzero_v2r8();
+         /* Start inner kernel loop */
+         for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+         {
+             /* Get j neighbor index, and coordinate index */
+             jnrA             = jjnr[jidx];
+             jnrB             = jjnr[jidx+1];
+             j_coord_offsetA  = DIM*jnrA;
+             j_coord_offsetB  = DIM*jnrB;
+             /* load j atom coordinates */
+             gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                               &jx0,&jy0,&jz0);
+             /* Calculate displacement vector */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             dx10             = _fjsp_sub_v2r8(ix1,jx0);
+             dy10             = _fjsp_sub_v2r8(iy1,jy0);
+             dz10             = _fjsp_sub_v2r8(iz1,jz0);
+             dx20             = _fjsp_sub_v2r8(ix2,jx0);
+             dy20             = _fjsp_sub_v2r8(iy2,jy0);
+             dz20             = _fjsp_sub_v2r8(iz2,jz0);
+             /* Calculate squared distance and things based on it */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+             rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+             rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+             rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+             rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+             rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+             rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+             rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+             /* Load parameters for j particles */
+             jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+             fjx0             = _fjsp_setzero_v2r8();
+             fjy0             = _fjsp_setzero_v2r8();
+             fjz0             = _fjsp_setzero_v2r8();
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+             {
+             r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+             /* Compute parameters for interactions between i and j atoms */
+             qq00             = _fjsp_mul_v2r8(iq0,jq0);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r00,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq00,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv00,sh_ewald),velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,rinv00),_fjsp_sub_v2r8(rinvsq00,felec));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             
+             fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq10,rcutoff2))
+             {
+             r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+             /* Compute parameters for interactions between i and j atoms */
+             qq10             = _fjsp_mul_v2r8(iq1,jq0);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r10,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq10,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv10,sh_ewald),velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,rinv10),_fjsp_sub_v2r8(rinvsq10,felec));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq10,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+             
+             fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq20,rcutoff2))
+             {
+             r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+             /* Compute parameters for interactions between i and j atoms */
+             qq20             = _fjsp_mul_v2r8(iq2,jq0);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r20,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq20,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv20,sh_ewald),velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,rinv20),_fjsp_sub_v2r8(rinvsq20,felec));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq20,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+             
+             fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+             }
+             gmx_fjsp_decrement_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0);
+             /* Inner loop uses 150 flops */
+         }
+         if(jidx<j_index_end)
+         {
+             jnrA             = jjnr[jidx];
+             j_coord_offsetA  = DIM*jnrA;
+             /* load j atom coordinates */
+             gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                               &jx0,&jy0,&jz0);
+             /* Calculate displacement vector */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             dx10             = _fjsp_sub_v2r8(ix1,jx0);
+             dy10             = _fjsp_sub_v2r8(iy1,jy0);
+             dz10             = _fjsp_sub_v2r8(iz1,jz0);
+             dx20             = _fjsp_sub_v2r8(ix2,jx0);
+             dy20             = _fjsp_sub_v2r8(iy2,jy0);
+             dz20             = _fjsp_sub_v2r8(iz2,jz0);
+             /* Calculate squared distance and things based on it */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+             rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+             rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+             rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+             rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+             rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+             rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+             rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+             /* Load parameters for j particles */
+             jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
+             fjx0             = _fjsp_setzero_v2r8();
+             fjy0             = _fjsp_setzero_v2r8();
+             fjz0             = _fjsp_setzero_v2r8();
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+             {
+             r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+             /* Compute parameters for interactions between i and j atoms */
+             qq00             = _fjsp_mul_v2r8(iq0,jq0);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r00,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq00,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv00,sh_ewald),velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,rinv00),_fjsp_sub_v2r8(rinvsq00,felec));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             
+             fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq10,rcutoff2))
+             {
+             r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+             /* Compute parameters for interactions between i and j atoms */
+             qq10             = _fjsp_mul_v2r8(iq1,jq0);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r10,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq10,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv10,sh_ewald),velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,rinv10),_fjsp_sub_v2r8(rinvsq10,felec));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq10,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+             
+             fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq20,rcutoff2))
+             {
+             r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+             /* Compute parameters for interactions between i and j atoms */
+             qq20             = _fjsp_mul_v2r8(iq2,jq0);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r20,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq20,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv20,sh_ewald),velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,rinv20),_fjsp_sub_v2r8(rinvsq20,felec));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq20,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+             
+             fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+             }
+             gmx_fjsp_decrement_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0);
+             /* Inner loop uses 150 flops */
+         }
+         /* End of innermost loop */
+         gmx_fjsp_update_iforce_3atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                               f+i_coord_offset,fshift+i_shift_offset);
+         ggid                        = gid[iidx];
+         /* Update potential energies */
+         gmx_fjsp_update_1pot_v2r8(velecsum,kernel_data->energygrp_elec+ggid);
+         /* Increment number of inner iterations */
+         inneriter                  += j_index_end - j_index_start;
+         /* Outer loop uses 19 flops */
+     }
+     /* Increment number of outer iterations */
+     outeriter        += nri;
+     /* Update outer/inner flops */
+     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3_VF,outeriter*19 + inneriter*150);
+ }
+ /*
+  * Gromacs nonbonded kernel:   nb_kernel_ElecEwSh_VdwNone_GeomW3P1_F_sparc64_hpc_ace_double
+  * Electrostatics interaction: Ewald
+  * VdW interaction:            None
+  * Geometry:                   Water3-Particle
+  * Calculate force/pot:        Force
+  */
+ void
+ nb_kernel_ElecEwSh_VdwNone_GeomW3P1_F_sparc64_hpc_ace_double
+                     (t_nblist * gmx_restrict                nlist,
+                      rvec * gmx_restrict                    xx,
+                      rvec * gmx_restrict                    ff,
+                      t_forcerec * gmx_restrict              fr,
+                      t_mdatoms * gmx_restrict               mdatoms,
+                      nb_kernel_data_t * gmx_restrict        kernel_data,
+                      t_nrnb * gmx_restrict                  nrnb)
+ {
+     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+      * just 0 for non-waters.
+      * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+      * jnr indices corresponding to data put in the two positions in the SIMD register.
+      */
+     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+     int              jnrA,jnrB;
+     int              j_coord_offsetA,j_coord_offsetB;
+     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+     real             rcutoff_scalar;
+     real             *shiftvec,*fshift,*x,*f;
+     _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+     int              vdwioffset0;
+     _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+     int              vdwioffset1;
+     _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+     int              vdwioffset2;
+     _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+     int              vdwjidx0A,vdwjidx0B;
+     _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+     _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+     _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+     _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+     _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+     real             *charge;
+     _fjsp_v2r8       ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+     real             *ewtab;
+     _fjsp_v2r8       itab_tmp;
+     _fjsp_v2r8       dummy_mask,cutoff_mask;
+     _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+     _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+     union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+     x                = xx[0];
+     f                = ff[0];
+     nri              = nlist->nri;
+     iinr             = nlist->iinr;
+     jindex           = nlist->jindex;
+     jjnr             = nlist->jjnr;
+     shiftidx         = nlist->shift;
+     gid              = nlist->gid;
+     shiftvec         = fr->shift_vec[0];
+     fshift           = fr->fshift[0];
+     facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+     charge           = mdatoms->chargeA;
+     sh_ewald         = gmx_fjsp_set1_v2r8(fr->ic->sh_ewald);
+     ewtab            = fr->ic->tabq_coul_F;
+     ewtabscale       = gmx_fjsp_set1_v2r8(fr->ic->tabq_scale);
+     ewtabhalfspace   = gmx_fjsp_set1_v2r8(0.5/fr->ic->tabq_scale);
+     /* Setup water-specific parameters */
+     inr              = nlist->iinr[0];
+     iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+0]));
+     iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+     iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+     /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+     rcutoff_scalar   = fr->rcoulomb;
+     rcutoff          = gmx_fjsp_set1_v2r8(rcutoff_scalar);
+     rcutoff2         = _fjsp_mul_v2r8(rcutoff,rcutoff);
+     /* Avoid stupid compiler warnings */
+     jnrA = jnrB = 0;
+     j_coord_offsetA = 0;
+     j_coord_offsetB = 0;
+     outeriter        = 0;
+     inneriter        = 0;
+     /* Start outer loop over neighborlists */
+     for(iidx=0; iidx<nri; iidx++)
+     {
+         /* Load shift vector for this list */
+         i_shift_offset   = DIM*shiftidx[iidx];
+         /* Load limits for loop over neighbors */
+         j_index_start    = jindex[iidx];
+         j_index_end      = jindex[iidx+1];
+         /* Get outer coordinate index */
+         inr              = iinr[iidx];
+         i_coord_offset   = DIM*inr;
+         /* Load i particle coords and add shift vector */
+         gmx_fjsp_load_shift_and_3rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                  &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+         fix0             = _fjsp_setzero_v2r8();
+         fiy0             = _fjsp_setzero_v2r8();
+         fiz0             = _fjsp_setzero_v2r8();
+         fix1             = _fjsp_setzero_v2r8();
+         fiy1             = _fjsp_setzero_v2r8();
+         fiz1             = _fjsp_setzero_v2r8();
+         fix2             = _fjsp_setzero_v2r8();
+         fiy2             = _fjsp_setzero_v2r8();
+         fiz2             = _fjsp_setzero_v2r8();
+         /* Start inner kernel loop */
+         for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+         {
+             /* Get j neighbor index, and coordinate index */
+             jnrA             = jjnr[jidx];
+             jnrB             = jjnr[jidx+1];
+             j_coord_offsetA  = DIM*jnrA;
+             j_coord_offsetB  = DIM*jnrB;
+             /* load j atom coordinates */
+             gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                               &jx0,&jy0,&jz0);
+             /* Calculate displacement vector */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             dx10             = _fjsp_sub_v2r8(ix1,jx0);
+             dy10             = _fjsp_sub_v2r8(iy1,jy0);
+             dz10             = _fjsp_sub_v2r8(iz1,jz0);
+             dx20             = _fjsp_sub_v2r8(ix2,jx0);
+             dy20             = _fjsp_sub_v2r8(iy2,jy0);
+             dz20             = _fjsp_sub_v2r8(iz2,jz0);
+             /* Calculate squared distance and things based on it */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+             rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+             rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+             rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+             rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+             rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+             rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+             rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+             /* Load parameters for j particles */
+             jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+             fjx0             = _fjsp_setzero_v2r8();
+             fjy0             = _fjsp_setzero_v2r8();
+             fjz0             = _fjsp_setzero_v2r8();
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+             {
+             r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+             /* Compute parameters for interactions between i and j atoms */
+             qq00             = _fjsp_mul_v2r8(iq0,jq0);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r00,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                          &ewtabF,&ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,rinv00),_fjsp_sub_v2r8(rinvsq00,felec));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             
+             fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq10,rcutoff2))
+             {
+             r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+             /* Compute parameters for interactions between i and j atoms */
+             qq10             = _fjsp_mul_v2r8(iq1,jq0);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r10,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                          &ewtabF,&ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,rinv10),_fjsp_sub_v2r8(rinvsq10,felec));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq10,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+             
+             fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq20,rcutoff2))
+             {
+             r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+             /* Compute parameters for interactions between i and j atoms */
+             qq20             = _fjsp_mul_v2r8(iq2,jq0);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r20,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                          &ewtabF,&ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,rinv20),_fjsp_sub_v2r8(rinvsq20,felec));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq20,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+             
+             fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+             }
+             gmx_fjsp_decrement_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0);
+             /* Inner loop uses 129 flops */
+         }
+         if(jidx<j_index_end)
+         {
+             jnrA             = jjnr[jidx];
+             j_coord_offsetA  = DIM*jnrA;
+             /* load j atom coordinates */
+             gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                               &jx0,&jy0,&jz0);
+             /* Calculate displacement vector */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             dx10             = _fjsp_sub_v2r8(ix1,jx0);
+             dy10             = _fjsp_sub_v2r8(iy1,jy0);
+             dz10             = _fjsp_sub_v2r8(iz1,jz0);
+             dx20             = _fjsp_sub_v2r8(ix2,jx0);
+             dy20             = _fjsp_sub_v2r8(iy2,jy0);
+             dz20             = _fjsp_sub_v2r8(iz2,jz0);
+             /* Calculate squared distance and things based on it */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+             rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+             rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+             rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+             rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+             rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+             rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+             rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+             /* Load parameters for j particles */
+             jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
+             fjx0             = _fjsp_setzero_v2r8();
+             fjy0             = _fjsp_setzero_v2r8();
+             fjz0             = _fjsp_setzero_v2r8();
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+             {
+             r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+             /* Compute parameters for interactions between i and j atoms */
+             qq00             = _fjsp_mul_v2r8(iq0,jq0);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r00,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,rinv00),_fjsp_sub_v2r8(rinvsq00,felec));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             
+             fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq10,rcutoff2))
+             {
+             r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+             /* Compute parameters for interactions between i and j atoms */
+             qq10             = _fjsp_mul_v2r8(iq1,jq0);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r10,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,rinv10),_fjsp_sub_v2r8(rinvsq10,felec));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq10,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+             
+             fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq20,rcutoff2))
+             {
+             r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+             /* Compute parameters for interactions between i and j atoms */
+             qq20             = _fjsp_mul_v2r8(iq2,jq0);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r20,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,rinv20),_fjsp_sub_v2r8(rinvsq20,felec));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq20,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+             
+             fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+             }
+             gmx_fjsp_decrement_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0);
+             /* Inner loop uses 129 flops */
+         }
+         /* End of innermost loop */
+         gmx_fjsp_update_iforce_3atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                               f+i_coord_offset,fshift+i_shift_offset);
+         /* Increment number of inner iterations */
+         inneriter                  += j_index_end - j_index_start;
+         /* Outer loop uses 18 flops */
+     }
+     /* Increment number of outer iterations */
+     outeriter        += nri;
+     /* Update outer/inner flops */
+     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3_F,outeriter*18 + inneriter*129);
+ }
index 0000000000000000000000000000000000000000,9f2d5aeec8f0396d2746dff35a73d7f73ebbe978..9f2d5aeec8f0396d2746dff35a73d7f73ebbe978
mode 000000,100644..100644
--- /dev/null
@@@ -1,0 -1,2341 +1,2341 @@@
+ /*
+  * This file is part of the GROMACS molecular simulation package.
+  *
+  * Copyright (c) 2012, by the GROMACS development team, led by
+  * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+  * others, as listed in the AUTHORS file in the top-level source
+  * directory and at http://www.gromacs.org.
+  *
+  * GROMACS is free software; you can redistribute it and/or
+  * modify it under the terms of the GNU Lesser General Public License
+  * as published by the Free Software Foundation; either version 2.1
+  * of the License, or (at your option) any later version.
+  *
+  * GROMACS is distributed in the hope that it will be useful,
+  * but WITHOUT ANY WARRANTY; without even the implied warranty of
+  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  * Lesser General Public License for more details.
+  *
+  * You should have received a copy of the GNU Lesser General Public
+  * License along with GROMACS; if not, see
+  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+  *
+  * If you want to redistribute modifications to GROMACS, please
+  * consider that scientific software is very special. Version
+  * control is crucial - bugs must be traceable. We will be happy to
+  * consider code for inclusion in the official distribution, but
+  * derived work must not be called official GROMACS. Details are found
+  * in the README & COPYING files - if they are missing, get the
+  * official version at http://www.gromacs.org.
+  *
+  * To help us fund GROMACS development, we humbly ask that you cite
+  * the research papers on the package. Check out http://www.gromacs.org.
+  */
+ /*
+  * Note: this file was generated by the GROMACS sparc64_hpc_ace_double kernel generator.
+  */
+ #ifdef HAVE_CONFIG_H
+ #include <config.h>
+ #endif
+ #include <math.h>
+ #include "../nb_kernel.h"
+ #include "types/simple.h"
+ #include "vec.h"
+ #include "nrnb.h"
+ #include "kernelutil_sparc64_hpc_ace_double.h"
+ /*
+  * Gromacs nonbonded kernel:   nb_kernel_ElecEwSh_VdwNone_GeomW3W3_VF_sparc64_hpc_ace_double
+  * Electrostatics interaction: Ewald
+  * VdW interaction:            None
+  * Geometry:                   Water3-Water3
+  * Calculate force/pot:        PotentialAndForce
+  */
+ void
+ nb_kernel_ElecEwSh_VdwNone_GeomW3W3_VF_sparc64_hpc_ace_double
+                     (t_nblist * gmx_restrict                nlist,
+                      rvec * gmx_restrict                    xx,
+                      rvec * gmx_restrict                    ff,
+                      t_forcerec * gmx_restrict              fr,
+                      t_mdatoms * gmx_restrict               mdatoms,
+                      nb_kernel_data_t * gmx_restrict        kernel_data,
+                      t_nrnb * gmx_restrict                  nrnb)
+ {
+     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+      * just 0 for non-waters.
+      * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+      * jnr indices corresponding to data put in the two positions in the SIMD register.
+      */
+     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+     int              jnrA,jnrB;
+     int              j_coord_offsetA,j_coord_offsetB;
+     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+     real             rcutoff_scalar;
+     real             *shiftvec,*fshift,*x,*f;
+     _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+     int              vdwioffset0;
+     _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+     int              vdwioffset1;
+     _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+     int              vdwioffset2;
+     _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+     int              vdwjidx0A,vdwjidx0B;
+     _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+     int              vdwjidx1A,vdwjidx1B;
+     _fjsp_v2r8       jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+     int              vdwjidx2A,vdwjidx2B;
+     _fjsp_v2r8       jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+     _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+     _fjsp_v2r8       dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
+     _fjsp_v2r8       dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
+     _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+     _fjsp_v2r8       dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+     _fjsp_v2r8       dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+     _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+     _fjsp_v2r8       dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+     _fjsp_v2r8       dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+     _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+     real             *charge;
+     _fjsp_v2r8       ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+     real             *ewtab;
+     _fjsp_v2r8       itab_tmp;
+     _fjsp_v2r8       dummy_mask,cutoff_mask;
+     _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+     _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+     union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+     x                = xx[0];
+     f                = ff[0];
+     nri              = nlist->nri;
+     iinr             = nlist->iinr;
+     jindex           = nlist->jindex;
+     jjnr             = nlist->jjnr;
+     shiftidx         = nlist->shift;
+     gid              = nlist->gid;
+     shiftvec         = fr->shift_vec[0];
+     fshift           = fr->fshift[0];
+     facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+     charge           = mdatoms->chargeA;
+     sh_ewald         = gmx_fjsp_set1_v2r8(fr->ic->sh_ewald);
+     ewtab            = fr->ic->tabq_coul_FDV0;
+     ewtabscale       = gmx_fjsp_set1_v2r8(fr->ic->tabq_scale);
+     ewtabhalfspace   = gmx_fjsp_set1_v2r8(0.5/fr->ic->tabq_scale);
+     /* Setup water-specific parameters */
+     inr              = nlist->iinr[0];
+     iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+0]));
+     iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+     iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+     jq0              = gmx_fjsp_set1_v2r8(charge[inr+0]);
+     jq1              = gmx_fjsp_set1_v2r8(charge[inr+1]);
+     jq2              = gmx_fjsp_set1_v2r8(charge[inr+2]);
+     qq00             = _fjsp_mul_v2r8(iq0,jq0);
+     qq01             = _fjsp_mul_v2r8(iq0,jq1);
+     qq02             = _fjsp_mul_v2r8(iq0,jq2);
+     qq10             = _fjsp_mul_v2r8(iq1,jq0);
+     qq11             = _fjsp_mul_v2r8(iq1,jq1);
+     qq12             = _fjsp_mul_v2r8(iq1,jq2);
+     qq20             = _fjsp_mul_v2r8(iq2,jq0);
+     qq21             = _fjsp_mul_v2r8(iq2,jq1);
+     qq22             = _fjsp_mul_v2r8(iq2,jq2);
+     /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+     rcutoff_scalar   = fr->rcoulomb;
+     rcutoff          = gmx_fjsp_set1_v2r8(rcutoff_scalar);
+     rcutoff2         = _fjsp_mul_v2r8(rcutoff,rcutoff);
+     /* Avoid stupid compiler warnings */
+     jnrA = jnrB = 0;
+     j_coord_offsetA = 0;
+     j_coord_offsetB = 0;
+     outeriter        = 0;
+     inneriter        = 0;
+     /* Start outer loop over neighborlists */
+     for(iidx=0; iidx<nri; iidx++)
+     {
+         /* Load shift vector for this list */
+         i_shift_offset   = DIM*shiftidx[iidx];
+         /* Load limits for loop over neighbors */
+         j_index_start    = jindex[iidx];
+         j_index_end      = jindex[iidx+1];
+         /* Get outer coordinate index */
+         inr              = iinr[iidx];
+         i_coord_offset   = DIM*inr;
+         /* Load i particle coords and add shift vector */
+         gmx_fjsp_load_shift_and_3rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                  &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+         fix0             = _fjsp_setzero_v2r8();
+         fiy0             = _fjsp_setzero_v2r8();
+         fiz0             = _fjsp_setzero_v2r8();
+         fix1             = _fjsp_setzero_v2r8();
+         fiy1             = _fjsp_setzero_v2r8();
+         fiz1             = _fjsp_setzero_v2r8();
+         fix2             = _fjsp_setzero_v2r8();
+         fiy2             = _fjsp_setzero_v2r8();
+         fiz2             = _fjsp_setzero_v2r8();
+         /* Reset potential sums */
+         velecsum         = _fjsp_setzero_v2r8();
+         /* Start inner kernel loop */
+         for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+         {
+             /* Get j neighbor index, and coordinate index */
+             jnrA             = jjnr[jidx];
+             jnrB             = jjnr[jidx+1];
+             j_coord_offsetA  = DIM*jnrA;
+             j_coord_offsetB  = DIM*jnrB;
+             /* load j atom coordinates */
+             gmx_fjsp_load_3rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+             /* Calculate displacement vector */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             dx01             = _fjsp_sub_v2r8(ix0,jx1);
+             dy01             = _fjsp_sub_v2r8(iy0,jy1);
+             dz01             = _fjsp_sub_v2r8(iz0,jz1);
+             dx02             = _fjsp_sub_v2r8(ix0,jx2);
+             dy02             = _fjsp_sub_v2r8(iy0,jy2);
+             dz02             = _fjsp_sub_v2r8(iz0,jz2);
+             dx10             = _fjsp_sub_v2r8(ix1,jx0);
+             dy10             = _fjsp_sub_v2r8(iy1,jy0);
+             dz10             = _fjsp_sub_v2r8(iz1,jz0);
+             dx11             = _fjsp_sub_v2r8(ix1,jx1);
+             dy11             = _fjsp_sub_v2r8(iy1,jy1);
+             dz11             = _fjsp_sub_v2r8(iz1,jz1);
+             dx12             = _fjsp_sub_v2r8(ix1,jx2);
+             dy12             = _fjsp_sub_v2r8(iy1,jy2);
+             dz12             = _fjsp_sub_v2r8(iz1,jz2);
+             dx20             = _fjsp_sub_v2r8(ix2,jx0);
+             dy20             = _fjsp_sub_v2r8(iy2,jy0);
+             dz20             = _fjsp_sub_v2r8(iz2,jz0);
+             dx21             = _fjsp_sub_v2r8(ix2,jx1);
+             dy21             = _fjsp_sub_v2r8(iy2,jy1);
+             dz21             = _fjsp_sub_v2r8(iz2,jz1);
+             dx22             = _fjsp_sub_v2r8(ix2,jx2);
+             dy22             = _fjsp_sub_v2r8(iy2,jy2);
+             dz22             = _fjsp_sub_v2r8(iz2,jz2);
+             /* Calculate squared distance and things based on it */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rsq01            = gmx_fjsp_calc_rsq_v2r8(dx01,dy01,dz01);
+             rsq02            = gmx_fjsp_calc_rsq_v2r8(dx02,dy02,dz02);
+             rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+             rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+             rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+             rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+             rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+             rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+             rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+             rinv01           = gmx_fjsp_invsqrt_v2r8(rsq01);
+             rinv02           = gmx_fjsp_invsqrt_v2r8(rsq02);
+             rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+             rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+             rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+             rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+             rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+             rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+             rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+             rinvsq01         = _fjsp_mul_v2r8(rinv01,rinv01);
+             rinvsq02         = _fjsp_mul_v2r8(rinv02,rinv02);
+             rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+             rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+             rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+             rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+             rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+             rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+             fjx0             = _fjsp_setzero_v2r8();
+             fjy0             = _fjsp_setzero_v2r8();
+             fjz0             = _fjsp_setzero_v2r8();
+             fjx1             = _fjsp_setzero_v2r8();
+             fjy1             = _fjsp_setzero_v2r8();
+             fjz1             = _fjsp_setzero_v2r8();
+             fjx2             = _fjsp_setzero_v2r8();
+             fjy2             = _fjsp_setzero_v2r8();
+             fjz2             = _fjsp_setzero_v2r8();
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+             {
+             r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r00,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq00,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv00,sh_ewald),velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,rinv00),_fjsp_sub_v2r8(rinvsq00,felec));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             
+             fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq01,rcutoff2))
+             {
+             r01              = _fjsp_mul_v2r8(rsq01,rinv01);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r01,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq01,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv01,sh_ewald),velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq01,rinv01),_fjsp_sub_v2r8(rinvsq01,felec));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq01,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx01,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy01,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz01,fscal,fiz0);
+             
+             fjx1             = _fjsp_madd_v2r8(dx01,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy01,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz01,fscal,fjz1);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq02,rcutoff2))
+             {
+             r02              = _fjsp_mul_v2r8(rsq02,rinv02);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r02,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq02,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv02,sh_ewald),velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq02,rinv02),_fjsp_sub_v2r8(rinvsq02,felec));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq02,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx02,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy02,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz02,fscal,fiz0);
+             
+             fjx2             = _fjsp_madd_v2r8(dx02,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy02,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz02,fscal,fjz2);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq10,rcutoff2))
+             {
+             r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r10,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq10,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv10,sh_ewald),velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,rinv10),_fjsp_sub_v2r8(rinvsq10,felec));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq10,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+             
+             fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq11,rcutoff2))
+             {
+             r11              = _fjsp_mul_v2r8(rsq11,rinv11);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r11,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq11,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv11,sh_ewald),velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq11,rinv11),_fjsp_sub_v2r8(rinvsq11,felec));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq11,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+             
+             fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq12,rcutoff2))
+             {
+             r12              = _fjsp_mul_v2r8(rsq12,rinv12);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r12,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq12,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv12,sh_ewald),velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq12,rinv12),_fjsp_sub_v2r8(rinvsq12,felec));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq12,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+             
+             fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq20,rcutoff2))
+             {
+             r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r20,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq20,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv20,sh_ewald),velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,rinv20),_fjsp_sub_v2r8(rinvsq20,felec));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq20,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+             
+             fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq21,rcutoff2))
+             {
+             r21              = _fjsp_mul_v2r8(rsq21,rinv21);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r21,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq21,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv21,sh_ewald),velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq21,rinv21),_fjsp_sub_v2r8(rinvsq21,felec));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq21,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+             
+             fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq22,rcutoff2))
+             {
+             r22              = _fjsp_mul_v2r8(rsq22,rinv22);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r22,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq22,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv22,sh_ewald),velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq22,rinv22),_fjsp_sub_v2r8(rinvsq22,felec));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq22,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+             
+             fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+             }
+             gmx_fjsp_decrement_3rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+             /* Inner loop uses 441 flops */
+         }
+         if(jidx<j_index_end)
+         {
+             jnrA             = jjnr[jidx];
+             j_coord_offsetA  = DIM*jnrA;
+             /* load j atom coordinates */
+             gmx_fjsp_load_3rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+             /* Calculate displacement vector */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             dx01             = _fjsp_sub_v2r8(ix0,jx1);
+             dy01             = _fjsp_sub_v2r8(iy0,jy1);
+             dz01             = _fjsp_sub_v2r8(iz0,jz1);
+             dx02             = _fjsp_sub_v2r8(ix0,jx2);
+             dy02             = _fjsp_sub_v2r8(iy0,jy2);
+             dz02             = _fjsp_sub_v2r8(iz0,jz2);
+             dx10             = _fjsp_sub_v2r8(ix1,jx0);
+             dy10             = _fjsp_sub_v2r8(iy1,jy0);
+             dz10             = _fjsp_sub_v2r8(iz1,jz0);
+             dx11             = _fjsp_sub_v2r8(ix1,jx1);
+             dy11             = _fjsp_sub_v2r8(iy1,jy1);
+             dz11             = _fjsp_sub_v2r8(iz1,jz1);
+             dx12             = _fjsp_sub_v2r8(ix1,jx2);
+             dy12             = _fjsp_sub_v2r8(iy1,jy2);
+             dz12             = _fjsp_sub_v2r8(iz1,jz2);
+             dx20             = _fjsp_sub_v2r8(ix2,jx0);
+             dy20             = _fjsp_sub_v2r8(iy2,jy0);
+             dz20             = _fjsp_sub_v2r8(iz2,jz0);
+             dx21             = _fjsp_sub_v2r8(ix2,jx1);
+             dy21             = _fjsp_sub_v2r8(iy2,jy1);
+             dz21             = _fjsp_sub_v2r8(iz2,jz1);
+             dx22             = _fjsp_sub_v2r8(ix2,jx2);
+             dy22             = _fjsp_sub_v2r8(iy2,jy2);
+             dz22             = _fjsp_sub_v2r8(iz2,jz2);
+             /* Calculate squared distance and things based on it */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rsq01            = gmx_fjsp_calc_rsq_v2r8(dx01,dy01,dz01);
+             rsq02            = gmx_fjsp_calc_rsq_v2r8(dx02,dy02,dz02);
+             rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+             rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+             rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+             rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+             rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+             rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+             rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+             rinv01           = gmx_fjsp_invsqrt_v2r8(rsq01);
+             rinv02           = gmx_fjsp_invsqrt_v2r8(rsq02);
+             rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+             rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+             rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+             rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+             rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+             rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+             rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+             rinvsq01         = _fjsp_mul_v2r8(rinv01,rinv01);
+             rinvsq02         = _fjsp_mul_v2r8(rinv02,rinv02);
+             rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+             rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+             rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+             rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+             rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+             rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+             fjx0             = _fjsp_setzero_v2r8();
+             fjy0             = _fjsp_setzero_v2r8();
+             fjz0             = _fjsp_setzero_v2r8();
+             fjx1             = _fjsp_setzero_v2r8();
+             fjy1             = _fjsp_setzero_v2r8();
+             fjz1             = _fjsp_setzero_v2r8();
+             fjx2             = _fjsp_setzero_v2r8();
+             fjy2             = _fjsp_setzero_v2r8();
+             fjz2             = _fjsp_setzero_v2r8();
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+             {
+             r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r00,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq00,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv00,sh_ewald),velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,rinv00),_fjsp_sub_v2r8(rinvsq00,felec));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             
+             fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq01,rcutoff2))
+             {
+             r01              = _fjsp_mul_v2r8(rsq01,rinv01);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r01,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq01,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv01,sh_ewald),velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq01,rinv01),_fjsp_sub_v2r8(rinvsq01,felec));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq01,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx01,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy01,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz01,fscal,fiz0);
+             
+             fjx1             = _fjsp_madd_v2r8(dx01,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy01,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz01,fscal,fjz1);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq02,rcutoff2))
+             {
+             r02              = _fjsp_mul_v2r8(rsq02,rinv02);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r02,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq02,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv02,sh_ewald),velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq02,rinv02),_fjsp_sub_v2r8(rinvsq02,felec));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq02,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx02,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy02,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz02,fscal,fiz0);
+             
+             fjx2             = _fjsp_madd_v2r8(dx02,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy02,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz02,fscal,fjz2);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq10,rcutoff2))
+             {
+             r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r10,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq10,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv10,sh_ewald),velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,rinv10),_fjsp_sub_v2r8(rinvsq10,felec));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq10,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+             
+             fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq11,rcutoff2))
+             {
+             r11              = _fjsp_mul_v2r8(rsq11,rinv11);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r11,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq11,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv11,sh_ewald),velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq11,rinv11),_fjsp_sub_v2r8(rinvsq11,felec));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq11,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+             
+             fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq12,rcutoff2))
+             {
+             r12              = _fjsp_mul_v2r8(rsq12,rinv12);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r12,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq12,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv12,sh_ewald),velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq12,rinv12),_fjsp_sub_v2r8(rinvsq12,felec));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq12,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+             
+             fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq20,rcutoff2))
+             {
+             r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r20,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq20,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv20,sh_ewald),velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,rinv20),_fjsp_sub_v2r8(rinvsq20,felec));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq20,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+             
+             fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq21,rcutoff2))
+             {
+             r21              = _fjsp_mul_v2r8(rsq21,rinv21);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r21,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq21,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv21,sh_ewald),velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq21,rinv21),_fjsp_sub_v2r8(rinvsq21,felec));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq21,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+             
+             fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq22,rcutoff2))
+             {
+             r22              = _fjsp_mul_v2r8(rsq22,rinv22);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r22,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq22,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv22,sh_ewald),velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq22,rinv22),_fjsp_sub_v2r8(rinvsq22,felec));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq22,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+             
+             fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+             }
+             gmx_fjsp_decrement_3rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+             /* Inner loop uses 441 flops */
+         }
+         /* End of innermost loop */
+         gmx_fjsp_update_iforce_3atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                               f+i_coord_offset,fshift+i_shift_offset);
+         ggid                        = gid[iidx];
+         /* Update potential energies */
+         gmx_fjsp_update_1pot_v2r8(velecsum,kernel_data->energygrp_elec+ggid);
+         /* Increment number of inner iterations */
+         inneriter                  += j_index_end - j_index_start;
+         /* Outer loop uses 19 flops */
+     }
+     /* Increment number of outer iterations */
+     outeriter        += nri;
+     /* Update outer/inner flops */
+     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3W3_VF,outeriter*19 + inneriter*441);
+ }
+ /*
+  * Gromacs nonbonded kernel:   nb_kernel_ElecEwSh_VdwNone_GeomW3W3_F_sparc64_hpc_ace_double
+  * Electrostatics interaction: Ewald
+  * VdW interaction:            None
+  * Geometry:                   Water3-Water3
+  * Calculate force/pot:        Force
+  */
+ void
+ nb_kernel_ElecEwSh_VdwNone_GeomW3W3_F_sparc64_hpc_ace_double
+                     (t_nblist * gmx_restrict                nlist,
+                      rvec * gmx_restrict                    xx,
+                      rvec * gmx_restrict                    ff,
+                      t_forcerec * gmx_restrict              fr,
+                      t_mdatoms * gmx_restrict               mdatoms,
+                      nb_kernel_data_t * gmx_restrict        kernel_data,
+                      t_nrnb * gmx_restrict                  nrnb)
+ {
+     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+      * just 0 for non-waters.
+      * Suffixes A,B refer to j loop unrolling done with double precision SIMD, i.e. for the two different
+      * jnr indices corresponding to data put in the two positions in the SIMD register.
+      */
+     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+     int              jnrA,jnrB;
+     int              j_coord_offsetA,j_coord_offsetB;
+     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+     real             rcutoff_scalar;
+     real             *shiftvec,*fshift,*x,*f;
+     _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+     int              vdwioffset0;
+     _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+     int              vdwioffset1;
+     _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+     int              vdwioffset2;
+     _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+     int              vdwjidx0A,vdwjidx0B;
+     _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+     int              vdwjidx1A,vdwjidx1B;
+     _fjsp_v2r8       jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+     int              vdwjidx2A,vdwjidx2B;
+     _fjsp_v2r8       jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+     _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+     _fjsp_v2r8       dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
+     _fjsp_v2r8       dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
+     _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+     _fjsp_v2r8       dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+     _fjsp_v2r8       dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+     _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+     _fjsp_v2r8       dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+     _fjsp_v2r8       dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+     _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+     real             *charge;
+     _fjsp_v2r8       ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+     real             *ewtab;
+     _fjsp_v2r8       itab_tmp;
+     _fjsp_v2r8       dummy_mask,cutoff_mask;
+     _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+     _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+     union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+     x                = xx[0];
+     f                = ff[0];
+     nri              = nlist->nri;
+     iinr             = nlist->iinr;
+     jindex           = nlist->jindex;
+     jjnr             = nlist->jjnr;
+     shiftidx         = nlist->shift;
+     gid              = nlist->gid;
+     shiftvec         = fr->shift_vec[0];
+     fshift           = fr->fshift[0];
+     facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+     charge           = mdatoms->chargeA;
+     sh_ewald         = gmx_fjsp_set1_v2r8(fr->ic->sh_ewald);
+     ewtab            = fr->ic->tabq_coul_F;
+     ewtabscale       = gmx_fjsp_set1_v2r8(fr->ic->tabq_scale);
+     ewtabhalfspace   = gmx_fjsp_set1_v2r8(0.5/fr->ic->tabq_scale);
+     /* Setup water-specific parameters */
+     inr              = nlist->iinr[0];
+     iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+0]));
+     iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+     iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+     jq0              = gmx_fjsp_set1_v2r8(charge[inr+0]);
+     jq1              = gmx_fjsp_set1_v2r8(charge[inr+1]);
+     jq2              = gmx_fjsp_set1_v2r8(charge[inr+2]);
+     qq00             = _fjsp_mul_v2r8(iq0,jq0);
+     qq01             = _fjsp_mul_v2r8(iq0,jq1);
+     qq02             = _fjsp_mul_v2r8(iq0,jq2);
+     qq10             = _fjsp_mul_v2r8(iq1,jq0);
+     qq11             = _fjsp_mul_v2r8(iq1,jq1);
+     qq12             = _fjsp_mul_v2r8(iq1,jq2);
+     qq20             = _fjsp_mul_v2r8(iq2,jq0);
+     qq21             = _fjsp_mul_v2r8(iq2,jq1);
+     qq22             = _fjsp_mul_v2r8(iq2,jq2);
+     /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+     rcutoff_scalar   = fr->rcoulomb;
+     rcutoff          = gmx_fjsp_set1_v2r8(rcutoff_scalar);
+     rcutoff2         = _fjsp_mul_v2r8(rcutoff,rcutoff);
+     /* Avoid stupid compiler warnings */
+     jnrA = jnrB = 0;
+     j_coord_offsetA = 0;
+     j_coord_offsetB = 0;
+     outeriter        = 0;
+     inneriter        = 0;
+     /* Start outer loop over neighborlists */
+     for(iidx=0; iidx<nri; iidx++)
+     {
+         /* Load shift vector for this list */
+         i_shift_offset   = DIM*shiftidx[iidx];
+         /* Load limits for loop over neighbors */
+         j_index_start    = jindex[iidx];
+         j_index_end      = jindex[iidx+1];
+         /* Get outer coordinate index */
+         inr              = iinr[iidx];
+         i_coord_offset   = DIM*inr;
+         /* Load i particle coords and add shift vector */
+         gmx_fjsp_load_shift_and_3rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                  &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+         fix0             = _fjsp_setzero_v2r8();
+         fiy0             = _fjsp_setzero_v2r8();
+         fiz0             = _fjsp_setzero_v2r8();
+         fix1             = _fjsp_setzero_v2r8();
+         fiy1             = _fjsp_setzero_v2r8();
+         fiz1             = _fjsp_setzero_v2r8();
+         fix2             = _fjsp_setzero_v2r8();
+         fiy2             = _fjsp_setzero_v2r8();
+         fiz2             = _fjsp_setzero_v2r8();
+         /* Start inner kernel loop */
+         for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+         {
+             /* Get j neighbor index, and coordinate index */
+             jnrA             = jjnr[jidx];
+             jnrB             = jjnr[jidx+1];
+             j_coord_offsetA  = DIM*jnrA;
+             j_coord_offsetB  = DIM*jnrB;
+             /* load j atom coordinates */
+             gmx_fjsp_load_3rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+             /* Calculate displacement vector */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             dx01             = _fjsp_sub_v2r8(ix0,jx1);
+             dy01             = _fjsp_sub_v2r8(iy0,jy1);
+             dz01             = _fjsp_sub_v2r8(iz0,jz1);
+             dx02             = _fjsp_sub_v2r8(ix0,jx2);
+             dy02             = _fjsp_sub_v2r8(iy0,jy2);
+             dz02             = _fjsp_sub_v2r8(iz0,jz2);
+             dx10             = _fjsp_sub_v2r8(ix1,jx0);
+             dy10             = _fjsp_sub_v2r8(iy1,jy0);
+             dz10             = _fjsp_sub_v2r8(iz1,jz0);
+             dx11             = _fjsp_sub_v2r8(ix1,jx1);
+             dy11             = _fjsp_sub_v2r8(iy1,jy1);
+             dz11             = _fjsp_sub_v2r8(iz1,jz1);
+             dx12             = _fjsp_sub_v2r8(ix1,jx2);
+             dy12             = _fjsp_sub_v2r8(iy1,jy2);
+             dz12             = _fjsp_sub_v2r8(iz1,jz2);
+             dx20             = _fjsp_sub_v2r8(ix2,jx0);
+             dy20             = _fjsp_sub_v2r8(iy2,jy0);
+             dz20             = _fjsp_sub_v2r8(iz2,jz0);
+             dx21             = _fjsp_sub_v2r8(ix2,jx1);
+             dy21             = _fjsp_sub_v2r8(iy2,jy1);
+             dz21             = _fjsp_sub_v2r8(iz2,jz1);
+             dx22             = _fjsp_sub_v2r8(ix2,jx2);
+             dy22             = _fjsp_sub_v2r8(iy2,jy2);
+             dz22             = _fjsp_sub_v2r8(iz2,jz2);
+             /* Calculate squared distance and things based on it */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rsq01            = gmx_fjsp_calc_rsq_v2r8(dx01,dy01,dz01);
+             rsq02            = gmx_fjsp_calc_rsq_v2r8(dx02,dy02,dz02);
+             rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+             rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+             rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+             rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+             rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+             rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+             rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+             rinv01           = gmx_fjsp_invsqrt_v2r8(rsq01);
+             rinv02           = gmx_fjsp_invsqrt_v2r8(rsq02);
+             rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+             rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+             rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+             rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+             rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+             rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+             rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+             rinvsq01         = _fjsp_mul_v2r8(rinv01,rinv01);
+             rinvsq02         = _fjsp_mul_v2r8(rinv02,rinv02);
+             rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+             rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+             rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+             rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+             rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+             rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+             fjx0             = _fjsp_setzero_v2r8();
+             fjy0             = _fjsp_setzero_v2r8();
+             fjz0             = _fjsp_setzero_v2r8();
+             fjx1             = _fjsp_setzero_v2r8();
+             fjy1             = _fjsp_setzero_v2r8();
+             fjz1             = _fjsp_setzero_v2r8();
+             fjx2             = _fjsp_setzero_v2r8();
+             fjy2             = _fjsp_setzero_v2r8();
+             fjz2             = _fjsp_setzero_v2r8();
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+             {
+             r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r00,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                          &ewtabF,&ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,rinv00),_fjsp_sub_v2r8(rinvsq00,felec));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             
+             fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq01,rcutoff2))
+             {
+             r01              = _fjsp_mul_v2r8(rsq01,rinv01);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r01,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                          &ewtabF,&ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq01,rinv01),_fjsp_sub_v2r8(rinvsq01,felec));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq01,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx01,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy01,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz01,fscal,fiz0);
+             
+             fjx1             = _fjsp_madd_v2r8(dx01,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy01,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz01,fscal,fjz1);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq02,rcutoff2))
+             {
+             r02              = _fjsp_mul_v2r8(rsq02,rinv02);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r02,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                          &ewtabF,&ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq02,rinv02),_fjsp_sub_v2r8(rinvsq02,felec));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq02,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx02,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy02,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz02,fscal,fiz0);
+             
+             fjx2             = _fjsp_madd_v2r8(dx02,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy02,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz02,fscal,fjz2);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq10,rcutoff2))
+             {
+             r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r10,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                          &ewtabF,&ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,rinv10),_fjsp_sub_v2r8(rinvsq10,felec));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq10,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+             
+             fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq11,rcutoff2))
+             {
+             r11              = _fjsp_mul_v2r8(rsq11,rinv11);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r11,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                          &ewtabF,&ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq11,rinv11),_fjsp_sub_v2r8(rinvsq11,felec));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq11,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+             
+             fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq12,rcutoff2))
+             {
+             r12              = _fjsp_mul_v2r8(rsq12,rinv12);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r12,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                          &ewtabF,&ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq12,rinv12),_fjsp_sub_v2r8(rinvsq12,felec));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq12,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+             
+             fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq20,rcutoff2))
+             {
+             r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r20,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                          &ewtabF,&ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,rinv20),_fjsp_sub_v2r8(rinvsq20,felec));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq20,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+             
+             fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq21,rcutoff2))
+             {
+             r21              = _fjsp_mul_v2r8(rsq21,rinv21);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r21,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                          &ewtabF,&ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq21,rinv21),_fjsp_sub_v2r8(rinvsq21,felec));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq21,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+             
+             fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq22,rcutoff2))
+             {
+             r22              = _fjsp_mul_v2r8(rsq22,rinv22);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r22,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                          &ewtabF,&ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq22,rinv22),_fjsp_sub_v2r8(rinvsq22,felec));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq22,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+             
+             fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+             }
+             gmx_fjsp_decrement_3rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+             /* Inner loop uses 378 flops */
+         }
+         if(jidx<j_index_end)
+         {
+             jnrA             = jjnr[jidx];
+             j_coord_offsetA  = DIM*jnrA;
+             /* load j atom coordinates */
+             gmx_fjsp_load_3rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+             /* Calculate displacement vector */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             dx01             = _fjsp_sub_v2r8(ix0,jx1);
+             dy01             = _fjsp_sub_v2r8(iy0,jy1);
+             dz01             = _fjsp_sub_v2r8(iz0,jz1);
+             dx02             = _fjsp_sub_v2r8(ix0,jx2);
+             dy02             = _fjsp_sub_v2r8(iy0,jy2);
+             dz02             = _fjsp_sub_v2r8(iz0,jz2);
+             dx10             = _fjsp_sub_v2r8(ix1,jx0);
+             dy10             = _fjsp_sub_v2r8(iy1,jy0);
+             dz10             = _fjsp_sub_v2r8(iz1,jz0);
+             dx11             = _fjsp_sub_v2r8(ix1,jx1);
+             dy11             = _fjsp_sub_v2r8(iy1,jy1);
+             dz11             = _fjsp_sub_v2r8(iz1,jz1);
+             dx12             = _fjsp_sub_v2r8(ix1,jx2);
+             dy12             = _fjsp_sub_v2r8(iy1,jy2);
+             dz12             = _fjsp_sub_v2r8(iz1,jz2);
+             dx20             = _fjsp_sub_v2r8(ix2,jx0);
+             dy20             = _fjsp_sub_v2r8(iy2,jy0);
+             dz20             = _fjsp_sub_v2r8(iz2,jz0);
+             dx21             = _fjsp_sub_v2r8(ix2,jx1);
+             dy21             = _fjsp_sub_v2r8(iy2,jy1);
+             dz21             = _fjsp_sub_v2r8(iz2,jz1);
+             dx22             = _fjsp_sub_v2r8(ix2,jx2);
+             dy22             = _fjsp_sub_v2r8(iy2,jy2);
+             dz22             = _fjsp_sub_v2r8(iz2,jz2);
+             /* Calculate squared distance and things based on it */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rsq01            = gmx_fjsp_calc_rsq_v2r8(dx01,dy01,dz01);
+             rsq02            = gmx_fjsp_calc_rsq_v2r8(dx02,dy02,dz02);
+             rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+             rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+             rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+             rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+             rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+             rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+             rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+             rinv01           = gmx_fjsp_invsqrt_v2r8(rsq01);
+             rinv02           = gmx_fjsp_invsqrt_v2r8(rsq02);
+             rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+             rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+             rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+             rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+             rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+             rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+             rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+             rinvsq01         = _fjsp_mul_v2r8(rinv01,rinv01);
+             rinvsq02         = _fjsp_mul_v2r8(rinv02,rinv02);
+             rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+             rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+             rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+             rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+             rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+             rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+             fjx0             = _fjsp_setzero_v2r8();
+             fjy0             = _fjsp_setzero_v2r8();
+             fjz0             = _fjsp_setzero_v2r8();
+             fjx1             = _fjsp_setzero_v2r8();
+             fjy1             = _fjsp_setzero_v2r8();
+             fjz1             = _fjsp_setzero_v2r8();
+             fjx2             = _fjsp_setzero_v2r8();
+             fjy2             = _fjsp_setzero_v2r8();
+             fjz2             = _fjsp_setzero_v2r8();
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+             {
+             r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r00,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,rinv00),_fjsp_sub_v2r8(rinvsq00,felec));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             
+             fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq01,rcutoff2))
+             {
+             r01              = _fjsp_mul_v2r8(rsq01,rinv01);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r01,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq01,rinv01),_fjsp_sub_v2r8(rinvsq01,felec));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq01,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx01,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy01,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz01,fscal,fiz0);
+             
+             fjx1             = _fjsp_madd_v2r8(dx01,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy01,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz01,fscal,fjz1);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq02,rcutoff2))
+             {
+             r02              = _fjsp_mul_v2r8(rsq02,rinv02);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r02,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq02,rinv02),_fjsp_sub_v2r8(rinvsq02,felec));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq02,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx02,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy02,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz02,fscal,fiz0);
+             
+             fjx2             = _fjsp_madd_v2r8(dx02,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy02,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz02,fscal,fjz2);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq10,rcutoff2))
+             {
+             r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r10,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,rinv10),_fjsp_sub_v2r8(rinvsq10,felec));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq10,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+             
+             fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq11,rcutoff2))
+             {
+             r11              = _fjsp_mul_v2r8(rsq11,rinv11);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r11,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq11,rinv11),_fjsp_sub_v2r8(rinvsq11,felec));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq11,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+             
+             fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq12,rcutoff2))
+             {
+             r12              = _fjsp_mul_v2r8(rsq12,rinv12);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r12,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq12,rinv12),_fjsp_sub_v2r8(rinvsq12,felec));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq12,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+             
+             fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq20,rcutoff2))
+             {
+             r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r20,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,rinv20),_fjsp_sub_v2r8(rinvsq20,felec));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq20,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+             
+             fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq21,rcutoff2))
+             {
+             r21              = _fjsp_mul_v2r8(rsq21,rinv21);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r21,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq21,rinv21),_fjsp_sub_v2r8(rinvsq21,felec));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq21,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+             
+             fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq22,rcutoff2))
+             {
+             r22              = _fjsp_mul_v2r8(rsq22,rinv22);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r22,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq22,rinv22),_fjsp_sub_v2r8(rinvsq22,felec));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq22,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+             
+             fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+             }
+             gmx_fjsp_decrement_3rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+             /* Inner loop uses 378 flops */
+         }
+         /* End of innermost loop */
+         gmx_fjsp_update_iforce_3atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                               f+i_coord_offset,fshift+i_shift_offset);
+         /* Increment number of inner iterations */
+         inneriter                  += j_index_end - j_index_start;
+         /* Outer loop uses 18 flops */
+     }
+     /* Increment number of outer iterations */
+     outeriter        += nri;
+     /* Update outer/inner flops */
+     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3W3_F,outeriter*18 + inneriter*378);
+ }
index 0000000000000000000000000000000000000000,8c18557e630086bd6fe9ea961c88c5ed92536a9c..8c18557e630086bd6fe9ea961c88c5ed92536a9c
mode 000000,100644..100644
--- /dev/null
@@@ -1,0 -1,1095 +1,1095 @@@
+ /*
+  * This file is part of the GROMACS molecular simulation package.
+  *
+  * Copyright (c) 2012, by the GROMACS development team, led by
+  * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+  * others, as listed in the AUTHORS file in the top-level source
+  * directory and at http://www.gromacs.org.
+  *
+  * GROMACS is free software; you can redistribute it and/or
+  * modify it under the terms of the GNU Lesser General Public License
+  * as published by the Free Software Foundation; either version 2.1
+  * of the License, or (at your option) any later version.
+  *
+  * GROMACS is distributed in the hope that it will be useful,
+  * but WITHOUT ANY WARRANTY; without even the implied warranty of
+  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  * Lesser General Public License for more details.
+  *
+  * You should have received a copy of the GNU Lesser General Public
+  * License along with GROMACS; if not, see
+  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+  *
+  * If you want to redistribute modifications to GROMACS, please
+  * consider that scientific software is very special. Version
+  * control is crucial - bugs must be traceable. We will be happy to
+  * consider code for inclusion in the official distribution, but
+  * derived work must not be called official GROMACS. Details are found
+  * in the README & COPYING files - if they are missing, get the
+  * official version at http://www.gromacs.org.
+  *
+  * To help us fund GROMACS development, we humbly ask that you cite
+  * the research papers on the package. Check out http://www.gromacs.org.
+  */
+ /*
+  * Note: this file was generated by the GROMACS sparc64_hpc_ace_double kernel generator.
+  */
+ #ifdef HAVE_CONFIG_H
+ #include <config.h>
+ #endif
+ #include <math.h>
+ #include "../nb_kernel.h"
+ #include "types/simple.h"
+ #include "vec.h"
+ #include "nrnb.h"
+ #include "kernelutil_sparc64_hpc_ace_double.h"
+ /*
+  * Gromacs nonbonded kernel:   nb_kernel_ElecEwSh_VdwNone_GeomW4P1_VF_sparc64_hpc_ace_double
+  * Electrostatics interaction: Ewald
+  * VdW interaction:            None
+  * Geometry:                   Water4-Particle
+  * Calculate force/pot:        PotentialAndForce
+  */
+ void
+ nb_kernel_ElecEwSh_VdwNone_GeomW4P1_VF_sparc64_hpc_ace_double
+                     (t_nblist * gmx_restrict                nlist,
+                      rvec * gmx_restrict                    xx,
+                      rvec * gmx_restrict                    ff,
+                      t_forcerec * gmx_restrict              fr,
+                      t_mdatoms * gmx_restrict               mdatoms,
+                      nb_kernel_data_t * gmx_restrict        kernel_data,
+                      t_nrnb * gmx_restrict                  nrnb)
+ {
+     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+      * just 0 for non-waters.
+      * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+      * jnr indices corresponding to data put in the two positions in the SIMD register.
+      */
+     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+     int              jnrA,jnrB;
+     int              j_coord_offsetA,j_coord_offsetB;
+     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+     real             rcutoff_scalar;
+     real             *shiftvec,*fshift,*x,*f;
+     _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+     int              vdwioffset1;
+     _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+     int              vdwioffset2;
+     _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+     int              vdwioffset3;
+     _fjsp_v2r8       ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+     int              vdwjidx0A,vdwjidx0B;
+     _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+     _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+     _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+     _fjsp_v2r8       dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30;
+     _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+     real             *charge;
+     _fjsp_v2r8       ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+     real             *ewtab;
+     _fjsp_v2r8       itab_tmp;
+     _fjsp_v2r8       dummy_mask,cutoff_mask;
+     _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+     _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+     union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+     x                = xx[0];
+     f                = ff[0];
+     nri              = nlist->nri;
+     iinr             = nlist->iinr;
+     jindex           = nlist->jindex;
+     jjnr             = nlist->jjnr;
+     shiftidx         = nlist->shift;
+     gid              = nlist->gid;
+     shiftvec         = fr->shift_vec[0];
+     fshift           = fr->fshift[0];
+     facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+     charge           = mdatoms->chargeA;
+     sh_ewald         = gmx_fjsp_set1_v2r8(fr->ic->sh_ewald);
+     ewtab            = fr->ic->tabq_coul_FDV0;
+     ewtabscale       = gmx_fjsp_set1_v2r8(fr->ic->tabq_scale);
+     ewtabhalfspace   = gmx_fjsp_set1_v2r8(0.5/fr->ic->tabq_scale);
+     /* Setup water-specific parameters */
+     inr              = nlist->iinr[0];
+     iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+     iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+     iq3              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+3]));
+     /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+     rcutoff_scalar   = fr->rcoulomb;
+     rcutoff          = gmx_fjsp_set1_v2r8(rcutoff_scalar);
+     rcutoff2         = _fjsp_mul_v2r8(rcutoff,rcutoff);
+     /* Avoid stupid compiler warnings */
+     jnrA = jnrB = 0;
+     j_coord_offsetA = 0;
+     j_coord_offsetB = 0;
+     outeriter        = 0;
+     inneriter        = 0;
+     /* Start outer loop over neighborlists */
+     for(iidx=0; iidx<nri; iidx++)
+     {
+         /* Load shift vector for this list */
+         i_shift_offset   = DIM*shiftidx[iidx];
+         /* Load limits for loop over neighbors */
+         j_index_start    = jindex[iidx];
+         j_index_end      = jindex[iidx+1];
+         /* Get outer coordinate index */
+         inr              = iinr[iidx];
+         i_coord_offset   = DIM*inr;
+         /* Load i particle coords and add shift vector */
+         gmx_fjsp_load_shift_and_3rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset+DIM,
+                                                  &ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+         fix1             = _fjsp_setzero_v2r8();
+         fiy1             = _fjsp_setzero_v2r8();
+         fiz1             = _fjsp_setzero_v2r8();
+         fix2             = _fjsp_setzero_v2r8();
+         fiy2             = _fjsp_setzero_v2r8();
+         fiz2             = _fjsp_setzero_v2r8();
+         fix3             = _fjsp_setzero_v2r8();
+         fiy3             = _fjsp_setzero_v2r8();
+         fiz3             = _fjsp_setzero_v2r8();
+         /* Reset potential sums */
+         velecsum         = _fjsp_setzero_v2r8();
+         /* Start inner kernel loop */
+         for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+         {
+             /* Get j neighbor index, and coordinate index */
+             jnrA             = jjnr[jidx];
+             jnrB             = jjnr[jidx+1];
+             j_coord_offsetA  = DIM*jnrA;
+             j_coord_offsetB  = DIM*jnrB;
+             /* load j atom coordinates */
+             gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                               &jx0,&jy0,&jz0);
+             /* Calculate displacement vector */
+             dx10             = _fjsp_sub_v2r8(ix1,jx0);
+             dy10             = _fjsp_sub_v2r8(iy1,jy0);
+             dz10             = _fjsp_sub_v2r8(iz1,jz0);
+             dx20             = _fjsp_sub_v2r8(ix2,jx0);
+             dy20             = _fjsp_sub_v2r8(iy2,jy0);
+             dz20             = _fjsp_sub_v2r8(iz2,jz0);
+             dx30             = _fjsp_sub_v2r8(ix3,jx0);
+             dy30             = _fjsp_sub_v2r8(iy3,jy0);
+             dz30             = _fjsp_sub_v2r8(iz3,jz0);
+             /* Calculate squared distance and things based on it */
+             rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+             rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+             rsq30            = gmx_fjsp_calc_rsq_v2r8(dx30,dy30,dz30);
+             rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+             rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+             rinv30           = gmx_fjsp_invsqrt_v2r8(rsq30);
+             rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+             rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+             rinvsq30         = _fjsp_mul_v2r8(rinv30,rinv30);
+             /* Load parameters for j particles */
+             jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+             fjx0             = _fjsp_setzero_v2r8();
+             fjy0             = _fjsp_setzero_v2r8();
+             fjz0             = _fjsp_setzero_v2r8();
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq10,rcutoff2))
+             {
+             r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+             /* Compute parameters for interactions between i and j atoms */
+             qq10             = _fjsp_mul_v2r8(iq1,jq0);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r10,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq10,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv10,sh_ewald),velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,rinv10),_fjsp_sub_v2r8(rinvsq10,felec));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq10,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+             
+             fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq20,rcutoff2))
+             {
+             r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+             /* Compute parameters for interactions between i and j atoms */
+             qq20             = _fjsp_mul_v2r8(iq2,jq0);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r20,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq20,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv20,sh_ewald),velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,rinv20),_fjsp_sub_v2r8(rinvsq20,felec));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq20,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+             
+             fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq30,rcutoff2))
+             {
+             r30              = _fjsp_mul_v2r8(rsq30,rinv30);
+             /* Compute parameters for interactions between i and j atoms */
+             qq30             = _fjsp_mul_v2r8(iq3,jq0);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r30,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq30,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv30,sh_ewald),velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq30,rinv30),_fjsp_sub_v2r8(rinvsq30,felec));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq30,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix3             = _fjsp_madd_v2r8(dx30,fscal,fix3);
+             fiy3             = _fjsp_madd_v2r8(dy30,fscal,fiy3);
+             fiz3             = _fjsp_madd_v2r8(dz30,fscal,fiz3);
+             
+             fjx0             = _fjsp_madd_v2r8(dx30,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy30,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz30,fscal,fjz0);
+             }
+             gmx_fjsp_decrement_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0);
+             /* Inner loop uses 150 flops */
+         }
+         if(jidx<j_index_end)
+         {
+             jnrA             = jjnr[jidx];
+             j_coord_offsetA  = DIM*jnrA;
+             /* load j atom coordinates */
+             gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                               &jx0,&jy0,&jz0);
+             /* Calculate displacement vector */
+             dx10             = _fjsp_sub_v2r8(ix1,jx0);
+             dy10             = _fjsp_sub_v2r8(iy1,jy0);
+             dz10             = _fjsp_sub_v2r8(iz1,jz0);
+             dx20             = _fjsp_sub_v2r8(ix2,jx0);
+             dy20             = _fjsp_sub_v2r8(iy2,jy0);
+             dz20             = _fjsp_sub_v2r8(iz2,jz0);
+             dx30             = _fjsp_sub_v2r8(ix3,jx0);
+             dy30             = _fjsp_sub_v2r8(iy3,jy0);
+             dz30             = _fjsp_sub_v2r8(iz3,jz0);
+             /* Calculate squared distance and things based on it */
+             rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+             rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+             rsq30            = gmx_fjsp_calc_rsq_v2r8(dx30,dy30,dz30);
+             rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+             rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+             rinv30           = gmx_fjsp_invsqrt_v2r8(rsq30);
+             rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+             rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+             rinvsq30         = _fjsp_mul_v2r8(rinv30,rinv30);
+             /* Load parameters for j particles */
+             jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
+             fjx0             = _fjsp_setzero_v2r8();
+             fjy0             = _fjsp_setzero_v2r8();
+             fjz0             = _fjsp_setzero_v2r8();
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq10,rcutoff2))
+             {
+             r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+             /* Compute parameters for interactions between i and j atoms */
+             qq10             = _fjsp_mul_v2r8(iq1,jq0);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r10,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq10,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv10,sh_ewald),velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,rinv10),_fjsp_sub_v2r8(rinvsq10,felec));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq10,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+             
+             fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq20,rcutoff2))
+             {
+             r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+             /* Compute parameters for interactions between i and j atoms */
+             qq20             = _fjsp_mul_v2r8(iq2,jq0);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r20,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq20,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv20,sh_ewald),velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,rinv20),_fjsp_sub_v2r8(rinvsq20,felec));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq20,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+             
+             fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq30,rcutoff2))
+             {
+             r30              = _fjsp_mul_v2r8(rsq30,rinv30);
+             /* Compute parameters for interactions between i and j atoms */
+             qq30             = _fjsp_mul_v2r8(iq3,jq0);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r30,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq30,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv30,sh_ewald),velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq30,rinv30),_fjsp_sub_v2r8(rinvsq30,felec));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq30,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix3             = _fjsp_madd_v2r8(dx30,fscal,fix3);
+             fiy3             = _fjsp_madd_v2r8(dy30,fscal,fiy3);
+             fiz3             = _fjsp_madd_v2r8(dz30,fscal,fiz3);
+             
+             fjx0             = _fjsp_madd_v2r8(dx30,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy30,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz30,fscal,fjz0);
+             }
+             gmx_fjsp_decrement_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0);
+             /* Inner loop uses 150 flops */
+         }
+         /* End of innermost loop */
+         gmx_fjsp_update_iforce_3atom_swizzle_v2r8(fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                               f+i_coord_offset+DIM,fshift+i_shift_offset);
+         ggid                        = gid[iidx];
+         /* Update potential energies */
+         gmx_fjsp_update_1pot_v2r8(velecsum,kernel_data->energygrp_elec+ggid);
+         /* Increment number of inner iterations */
+         inneriter                  += j_index_end - j_index_start;
+         /* Outer loop uses 19 flops */
+     }
+     /* Increment number of outer iterations */
+     outeriter        += nri;
+     /* Update outer/inner flops */
+     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4_VF,outeriter*19 + inneriter*150);
+ }
+ /*
+  * Gromacs nonbonded kernel:   nb_kernel_ElecEwSh_VdwNone_GeomW4P1_F_sparc64_hpc_ace_double
+  * Electrostatics interaction: Ewald
+  * VdW interaction:            None
+  * Geometry:                   Water4-Particle
+  * Calculate force/pot:        Force
+  */
+ void
+ nb_kernel_ElecEwSh_VdwNone_GeomW4P1_F_sparc64_hpc_ace_double
+                     (t_nblist * gmx_restrict                nlist,
+                      rvec * gmx_restrict                    xx,
+                      rvec * gmx_restrict                    ff,
+                      t_forcerec * gmx_restrict              fr,
+                      t_mdatoms * gmx_restrict               mdatoms,
+                      nb_kernel_data_t * gmx_restrict        kernel_data,
+                      t_nrnb * gmx_restrict                  nrnb)
+ {
+     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+      * just 0 for non-waters.
+      * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+      * jnr indices corresponding to data put in the two positions in the SIMD register.
+      */
+     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+     int              jnrA,jnrB;
+     int              j_coord_offsetA,j_coord_offsetB;
+     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+     real             rcutoff_scalar;
+     real             *shiftvec,*fshift,*x,*f;
+     _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+     int              vdwioffset1;
+     _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+     int              vdwioffset2;
+     _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+     int              vdwioffset3;
+     _fjsp_v2r8       ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+     int              vdwjidx0A,vdwjidx0B;
+     _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+     _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+     _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+     _fjsp_v2r8       dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30;
+     _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+     real             *charge;
+     _fjsp_v2r8       ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+     real             *ewtab;
+     _fjsp_v2r8       itab_tmp;
+     _fjsp_v2r8       dummy_mask,cutoff_mask;
+     _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+     _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+     union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+     x                = xx[0];
+     f                = ff[0];
+     nri              = nlist->nri;
+     iinr             = nlist->iinr;
+     jindex           = nlist->jindex;
+     jjnr             = nlist->jjnr;
+     shiftidx         = nlist->shift;
+     gid              = nlist->gid;
+     shiftvec         = fr->shift_vec[0];
+     fshift           = fr->fshift[0];
+     facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+     charge           = mdatoms->chargeA;
+     sh_ewald         = gmx_fjsp_set1_v2r8(fr->ic->sh_ewald);
+     ewtab            = fr->ic->tabq_coul_F;
+     ewtabscale       = gmx_fjsp_set1_v2r8(fr->ic->tabq_scale);
+     ewtabhalfspace   = gmx_fjsp_set1_v2r8(0.5/fr->ic->tabq_scale);
+     /* Setup water-specific parameters */
+     inr              = nlist->iinr[0];
+     iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+     iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+     iq3              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+3]));
+     /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+     rcutoff_scalar   = fr->rcoulomb;
+     rcutoff          = gmx_fjsp_set1_v2r8(rcutoff_scalar);
+     rcutoff2         = _fjsp_mul_v2r8(rcutoff,rcutoff);
+     /* Avoid stupid compiler warnings */
+     jnrA = jnrB = 0;
+     j_coord_offsetA = 0;
+     j_coord_offsetB = 0;
+     outeriter        = 0;
+     inneriter        = 0;
+     /* Start outer loop over neighborlists */
+     for(iidx=0; iidx<nri; iidx++)
+     {
+         /* Load shift vector for this list */
+         i_shift_offset   = DIM*shiftidx[iidx];
+         /* Load limits for loop over neighbors */
+         j_index_start    = jindex[iidx];
+         j_index_end      = jindex[iidx+1];
+         /* Get outer coordinate index */
+         inr              = iinr[iidx];
+         i_coord_offset   = DIM*inr;
+         /* Load i particle coords and add shift vector */
+         gmx_fjsp_load_shift_and_3rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset+DIM,
+                                                  &ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+         fix1             = _fjsp_setzero_v2r8();
+         fiy1             = _fjsp_setzero_v2r8();
+         fiz1             = _fjsp_setzero_v2r8();
+         fix2             = _fjsp_setzero_v2r8();
+         fiy2             = _fjsp_setzero_v2r8();
+         fiz2             = _fjsp_setzero_v2r8();
+         fix3             = _fjsp_setzero_v2r8();
+         fiy3             = _fjsp_setzero_v2r8();
+         fiz3             = _fjsp_setzero_v2r8();
+         /* Start inner kernel loop */
+         for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+         {
+             /* Get j neighbor index, and coordinate index */
+             jnrA             = jjnr[jidx];
+             jnrB             = jjnr[jidx+1];
+             j_coord_offsetA  = DIM*jnrA;
+             j_coord_offsetB  = DIM*jnrB;
+             /* load j atom coordinates */
+             gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                               &jx0,&jy0,&jz0);
+             /* Calculate displacement vector */
+             dx10             = _fjsp_sub_v2r8(ix1,jx0);
+             dy10             = _fjsp_sub_v2r8(iy1,jy0);
+             dz10             = _fjsp_sub_v2r8(iz1,jz0);
+             dx20             = _fjsp_sub_v2r8(ix2,jx0);
+             dy20             = _fjsp_sub_v2r8(iy2,jy0);
+             dz20             = _fjsp_sub_v2r8(iz2,jz0);
+             dx30             = _fjsp_sub_v2r8(ix3,jx0);
+             dy30             = _fjsp_sub_v2r8(iy3,jy0);
+             dz30             = _fjsp_sub_v2r8(iz3,jz0);
+             /* Calculate squared distance and things based on it */
+             rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+             rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+             rsq30            = gmx_fjsp_calc_rsq_v2r8(dx30,dy30,dz30);
+             rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+             rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+             rinv30           = gmx_fjsp_invsqrt_v2r8(rsq30);
+             rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+             rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+             rinvsq30         = _fjsp_mul_v2r8(rinv30,rinv30);
+             /* Load parameters for j particles */
+             jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+             fjx0             = _fjsp_setzero_v2r8();
+             fjy0             = _fjsp_setzero_v2r8();
+             fjz0             = _fjsp_setzero_v2r8();
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq10,rcutoff2))
+             {
+             r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+             /* Compute parameters for interactions between i and j atoms */
+             qq10             = _fjsp_mul_v2r8(iq1,jq0);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r10,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                          &ewtabF,&ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,rinv10),_fjsp_sub_v2r8(rinvsq10,felec));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq10,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+             
+             fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq20,rcutoff2))
+             {
+             r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+             /* Compute parameters for interactions between i and j atoms */
+             qq20             = _fjsp_mul_v2r8(iq2,jq0);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r20,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                          &ewtabF,&ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,rinv20),_fjsp_sub_v2r8(rinvsq20,felec));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq20,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+             
+             fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq30,rcutoff2))
+             {
+             r30              = _fjsp_mul_v2r8(rsq30,rinv30);
+             /* Compute parameters for interactions between i and j atoms */
+             qq30             = _fjsp_mul_v2r8(iq3,jq0);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r30,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                          &ewtabF,&ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq30,rinv30),_fjsp_sub_v2r8(rinvsq30,felec));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq30,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix3             = _fjsp_madd_v2r8(dx30,fscal,fix3);
+             fiy3             = _fjsp_madd_v2r8(dy30,fscal,fiy3);
+             fiz3             = _fjsp_madd_v2r8(dz30,fscal,fiz3);
+             
+             fjx0             = _fjsp_madd_v2r8(dx30,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy30,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz30,fscal,fjz0);
+             }
+             gmx_fjsp_decrement_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0);
+             /* Inner loop uses 129 flops */
+         }
+         if(jidx<j_index_end)
+         {
+             jnrA             = jjnr[jidx];
+             j_coord_offsetA  = DIM*jnrA;
+             /* load j atom coordinates */
+             gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                               &jx0,&jy0,&jz0);
+             /* Calculate displacement vector */
+             dx10             = _fjsp_sub_v2r8(ix1,jx0);
+             dy10             = _fjsp_sub_v2r8(iy1,jy0);
+             dz10             = _fjsp_sub_v2r8(iz1,jz0);
+             dx20             = _fjsp_sub_v2r8(ix2,jx0);
+             dy20             = _fjsp_sub_v2r8(iy2,jy0);
+             dz20             = _fjsp_sub_v2r8(iz2,jz0);
+             dx30             = _fjsp_sub_v2r8(ix3,jx0);
+             dy30             = _fjsp_sub_v2r8(iy3,jy0);
+             dz30             = _fjsp_sub_v2r8(iz3,jz0);
+             /* Calculate squared distance and things based on it */
+             rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+             rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+             rsq30            = gmx_fjsp_calc_rsq_v2r8(dx30,dy30,dz30);
+             rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+             rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+             rinv30           = gmx_fjsp_invsqrt_v2r8(rsq30);
+             rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+             rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+             rinvsq30         = _fjsp_mul_v2r8(rinv30,rinv30);
+             /* Load parameters for j particles */
+             jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
+             fjx0             = _fjsp_setzero_v2r8();
+             fjy0             = _fjsp_setzero_v2r8();
+             fjz0             = _fjsp_setzero_v2r8();
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq10,rcutoff2))
+             {
+             r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+             /* Compute parameters for interactions between i and j atoms */
+             qq10             = _fjsp_mul_v2r8(iq1,jq0);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r10,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,rinv10),_fjsp_sub_v2r8(rinvsq10,felec));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq10,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+             
+             fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq20,rcutoff2))
+             {
+             r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+             /* Compute parameters for interactions between i and j atoms */
+             qq20             = _fjsp_mul_v2r8(iq2,jq0);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r20,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,rinv20),_fjsp_sub_v2r8(rinvsq20,felec));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq20,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+             
+             fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq30,rcutoff2))
+             {
+             r30              = _fjsp_mul_v2r8(rsq30,rinv30);
+             /* Compute parameters for interactions between i and j atoms */
+             qq30             = _fjsp_mul_v2r8(iq3,jq0);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r30,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq30,rinv30),_fjsp_sub_v2r8(rinvsq30,felec));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq30,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix3             = _fjsp_madd_v2r8(dx30,fscal,fix3);
+             fiy3             = _fjsp_madd_v2r8(dy30,fscal,fiy3);
+             fiz3             = _fjsp_madd_v2r8(dz30,fscal,fiz3);
+             
+             fjx0             = _fjsp_madd_v2r8(dx30,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy30,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz30,fscal,fjz0);
+             }
+             gmx_fjsp_decrement_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0);
+             /* Inner loop uses 129 flops */
+         }
+         /* End of innermost loop */
+         gmx_fjsp_update_iforce_3atom_swizzle_v2r8(fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                               f+i_coord_offset+DIM,fshift+i_shift_offset);
+         /* Increment number of inner iterations */
+         inneriter                  += j_index_end - j_index_start;
+         /* Outer loop uses 18 flops */
+     }
+     /* Increment number of outer iterations */
+     outeriter        += nri;
+     /* Update outer/inner flops */
+     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4_F,outeriter*18 + inneriter*129);
+ }
index 0000000000000000000000000000000000000000,b89e62bf11da3d44df8a139204cfc3482ac00b22..b89e62bf11da3d44df8a139204cfc3482ac00b22
mode 000000,100644..100644
--- /dev/null
@@@ -1,0 -1,2341 +1,2341 @@@
+ /*
+  * This file is part of the GROMACS molecular simulation package.
+  *
+  * Copyright (c) 2012, by the GROMACS development team, led by
+  * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+  * others, as listed in the AUTHORS file in the top-level source
+  * directory and at http://www.gromacs.org.
+  *
+  * GROMACS is free software; you can redistribute it and/or
+  * modify it under the terms of the GNU Lesser General Public License
+  * as published by the Free Software Foundation; either version 2.1
+  * of the License, or (at your option) any later version.
+  *
+  * GROMACS is distributed in the hope that it will be useful,
+  * but WITHOUT ANY WARRANTY; without even the implied warranty of
+  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  * Lesser General Public License for more details.
+  *
+  * You should have received a copy of the GNU Lesser General Public
+  * License along with GROMACS; if not, see
+  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+  *
+  * If you want to redistribute modifications to GROMACS, please
+  * consider that scientific software is very special. Version
+  * control is crucial - bugs must be traceable. We will be happy to
+  * consider code for inclusion in the official distribution, but
+  * derived work must not be called official GROMACS. Details are found
+  * in the README & COPYING files - if they are missing, get the
+  * official version at http://www.gromacs.org.
+  *
+  * To help us fund GROMACS development, we humbly ask that you cite
+  * the research papers on the package. Check out http://www.gromacs.org.
+  */
+ /*
+  * Note: this file was generated by the GROMACS sparc64_hpc_ace_double kernel generator.
+  */
+ #ifdef HAVE_CONFIG_H
+ #include <config.h>
+ #endif
+ #include <math.h>
+ #include "../nb_kernel.h"
+ #include "types/simple.h"
+ #include "vec.h"
+ #include "nrnb.h"
+ #include "kernelutil_sparc64_hpc_ace_double.h"
+ /*
+  * Gromacs nonbonded kernel:   nb_kernel_ElecEwSh_VdwNone_GeomW4W4_VF_sparc64_hpc_ace_double
+  * Electrostatics interaction: Ewald
+  * VdW interaction:            None
+  * Geometry:                   Water4-Water4
+  * Calculate force/pot:        PotentialAndForce
+  */
+ void
+ nb_kernel_ElecEwSh_VdwNone_GeomW4W4_VF_sparc64_hpc_ace_double
+                     (t_nblist * gmx_restrict                nlist,
+                      rvec * gmx_restrict                    xx,
+                      rvec * gmx_restrict                    ff,
+                      t_forcerec * gmx_restrict              fr,
+                      t_mdatoms * gmx_restrict               mdatoms,
+                      nb_kernel_data_t * gmx_restrict        kernel_data,
+                      t_nrnb * gmx_restrict                  nrnb)
+ {
+     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+      * just 0 for non-waters.
+      * Suffixes A,B refer to j loop unrolling done with double precision SIMD, i.e. the two different
+      * jnr indices corresponding to data put in the two positions in the SIMD register.
+      */
+     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+     int              jnrA,jnrB;
+     int              j_coord_offsetA,j_coord_offsetB;
+     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+     real             rcutoff_scalar;
+     real             *shiftvec,*fshift,*x,*f;
+     _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+     int              vdwioffset1;
+     _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+     int              vdwioffset2;
+     _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+     int              vdwioffset3;
+     _fjsp_v2r8       ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+     int              vdwjidx1A,vdwjidx1B;
+     _fjsp_v2r8       jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+     int              vdwjidx2A,vdwjidx2B;
+     _fjsp_v2r8       jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+     int              vdwjidx3A,vdwjidx3B;
+     _fjsp_v2r8       jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
+     _fjsp_v2r8       dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+     _fjsp_v2r8       dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+     _fjsp_v2r8       dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
+     _fjsp_v2r8       dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+     _fjsp_v2r8       dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+     _fjsp_v2r8       dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
+     _fjsp_v2r8       dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
+     _fjsp_v2r8       dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
+     _fjsp_v2r8       dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
+     _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+     real             *charge;
+     _fjsp_v2r8       ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+     real             *ewtab;
+     _fjsp_v2r8       itab_tmp;
+     _fjsp_v2r8       dummy_mask,cutoff_mask;
+     _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+     _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+     union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+     x                = xx[0];
+     f                = ff[0];
+     nri              = nlist->nri;
+     iinr             = nlist->iinr;
+     jindex           = nlist->jindex;
+     jjnr             = nlist->jjnr;
+     shiftidx         = nlist->shift;
+     gid              = nlist->gid;
+     shiftvec         = fr->shift_vec[0];
+     fshift           = fr->fshift[0];
+     facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+     charge           = mdatoms->chargeA;
+     sh_ewald         = gmx_fjsp_set1_v2r8(fr->ic->sh_ewald);
+     ewtab            = fr->ic->tabq_coul_FDV0;
+     ewtabscale       = gmx_fjsp_set1_v2r8(fr->ic->tabq_scale);
+     ewtabhalfspace   = gmx_fjsp_set1_v2r8(0.5/fr->ic->tabq_scale);
+     /* Setup water-specific parameters */
+     inr              = nlist->iinr[0];
+     iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+     iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+     iq3              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+3]));
+     jq1              = gmx_fjsp_set1_v2r8(charge[inr+1]);
+     jq2              = gmx_fjsp_set1_v2r8(charge[inr+2]);
+     jq3              = gmx_fjsp_set1_v2r8(charge[inr+3]);
+     qq11             = _fjsp_mul_v2r8(iq1,jq1);
+     qq12             = _fjsp_mul_v2r8(iq1,jq2);
+     qq13             = _fjsp_mul_v2r8(iq1,jq3);
+     qq21             = _fjsp_mul_v2r8(iq2,jq1);
+     qq22             = _fjsp_mul_v2r8(iq2,jq2);
+     qq23             = _fjsp_mul_v2r8(iq2,jq3);
+     qq31             = _fjsp_mul_v2r8(iq3,jq1);
+     qq32             = _fjsp_mul_v2r8(iq3,jq2);
+     qq33             = _fjsp_mul_v2r8(iq3,jq3);
+     /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+     rcutoff_scalar   = fr->rcoulomb;
+     rcutoff          = gmx_fjsp_set1_v2r8(rcutoff_scalar);
+     rcutoff2         = _fjsp_mul_v2r8(rcutoff,rcutoff);
+     /* Avoid stupid compiler warnings */
+     jnrA = jnrB = 0;
+     j_coord_offsetA = 0;
+     j_coord_offsetB = 0;
+     outeriter        = 0;
+     inneriter        = 0;
+     /* Start outer loop over neighborlists */
+     for(iidx=0; iidx<nri; iidx++)
+     {
+         /* Load shift vector for this list */
+         i_shift_offset   = DIM*shiftidx[iidx];
+         /* Load limits for loop over neighbors */
+         j_index_start    = jindex[iidx];
+         j_index_end      = jindex[iidx+1];
+         /* Get outer coordinate index */
+         inr              = iinr[iidx];
+         i_coord_offset   = DIM*inr;
+         /* Load i particle coords and add shift vector */
+         gmx_fjsp_load_shift_and_3rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset+DIM,
+                                                  &ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+         fix1             = _fjsp_setzero_v2r8();
+         fiy1             = _fjsp_setzero_v2r8();
+         fiz1             = _fjsp_setzero_v2r8();
+         fix2             = _fjsp_setzero_v2r8();
+         fiy2             = _fjsp_setzero_v2r8();
+         fiz2             = _fjsp_setzero_v2r8();
+         fix3             = _fjsp_setzero_v2r8();
+         fiy3             = _fjsp_setzero_v2r8();
+         fiz3             = _fjsp_setzero_v2r8();
+         /* Reset potential sums */
+         velecsum         = _fjsp_setzero_v2r8();
+         /* Start inner kernel loop */
+         for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+         {
+             /* Get j neighbor index, and coordinate index */
+             jnrA             = jjnr[jidx];
+             jnrB             = jjnr[jidx+1];
+             j_coord_offsetA  = DIM*jnrA;
+             j_coord_offsetB  = DIM*jnrB;
+             /* load j atom coordinates */
+             gmx_fjsp_load_3rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA+DIM,x+j_coord_offsetB+DIM,
+                                               &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
+             /* Calculate displacement vector */
+             dx11             = _fjsp_sub_v2r8(ix1,jx1);
+             dy11             = _fjsp_sub_v2r8(iy1,jy1);
+             dz11             = _fjsp_sub_v2r8(iz1,jz1);
+             dx12             = _fjsp_sub_v2r8(ix1,jx2);
+             dy12             = _fjsp_sub_v2r8(iy1,jy2);
+             dz12             = _fjsp_sub_v2r8(iz1,jz2);
+             dx13             = _fjsp_sub_v2r8(ix1,jx3);
+             dy13             = _fjsp_sub_v2r8(iy1,jy3);
+             dz13             = _fjsp_sub_v2r8(iz1,jz3);
+             dx21             = _fjsp_sub_v2r8(ix2,jx1);
+             dy21             = _fjsp_sub_v2r8(iy2,jy1);
+             dz21             = _fjsp_sub_v2r8(iz2,jz1);
+             dx22             = _fjsp_sub_v2r8(ix2,jx2);
+             dy22             = _fjsp_sub_v2r8(iy2,jy2);
+             dz22             = _fjsp_sub_v2r8(iz2,jz2);
+             dx23             = _fjsp_sub_v2r8(ix2,jx3);
+             dy23             = _fjsp_sub_v2r8(iy2,jy3);
+             dz23             = _fjsp_sub_v2r8(iz2,jz3);
+             dx31             = _fjsp_sub_v2r8(ix3,jx1);
+             dy31             = _fjsp_sub_v2r8(iy3,jy1);
+             dz31             = _fjsp_sub_v2r8(iz3,jz1);
+             dx32             = _fjsp_sub_v2r8(ix3,jx2);
+             dy32             = _fjsp_sub_v2r8(iy3,jy2);
+             dz32             = _fjsp_sub_v2r8(iz3,jz2);
+             dx33             = _fjsp_sub_v2r8(ix3,jx3);
+             dy33             = _fjsp_sub_v2r8(iy3,jy3);
+             dz33             = _fjsp_sub_v2r8(iz3,jz3);
+             /* Calculate squared distance and things based on it */
+             rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+             rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+             rsq13            = gmx_fjsp_calc_rsq_v2r8(dx13,dy13,dz13);
+             rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+             rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+             rsq23            = gmx_fjsp_calc_rsq_v2r8(dx23,dy23,dz23);
+             rsq31            = gmx_fjsp_calc_rsq_v2r8(dx31,dy31,dz31);
+             rsq32            = gmx_fjsp_calc_rsq_v2r8(dx32,dy32,dz32);
+             rsq33            = gmx_fjsp_calc_rsq_v2r8(dx33,dy33,dz33);
+             rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+             rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+             rinv13           = gmx_fjsp_invsqrt_v2r8(rsq13);
+             rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+             rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+             rinv23           = gmx_fjsp_invsqrt_v2r8(rsq23);
+             rinv31           = gmx_fjsp_invsqrt_v2r8(rsq31);
+             rinv32           = gmx_fjsp_invsqrt_v2r8(rsq32);
+             rinv33           = gmx_fjsp_invsqrt_v2r8(rsq33);
+             rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+             rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+             rinvsq13         = _fjsp_mul_v2r8(rinv13,rinv13);
+             rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+             rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+             rinvsq23         = _fjsp_mul_v2r8(rinv23,rinv23);
+             rinvsq31         = _fjsp_mul_v2r8(rinv31,rinv31);
+             rinvsq32         = _fjsp_mul_v2r8(rinv32,rinv32);
+             rinvsq33         = _fjsp_mul_v2r8(rinv33,rinv33);
+             fjx1             = _fjsp_setzero_v2r8();
+             fjy1             = _fjsp_setzero_v2r8();
+             fjz1             = _fjsp_setzero_v2r8();
+             fjx2             = _fjsp_setzero_v2r8();
+             fjy2             = _fjsp_setzero_v2r8();
+             fjz2             = _fjsp_setzero_v2r8();
+             fjx3             = _fjsp_setzero_v2r8();
+             fjy3             = _fjsp_setzero_v2r8();
+             fjz3             = _fjsp_setzero_v2r8();
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq11,rcutoff2))
+             {
+             r11              = _fjsp_mul_v2r8(rsq11,rinv11);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r11,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq11,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv11,sh_ewald),velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq11,rinv11),_fjsp_sub_v2r8(rinvsq11,felec));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq11,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+             
+             fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq12,rcutoff2))
+             {
+             r12              = _fjsp_mul_v2r8(rsq12,rinv12);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r12,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq12,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv12,sh_ewald),velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq12,rinv12),_fjsp_sub_v2r8(rinvsq12,felec));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq12,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+             
+             fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq13,rcutoff2))
+             {
+             r13              = _fjsp_mul_v2r8(rsq13,rinv13);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r13,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq13,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv13,sh_ewald),velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq13,rinv13),_fjsp_sub_v2r8(rinvsq13,felec));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq13,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx13,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy13,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz13,fscal,fiz1);
+             
+             fjx3             = _fjsp_madd_v2r8(dx13,fscal,fjx3);
+             fjy3             = _fjsp_madd_v2r8(dy13,fscal,fjy3);
+             fjz3             = _fjsp_madd_v2r8(dz13,fscal,fjz3);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq21,rcutoff2))
+             {
+             r21              = _fjsp_mul_v2r8(rsq21,rinv21);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r21,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq21,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv21,sh_ewald),velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq21,rinv21),_fjsp_sub_v2r8(rinvsq21,felec));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq21,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+             
+             fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq22,rcutoff2))
+             {
+             r22              = _fjsp_mul_v2r8(rsq22,rinv22);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r22,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq22,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv22,sh_ewald),velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq22,rinv22),_fjsp_sub_v2r8(rinvsq22,felec));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq22,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+             
+             fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq23,rcutoff2))
+             {
+             r23              = _fjsp_mul_v2r8(rsq23,rinv23);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r23,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq23,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv23,sh_ewald),velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq23,rinv23),_fjsp_sub_v2r8(rinvsq23,felec));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq23,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx23,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy23,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz23,fscal,fiz2);
+             
+             fjx3             = _fjsp_madd_v2r8(dx23,fscal,fjx3);
+             fjy3             = _fjsp_madd_v2r8(dy23,fscal,fjy3);
+             fjz3             = _fjsp_madd_v2r8(dz23,fscal,fjz3);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq31,rcutoff2))
+             {
+             r31              = _fjsp_mul_v2r8(rsq31,rinv31);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r31,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq31,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv31,sh_ewald),velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq31,rinv31),_fjsp_sub_v2r8(rinvsq31,felec));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq31,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix3             = _fjsp_madd_v2r8(dx31,fscal,fix3);
+             fiy3             = _fjsp_madd_v2r8(dy31,fscal,fiy3);
+             fiz3             = _fjsp_madd_v2r8(dz31,fscal,fiz3);
+             
+             fjx1             = _fjsp_madd_v2r8(dx31,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy31,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz31,fscal,fjz1);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq32,rcutoff2))
+             {
+             r32              = _fjsp_mul_v2r8(rsq32,rinv32);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r32,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq32,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv32,sh_ewald),velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq32,rinv32),_fjsp_sub_v2r8(rinvsq32,felec));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq32,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix3             = _fjsp_madd_v2r8(dx32,fscal,fix3);
+             fiy3             = _fjsp_madd_v2r8(dy32,fscal,fiy3);
+             fiz3             = _fjsp_madd_v2r8(dz32,fscal,fiz3);
+             
+             fjx2             = _fjsp_madd_v2r8(dx32,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy32,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz32,fscal,fjz2);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq33,rcutoff2))
+             {
+             r33              = _fjsp_mul_v2r8(rsq33,rinv33);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r33,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq33,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv33,sh_ewald),velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq33,rinv33),_fjsp_sub_v2r8(rinvsq33,felec));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq33,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix3             = _fjsp_madd_v2r8(dx33,fscal,fix3);
+             fiy3             = _fjsp_madd_v2r8(dy33,fscal,fiy3);
+             fiz3             = _fjsp_madd_v2r8(dz33,fscal,fiz3);
+             
+             fjx3             = _fjsp_madd_v2r8(dx33,fscal,fjx3);
+             fjy3             = _fjsp_madd_v2r8(dy33,fscal,fjy3);
+             fjz3             = _fjsp_madd_v2r8(dz33,fscal,fjz3);
+             }
+             gmx_fjsp_decrement_3rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA+DIM,f+j_coord_offsetB+DIM,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+             /* Inner loop uses 441 flops */
+         }
+         if(jidx<j_index_end)
+         {
+             jnrA             = jjnr[jidx];
+             j_coord_offsetA  = DIM*jnrA;
+             /* load j atom coordinates */
+             gmx_fjsp_load_3rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA+DIM,
+                                               &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
+             /* Calculate displacement vector */
+             dx11             = _fjsp_sub_v2r8(ix1,jx1);
+             dy11             = _fjsp_sub_v2r8(iy1,jy1);
+             dz11             = _fjsp_sub_v2r8(iz1,jz1);
+             dx12             = _fjsp_sub_v2r8(ix1,jx2);
+             dy12             = _fjsp_sub_v2r8(iy1,jy2);
+             dz12             = _fjsp_sub_v2r8(iz1,jz2);
+             dx13             = _fjsp_sub_v2r8(ix1,jx3);
+             dy13             = _fjsp_sub_v2r8(iy1,jy3);
+             dz13             = _fjsp_sub_v2r8(iz1,jz3);
+             dx21             = _fjsp_sub_v2r8(ix2,jx1);
+             dy21             = _fjsp_sub_v2r8(iy2,jy1);
+             dz21             = _fjsp_sub_v2r8(iz2,jz1);
+             dx22             = _fjsp_sub_v2r8(ix2,jx2);
+             dy22             = _fjsp_sub_v2r8(iy2,jy2);
+             dz22             = _fjsp_sub_v2r8(iz2,jz2);
+             dx23             = _fjsp_sub_v2r8(ix2,jx3);
+             dy23             = _fjsp_sub_v2r8(iy2,jy3);
+             dz23             = _fjsp_sub_v2r8(iz2,jz3);
+             dx31             = _fjsp_sub_v2r8(ix3,jx1);
+             dy31             = _fjsp_sub_v2r8(iy3,jy1);
+             dz31             = _fjsp_sub_v2r8(iz3,jz1);
+             dx32             = _fjsp_sub_v2r8(ix3,jx2);
+             dy32             = _fjsp_sub_v2r8(iy3,jy2);
+             dz32             = _fjsp_sub_v2r8(iz3,jz2);
+             dx33             = _fjsp_sub_v2r8(ix3,jx3);
+             dy33             = _fjsp_sub_v2r8(iy3,jy3);
+             dz33             = _fjsp_sub_v2r8(iz3,jz3);
+             /* Calculate squared distance and things based on it */
+             rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+             rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+             rsq13            = gmx_fjsp_calc_rsq_v2r8(dx13,dy13,dz13);
+             rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+             rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+             rsq23            = gmx_fjsp_calc_rsq_v2r8(dx23,dy23,dz23);
+             rsq31            = gmx_fjsp_calc_rsq_v2r8(dx31,dy31,dz31);
+             rsq32            = gmx_fjsp_calc_rsq_v2r8(dx32,dy32,dz32);
+             rsq33            = gmx_fjsp_calc_rsq_v2r8(dx33,dy33,dz33);
+             rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+             rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+             rinv13           = gmx_fjsp_invsqrt_v2r8(rsq13);
+             rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+             rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+             rinv23           = gmx_fjsp_invsqrt_v2r8(rsq23);
+             rinv31           = gmx_fjsp_invsqrt_v2r8(rsq31);
+             rinv32           = gmx_fjsp_invsqrt_v2r8(rsq32);
+             rinv33           = gmx_fjsp_invsqrt_v2r8(rsq33);
+             rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+             rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+             rinvsq13         = _fjsp_mul_v2r8(rinv13,rinv13);
+             rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+             rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+             rinvsq23         = _fjsp_mul_v2r8(rinv23,rinv23);
+             rinvsq31         = _fjsp_mul_v2r8(rinv31,rinv31);
+             rinvsq32         = _fjsp_mul_v2r8(rinv32,rinv32);
+             rinvsq33         = _fjsp_mul_v2r8(rinv33,rinv33);
+             fjx1             = _fjsp_setzero_v2r8();
+             fjy1             = _fjsp_setzero_v2r8();
+             fjz1             = _fjsp_setzero_v2r8();
+             fjx2             = _fjsp_setzero_v2r8();
+             fjy2             = _fjsp_setzero_v2r8();
+             fjz2             = _fjsp_setzero_v2r8();
+             fjx3             = _fjsp_setzero_v2r8();
+             fjy3             = _fjsp_setzero_v2r8();
+             fjz3             = _fjsp_setzero_v2r8();
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq11,rcutoff2))
+             {
+             r11              = _fjsp_mul_v2r8(rsq11,rinv11);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r11,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq11,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv11,sh_ewald),velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq11,rinv11),_fjsp_sub_v2r8(rinvsq11,felec));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq11,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+             
+             fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq12,rcutoff2))
+             {
+             r12              = _fjsp_mul_v2r8(rsq12,rinv12);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r12,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq12,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv12,sh_ewald),velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq12,rinv12),_fjsp_sub_v2r8(rinvsq12,felec));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq12,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+             
+             fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq13,rcutoff2))
+             {
+             r13              = _fjsp_mul_v2r8(rsq13,rinv13);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r13,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq13,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv13,sh_ewald),velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq13,rinv13),_fjsp_sub_v2r8(rinvsq13,felec));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq13,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx13,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy13,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz13,fscal,fiz1);
+             
+             fjx3             = _fjsp_madd_v2r8(dx13,fscal,fjx3);
+             fjy3             = _fjsp_madd_v2r8(dy13,fscal,fjy3);
+             fjz3             = _fjsp_madd_v2r8(dz13,fscal,fjz3);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq21,rcutoff2))
+             {
+             r21              = _fjsp_mul_v2r8(rsq21,rinv21);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r21,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq21,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv21,sh_ewald),velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq21,rinv21),_fjsp_sub_v2r8(rinvsq21,felec));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq21,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+             
+             fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq22,rcutoff2))
+             {
+             r22              = _fjsp_mul_v2r8(rsq22,rinv22);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r22,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq22,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv22,sh_ewald),velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq22,rinv22),_fjsp_sub_v2r8(rinvsq22,felec));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq22,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+             
+             fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq23,rcutoff2))
+             {
+             r23              = _fjsp_mul_v2r8(rsq23,rinv23);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r23,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq23,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv23,sh_ewald),velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq23,rinv23),_fjsp_sub_v2r8(rinvsq23,felec));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq23,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx23,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy23,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz23,fscal,fiz2);
+             
+             fjx3             = _fjsp_madd_v2r8(dx23,fscal,fjx3);
+             fjy3             = _fjsp_madd_v2r8(dy23,fscal,fjy3);
+             fjz3             = _fjsp_madd_v2r8(dz23,fscal,fjz3);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq31,rcutoff2))
+             {
+             r31              = _fjsp_mul_v2r8(rsq31,rinv31);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r31,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq31,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv31,sh_ewald),velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq31,rinv31),_fjsp_sub_v2r8(rinvsq31,felec));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq31,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix3             = _fjsp_madd_v2r8(dx31,fscal,fix3);
+             fiy3             = _fjsp_madd_v2r8(dy31,fscal,fiy3);
+             fiz3             = _fjsp_madd_v2r8(dz31,fscal,fiz3);
+             
+             fjx1             = _fjsp_madd_v2r8(dx31,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy31,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz31,fscal,fjz1);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq32,rcutoff2))
+             {
+             r32              = _fjsp_mul_v2r8(rsq32,rinv32);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r32,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq32,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv32,sh_ewald),velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq32,rinv32),_fjsp_sub_v2r8(rinvsq32,felec));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq32,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix3             = _fjsp_madd_v2r8(dx32,fscal,fix3);
+             fiy3             = _fjsp_madd_v2r8(dy32,fscal,fiy3);
+             fiz3             = _fjsp_madd_v2r8(dz32,fscal,fiz3);
+             
+             fjx2             = _fjsp_madd_v2r8(dx32,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy32,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz32,fscal,fjz2);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq33,rcutoff2))
+             {
+             r33              = _fjsp_mul_v2r8(rsq33,rinv33);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r33,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq33,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv33,sh_ewald),velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq33,rinv33),_fjsp_sub_v2r8(rinvsq33,felec));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq33,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix3             = _fjsp_madd_v2r8(dx33,fscal,fix3);
+             fiy3             = _fjsp_madd_v2r8(dy33,fscal,fiy3);
+             fiz3             = _fjsp_madd_v2r8(dz33,fscal,fiz3);
+             
+             fjx3             = _fjsp_madd_v2r8(dx33,fscal,fjx3);
+             fjy3             = _fjsp_madd_v2r8(dy33,fscal,fjy3);
+             fjz3             = _fjsp_madd_v2r8(dz33,fscal,fjz3);
+             }
+             gmx_fjsp_decrement_3rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA+DIM,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+             /* Inner loop uses 441 flops */
+         }
+         /* End of innermost loop */
+         gmx_fjsp_update_iforce_3atom_swizzle_v2r8(fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                               f+i_coord_offset+DIM,fshift+i_shift_offset);
+         ggid                        = gid[iidx];
+         /* Update potential energies */
+         gmx_fjsp_update_1pot_v2r8(velecsum,kernel_data->energygrp_elec+ggid);
+         /* Increment number of inner iterations */
+         inneriter                  += j_index_end - j_index_start;
+         /* Outer loop uses 19 flops */
+     }
+     /* Increment number of outer iterations */
+     outeriter        += nri;
+     /* Update outer/inner flops */
+     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4W4_VF,outeriter*19 + inneriter*441);
+ }
+ /*
+  * Gromacs nonbonded kernel:   nb_kernel_ElecEwSh_VdwNone_GeomW4W4_F_sparc64_hpc_ace_double
+  * Electrostatics interaction: Ewald
+  * VdW interaction:            None
+  * Geometry:                   Water4-Water4
+  * Calculate force/pot:        Force
+  */
+ void
+ nb_kernel_ElecEwSh_VdwNone_GeomW4W4_F_sparc64_hpc_ace_double
+                     (t_nblist * gmx_restrict                nlist,
+                      rvec * gmx_restrict                    xx,
+                      rvec * gmx_restrict                    ff,
+                      t_forcerec * gmx_restrict              fr,
+                      t_mdatoms * gmx_restrict               mdatoms,
+                      nb_kernel_data_t * gmx_restrict        kernel_data,
+                      t_nrnb * gmx_restrict                  nrnb)
+ {
+     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+      * just 0 for non-waters.
+      * Suffixes A,B refer to j loop unrolling done with double precision SIMD, i.e. the two different
+      * jnr indices corresponding to data put in the two positions in the SIMD register.
+      */
+     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+     int              jnrA,jnrB;
+     int              j_coord_offsetA,j_coord_offsetB;
+     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+     real             rcutoff_scalar;
+     real             *shiftvec,*fshift,*x,*f;
+     _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+     int              vdwioffset1;
+     _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+     int              vdwioffset2;
+     _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+     int              vdwioffset3;
+     _fjsp_v2r8       ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+     int              vdwjidx1A,vdwjidx1B;
+     _fjsp_v2r8       jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+     int              vdwjidx2A,vdwjidx2B;
+     _fjsp_v2r8       jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+     int              vdwjidx3A,vdwjidx3B;
+     _fjsp_v2r8       jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
+     _fjsp_v2r8       dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+     _fjsp_v2r8       dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+     _fjsp_v2r8       dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
+     _fjsp_v2r8       dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+     _fjsp_v2r8       dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+     _fjsp_v2r8       dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
+     _fjsp_v2r8       dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
+     _fjsp_v2r8       dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
+     _fjsp_v2r8       dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
+     _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+     real             *charge;
+     _fjsp_v2r8       ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+     real             *ewtab;
+     _fjsp_v2r8       itab_tmp;
+     _fjsp_v2r8       dummy_mask,cutoff_mask;
+     _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+     _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+     union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+     x                = xx[0];
+     f                = ff[0];
+     nri              = nlist->nri;
+     iinr             = nlist->iinr;
+     jindex           = nlist->jindex;
+     jjnr             = nlist->jjnr;
+     shiftidx         = nlist->shift;
+     gid              = nlist->gid;
+     shiftvec         = fr->shift_vec[0];
+     fshift           = fr->fshift[0];
+     facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+     charge           = mdatoms->chargeA;
+     sh_ewald         = gmx_fjsp_set1_v2r8(fr->ic->sh_ewald);
+     ewtab            = fr->ic->tabq_coul_F;
+     ewtabscale       = gmx_fjsp_set1_v2r8(fr->ic->tabq_scale);
+     ewtabhalfspace   = gmx_fjsp_set1_v2r8(0.5/fr->ic->tabq_scale);
+     /* Setup water-specific parameters */
+     inr              = nlist->iinr[0];
+     iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+     iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+     iq3              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+3]));
+     jq1              = gmx_fjsp_set1_v2r8(charge[inr+1]);
+     jq2              = gmx_fjsp_set1_v2r8(charge[inr+2]);
+     jq3              = gmx_fjsp_set1_v2r8(charge[inr+3]);
+     qq11             = _fjsp_mul_v2r8(iq1,jq1);
+     qq12             = _fjsp_mul_v2r8(iq1,jq2);
+     qq13             = _fjsp_mul_v2r8(iq1,jq3);
+     qq21             = _fjsp_mul_v2r8(iq2,jq1);
+     qq22             = _fjsp_mul_v2r8(iq2,jq2);
+     qq23             = _fjsp_mul_v2r8(iq2,jq3);
+     qq31             = _fjsp_mul_v2r8(iq3,jq1);
+     qq32             = _fjsp_mul_v2r8(iq3,jq2);
+     qq33             = _fjsp_mul_v2r8(iq3,jq3);
+     /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+     rcutoff_scalar   = fr->rcoulomb;
+     rcutoff          = gmx_fjsp_set1_v2r8(rcutoff_scalar);
+     rcutoff2         = _fjsp_mul_v2r8(rcutoff,rcutoff);
+     /* Avoid stupid compiler warnings */
+     jnrA = jnrB = 0;
+     j_coord_offsetA = 0;
+     j_coord_offsetB = 0;
+     outeriter        = 0;
+     inneriter        = 0;
+     /* Start outer loop over neighborlists */
+     for(iidx=0; iidx<nri; iidx++)
+     {
+         /* Load shift vector for this list */
+         i_shift_offset   = DIM*shiftidx[iidx];
+         /* Load limits for loop over neighbors */
+         j_index_start    = jindex[iidx];
+         j_index_end      = jindex[iidx+1];
+         /* Get outer coordinate index */
+         inr              = iinr[iidx];
+         i_coord_offset   = DIM*inr;
+         /* Load i particle coords and add shift vector */
+         gmx_fjsp_load_shift_and_3rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset+DIM,
+                                                  &ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+         fix1             = _fjsp_setzero_v2r8();
+         fiy1             = _fjsp_setzero_v2r8();
+         fiz1             = _fjsp_setzero_v2r8();
+         fix2             = _fjsp_setzero_v2r8();
+         fiy2             = _fjsp_setzero_v2r8();
+         fiz2             = _fjsp_setzero_v2r8();
+         fix3             = _fjsp_setzero_v2r8();
+         fiy3             = _fjsp_setzero_v2r8();
+         fiz3             = _fjsp_setzero_v2r8();
+         /* Start inner kernel loop */
+         for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+         {
+             /* Get j neighbor index, and coordinate index */
+             jnrA             = jjnr[jidx];
+             jnrB             = jjnr[jidx+1];
+             j_coord_offsetA  = DIM*jnrA;
+             j_coord_offsetB  = DIM*jnrB;
+             /* load j atom coordinates */
+             gmx_fjsp_load_3rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA+DIM,x+j_coord_offsetB+DIM,
+                                               &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
+             /* Calculate displacement vector */
+             dx11             = _fjsp_sub_v2r8(ix1,jx1);
+             dy11             = _fjsp_sub_v2r8(iy1,jy1);
+             dz11             = _fjsp_sub_v2r8(iz1,jz1);
+             dx12             = _fjsp_sub_v2r8(ix1,jx2);
+             dy12             = _fjsp_sub_v2r8(iy1,jy2);
+             dz12             = _fjsp_sub_v2r8(iz1,jz2);
+             dx13             = _fjsp_sub_v2r8(ix1,jx3);
+             dy13             = _fjsp_sub_v2r8(iy1,jy3);
+             dz13             = _fjsp_sub_v2r8(iz1,jz3);
+             dx21             = _fjsp_sub_v2r8(ix2,jx1);
+             dy21             = _fjsp_sub_v2r8(iy2,jy1);
+             dz21             = _fjsp_sub_v2r8(iz2,jz1);
+             dx22             = _fjsp_sub_v2r8(ix2,jx2);
+             dy22             = _fjsp_sub_v2r8(iy2,jy2);
+             dz22             = _fjsp_sub_v2r8(iz2,jz2);
+             dx23             = _fjsp_sub_v2r8(ix2,jx3);
+             dy23             = _fjsp_sub_v2r8(iy2,jy3);
+             dz23             = _fjsp_sub_v2r8(iz2,jz3);
+             dx31             = _fjsp_sub_v2r8(ix3,jx1);
+             dy31             = _fjsp_sub_v2r8(iy3,jy1);
+             dz31             = _fjsp_sub_v2r8(iz3,jz1);
+             dx32             = _fjsp_sub_v2r8(ix3,jx2);
+             dy32             = _fjsp_sub_v2r8(iy3,jy2);
+             dz32             = _fjsp_sub_v2r8(iz3,jz2);
+             dx33             = _fjsp_sub_v2r8(ix3,jx3);
+             dy33             = _fjsp_sub_v2r8(iy3,jy3);
+             dz33             = _fjsp_sub_v2r8(iz3,jz3);
+             /* Calculate squared distance and things based on it */
+             rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+             rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+             rsq13            = gmx_fjsp_calc_rsq_v2r8(dx13,dy13,dz13);
+             rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+             rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+             rsq23            = gmx_fjsp_calc_rsq_v2r8(dx23,dy23,dz23);
+             rsq31            = gmx_fjsp_calc_rsq_v2r8(dx31,dy31,dz31);
+             rsq32            = gmx_fjsp_calc_rsq_v2r8(dx32,dy32,dz32);
+             rsq33            = gmx_fjsp_calc_rsq_v2r8(dx33,dy33,dz33);
+             rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+             rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+             rinv13           = gmx_fjsp_invsqrt_v2r8(rsq13);
+             rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+             rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+             rinv23           = gmx_fjsp_invsqrt_v2r8(rsq23);
+             rinv31           = gmx_fjsp_invsqrt_v2r8(rsq31);
+             rinv32           = gmx_fjsp_invsqrt_v2r8(rsq32);
+             rinv33           = gmx_fjsp_invsqrt_v2r8(rsq33);
+             rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+             rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+             rinvsq13         = _fjsp_mul_v2r8(rinv13,rinv13);
+             rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+             rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+             rinvsq23         = _fjsp_mul_v2r8(rinv23,rinv23);
+             rinvsq31         = _fjsp_mul_v2r8(rinv31,rinv31);
+             rinvsq32         = _fjsp_mul_v2r8(rinv32,rinv32);
+             rinvsq33         = _fjsp_mul_v2r8(rinv33,rinv33);
+             fjx1             = _fjsp_setzero_v2r8();
+             fjy1             = _fjsp_setzero_v2r8();
+             fjz1             = _fjsp_setzero_v2r8();
+             fjx2             = _fjsp_setzero_v2r8();
+             fjy2             = _fjsp_setzero_v2r8();
+             fjz2             = _fjsp_setzero_v2r8();
+             fjx3             = _fjsp_setzero_v2r8();
+             fjy3             = _fjsp_setzero_v2r8();
+             fjz3             = _fjsp_setzero_v2r8();
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq11,rcutoff2))
+             {
+             r11              = _fjsp_mul_v2r8(rsq11,rinv11);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r11,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                          &ewtabF,&ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq11,rinv11),_fjsp_sub_v2r8(rinvsq11,felec));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq11,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+             
+             fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq12,rcutoff2))
+             {
+             r12              = _fjsp_mul_v2r8(rsq12,rinv12);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r12,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                          &ewtabF,&ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq12,rinv12),_fjsp_sub_v2r8(rinvsq12,felec));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq12,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+             
+             fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq13,rcutoff2))
+             {
+             r13              = _fjsp_mul_v2r8(rsq13,rinv13);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r13,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                          &ewtabF,&ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq13,rinv13),_fjsp_sub_v2r8(rinvsq13,felec));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq13,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx13,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy13,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz13,fscal,fiz1);
+             
+             fjx3             = _fjsp_madd_v2r8(dx13,fscal,fjx3);
+             fjy3             = _fjsp_madd_v2r8(dy13,fscal,fjy3);
+             fjz3             = _fjsp_madd_v2r8(dz13,fscal,fjz3);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq21,rcutoff2))
+             {
+             r21              = _fjsp_mul_v2r8(rsq21,rinv21);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r21,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                          &ewtabF,&ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq21,rinv21),_fjsp_sub_v2r8(rinvsq21,felec));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq21,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+             
+             fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq22,rcutoff2))
+             {
+             r22              = _fjsp_mul_v2r8(rsq22,rinv22);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r22,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                          &ewtabF,&ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq22,rinv22),_fjsp_sub_v2r8(rinvsq22,felec));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq22,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+             
+             fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq23,rcutoff2))
+             {
+             r23              = _fjsp_mul_v2r8(rsq23,rinv23);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r23,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                          &ewtabF,&ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq23,rinv23),_fjsp_sub_v2r8(rinvsq23,felec));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq23,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx23,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy23,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz23,fscal,fiz2);
+             
+             fjx3             = _fjsp_madd_v2r8(dx23,fscal,fjx3);
+             fjy3             = _fjsp_madd_v2r8(dy23,fscal,fjy3);
+             fjz3             = _fjsp_madd_v2r8(dz23,fscal,fjz3);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq31,rcutoff2))
+             {
+             r31              = _fjsp_mul_v2r8(rsq31,rinv31);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r31,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                          &ewtabF,&ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq31,rinv31),_fjsp_sub_v2r8(rinvsq31,felec));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq31,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix3             = _fjsp_madd_v2r8(dx31,fscal,fix3);
+             fiy3             = _fjsp_madd_v2r8(dy31,fscal,fiy3);
+             fiz3             = _fjsp_madd_v2r8(dz31,fscal,fiz3);
+             
+             fjx1             = _fjsp_madd_v2r8(dx31,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy31,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz31,fscal,fjz1);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq32,rcutoff2))
+             {
+             r32              = _fjsp_mul_v2r8(rsq32,rinv32);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r32,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                          &ewtabF,&ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq32,rinv32),_fjsp_sub_v2r8(rinvsq32,felec));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq32,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix3             = _fjsp_madd_v2r8(dx32,fscal,fix3);
+             fiy3             = _fjsp_madd_v2r8(dy32,fscal,fiy3);
+             fiz3             = _fjsp_madd_v2r8(dz32,fscal,fiz3);
+             
+             fjx2             = _fjsp_madd_v2r8(dx32,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy32,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz32,fscal,fjz2);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq33,rcutoff2))
+             {
+             r33              = _fjsp_mul_v2r8(rsq33,rinv33);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r33,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                          &ewtabF,&ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq33,rinv33),_fjsp_sub_v2r8(rinvsq33,felec));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq33,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix3             = _fjsp_madd_v2r8(dx33,fscal,fix3);
+             fiy3             = _fjsp_madd_v2r8(dy33,fscal,fiy3);
+             fiz3             = _fjsp_madd_v2r8(dz33,fscal,fiz3);
+             
+             fjx3             = _fjsp_madd_v2r8(dx33,fscal,fjx3);
+             fjy3             = _fjsp_madd_v2r8(dy33,fscal,fjy3);
+             fjz3             = _fjsp_madd_v2r8(dz33,fscal,fjz3);
+             }
+             gmx_fjsp_decrement_3rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA+DIM,f+j_coord_offsetB+DIM,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+             /* Inner loop uses 378 flops */
+         }
+         if(jidx<j_index_end)
+         {
+             jnrA             = jjnr[jidx];
+             j_coord_offsetA  = DIM*jnrA;
+             /* load j atom coordinates */
+             gmx_fjsp_load_3rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA+DIM,
+                                               &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
+             /* Calculate displacement vector */
+             dx11             = _fjsp_sub_v2r8(ix1,jx1);
+             dy11             = _fjsp_sub_v2r8(iy1,jy1);
+             dz11             = _fjsp_sub_v2r8(iz1,jz1);
+             dx12             = _fjsp_sub_v2r8(ix1,jx2);
+             dy12             = _fjsp_sub_v2r8(iy1,jy2);
+             dz12             = _fjsp_sub_v2r8(iz1,jz2);
+             dx13             = _fjsp_sub_v2r8(ix1,jx3);
+             dy13             = _fjsp_sub_v2r8(iy1,jy3);
+             dz13             = _fjsp_sub_v2r8(iz1,jz3);
+             dx21             = _fjsp_sub_v2r8(ix2,jx1);
+             dy21             = _fjsp_sub_v2r8(iy2,jy1);
+             dz21             = _fjsp_sub_v2r8(iz2,jz1);
+             dx22             = _fjsp_sub_v2r8(ix2,jx2);
+             dy22             = _fjsp_sub_v2r8(iy2,jy2);
+             dz22             = _fjsp_sub_v2r8(iz2,jz2);
+             dx23             = _fjsp_sub_v2r8(ix2,jx3);
+             dy23             = _fjsp_sub_v2r8(iy2,jy3);
+             dz23             = _fjsp_sub_v2r8(iz2,jz3);
+             dx31             = _fjsp_sub_v2r8(ix3,jx1);
+             dy31             = _fjsp_sub_v2r8(iy3,jy1);
+             dz31             = _fjsp_sub_v2r8(iz3,jz1);
+             dx32             = _fjsp_sub_v2r8(ix3,jx2);
+             dy32             = _fjsp_sub_v2r8(iy3,jy2);
+             dz32             = _fjsp_sub_v2r8(iz3,jz2);
+             dx33             = _fjsp_sub_v2r8(ix3,jx3);
+             dy33             = _fjsp_sub_v2r8(iy3,jy3);
+             dz33             = _fjsp_sub_v2r8(iz3,jz3);
+             /* Calculate squared distance and things based on it */
+             rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+             rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+             rsq13            = gmx_fjsp_calc_rsq_v2r8(dx13,dy13,dz13);
+             rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+             rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+             rsq23            = gmx_fjsp_calc_rsq_v2r8(dx23,dy23,dz23);
+             rsq31            = gmx_fjsp_calc_rsq_v2r8(dx31,dy31,dz31);
+             rsq32            = gmx_fjsp_calc_rsq_v2r8(dx32,dy32,dz32);
+             rsq33            = gmx_fjsp_calc_rsq_v2r8(dx33,dy33,dz33);
+             rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+             rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+             rinv13           = gmx_fjsp_invsqrt_v2r8(rsq13);
+             rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+             rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+             rinv23           = gmx_fjsp_invsqrt_v2r8(rsq23);
+             rinv31           = gmx_fjsp_invsqrt_v2r8(rsq31);
+             rinv32           = gmx_fjsp_invsqrt_v2r8(rsq32);
+             rinv33           = gmx_fjsp_invsqrt_v2r8(rsq33);
+             rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+             rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+             rinvsq13         = _fjsp_mul_v2r8(rinv13,rinv13);
+             rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+             rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+             rinvsq23         = _fjsp_mul_v2r8(rinv23,rinv23);
+             rinvsq31         = _fjsp_mul_v2r8(rinv31,rinv31);
+             rinvsq32         = _fjsp_mul_v2r8(rinv32,rinv32);
+             rinvsq33         = _fjsp_mul_v2r8(rinv33,rinv33);
+             fjx1             = _fjsp_setzero_v2r8();
+             fjy1             = _fjsp_setzero_v2r8();
+             fjz1             = _fjsp_setzero_v2r8();
+             fjx2             = _fjsp_setzero_v2r8();
+             fjy2             = _fjsp_setzero_v2r8();
+             fjz2             = _fjsp_setzero_v2r8();
+             fjx3             = _fjsp_setzero_v2r8();
+             fjy3             = _fjsp_setzero_v2r8();
+             fjz3             = _fjsp_setzero_v2r8();
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq11,rcutoff2))
+             {
+             r11              = _fjsp_mul_v2r8(rsq11,rinv11);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r11,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq11,rinv11),_fjsp_sub_v2r8(rinvsq11,felec));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq11,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+             
+             fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq12,rcutoff2))
+             {
+             r12              = _fjsp_mul_v2r8(rsq12,rinv12);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r12,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq12,rinv12),_fjsp_sub_v2r8(rinvsq12,felec));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq12,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+             
+             fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq13,rcutoff2))
+             {
+             r13              = _fjsp_mul_v2r8(rsq13,rinv13);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r13,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq13,rinv13),_fjsp_sub_v2r8(rinvsq13,felec));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq13,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx13,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy13,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz13,fscal,fiz1);
+             
+             fjx3             = _fjsp_madd_v2r8(dx13,fscal,fjx3);
+             fjy3             = _fjsp_madd_v2r8(dy13,fscal,fjy3);
+             fjz3             = _fjsp_madd_v2r8(dz13,fscal,fjz3);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq21,rcutoff2))
+             {
+             r21              = _fjsp_mul_v2r8(rsq21,rinv21);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r21,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq21,rinv21),_fjsp_sub_v2r8(rinvsq21,felec));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq21,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+             
+             fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq22,rcutoff2))
+             {
+             r22              = _fjsp_mul_v2r8(rsq22,rinv22);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r22,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq22,rinv22),_fjsp_sub_v2r8(rinvsq22,felec));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq22,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+             
+             fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq23,rcutoff2))
+             {
+             r23              = _fjsp_mul_v2r8(rsq23,rinv23);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r23,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq23,rinv23),_fjsp_sub_v2r8(rinvsq23,felec));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq23,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx23,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy23,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz23,fscal,fiz2);
+             
+             fjx3             = _fjsp_madd_v2r8(dx23,fscal,fjx3);
+             fjy3             = _fjsp_madd_v2r8(dy23,fscal,fjy3);
+             fjz3             = _fjsp_madd_v2r8(dz23,fscal,fjz3);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq31,rcutoff2))
+             {
+             r31              = _fjsp_mul_v2r8(rsq31,rinv31);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r31,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq31,rinv31),_fjsp_sub_v2r8(rinvsq31,felec));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq31,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix3             = _fjsp_madd_v2r8(dx31,fscal,fix3);
+             fiy3             = _fjsp_madd_v2r8(dy31,fscal,fiy3);
+             fiz3             = _fjsp_madd_v2r8(dz31,fscal,fiz3);
+             
+             fjx1             = _fjsp_madd_v2r8(dx31,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy31,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz31,fscal,fjz1);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq32,rcutoff2))
+             {
+             r32              = _fjsp_mul_v2r8(rsq32,rinv32);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r32,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq32,rinv32),_fjsp_sub_v2r8(rinvsq32,felec));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq32,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix3             = _fjsp_madd_v2r8(dx32,fscal,fix3);
+             fiy3             = _fjsp_madd_v2r8(dy32,fscal,fiy3);
+             fiz3             = _fjsp_madd_v2r8(dz32,fscal,fiz3);
+             
+             fjx2             = _fjsp_madd_v2r8(dx32,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy32,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz32,fscal,fjz2);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq33,rcutoff2))
+             {
+             r33              = _fjsp_mul_v2r8(rsq33,rinv33);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r33,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq33,rinv33),_fjsp_sub_v2r8(rinvsq33,felec));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq33,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix3             = _fjsp_madd_v2r8(dx33,fscal,fix3);
+             fiy3             = _fjsp_madd_v2r8(dy33,fscal,fiy3);
+             fiz3             = _fjsp_madd_v2r8(dz33,fscal,fiz3);
+             
+             fjx3             = _fjsp_madd_v2r8(dx33,fscal,fjx3);
+             fjy3             = _fjsp_madd_v2r8(dy33,fscal,fjy3);
+             fjz3             = _fjsp_madd_v2r8(dz33,fscal,fjz3);
+             }
+             gmx_fjsp_decrement_3rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA+DIM,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+             /* Inner loop uses 378 flops */
+         }
+         /* End of innermost loop */
+         gmx_fjsp_update_iforce_3atom_swizzle_v2r8(fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                               f+i_coord_offset+DIM,fshift+i_shift_offset);
+         /* Increment number of inner iterations */
+         inneriter                  += j_index_end - j_index_start;
+         /* Outer loop uses 18 flops */
+     }
+     /* Increment number of outer iterations */
+     outeriter        += nri;
+     /* Update outer/inner flops */
+     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4W4_F,outeriter*18 + inneriter*378);
+ }
index 0000000000000000000000000000000000000000,0fb4e0584337702221c896d934d690595c0ae86b..0fb4e0584337702221c896d934d690595c0ae86b
mode 000000,100644..100644
--- /dev/null
@@@ -1,0 -1,759 +1,759 @@@
+ /*
+  * This file is part of the GROMACS molecular simulation package.
+  *
+  * Copyright (c) 2012, by the GROMACS development team, led by
+  * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+  * others, as listed in the AUTHORS file in the top-level source
+  * directory and at http://www.gromacs.org.
+  *
+  * GROMACS is free software; you can redistribute it and/or
+  * modify it under the terms of the GNU Lesser General Public License
+  * as published by the Free Software Foundation; either version 2.1
+  * of the License, or (at your option) any later version.
+  *
+  * GROMACS is distributed in the hope that it will be useful,
+  * but WITHOUT ANY WARRANTY; without even the implied warranty of
+  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  * Lesser General Public License for more details.
+  *
+  * You should have received a copy of the GNU Lesser General Public
+  * License along with GROMACS; if not, see
+  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+  *
+  * If you want to redistribute modifications to GROMACS, please
+  * consider that scientific software is very special. Version
+  * control is crucial - bugs must be traceable. We will be happy to
+  * consider code for inclusion in the official distribution, but
+  * derived work must not be called official GROMACS. Details are found
+  * in the README & COPYING files - if they are missing, get the
+  * official version at http://www.gromacs.org.
+  *
+  * To help us fund GROMACS development, we humbly ask that you cite
+  * the research papers on the package. Check out http://www.gromacs.org.
+  */
+ /*
+  * Note: this file was generated by the GROMACS sparc64_hpc_ace_double kernel generator.
+  */
+ #ifdef HAVE_CONFIG_H
+ #include <config.h>
+ #endif
+ #include <math.h>
+ #include "../nb_kernel.h"
+ #include "types/simple.h"
+ #include "vec.h"
+ #include "nrnb.h"
+ #include "kernelutil_sparc64_hpc_ace_double.h"
+ /*
+  * Gromacs nonbonded kernel:   nb_kernel_ElecEwSw_VdwLJSw_GeomP1P1_VF_sparc64_hpc_ace_double
+  * Electrostatics interaction: Ewald
+  * VdW interaction:            LennardJones
+  * Geometry:                   Particle-Particle
+  * Calculate force/pot:        PotentialAndForce
+  */
+ void
+ nb_kernel_ElecEwSw_VdwLJSw_GeomP1P1_VF_sparc64_hpc_ace_double
+                     (t_nblist * gmx_restrict                nlist,
+                      rvec * gmx_restrict                    xx,
+                      rvec * gmx_restrict                    ff,
+                      t_forcerec * gmx_restrict              fr,
+                      t_mdatoms * gmx_restrict               mdatoms,
+                      nb_kernel_data_t * gmx_restrict        kernel_data,
+                      t_nrnb * gmx_restrict                  nrnb)
+ {
+     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+      * just 0 for non-waters.
+      * Suffixes A,B refer to j loop unrolling done with double precision SIMD, i.e. the two different
+      * jnr indices corresponding to data put in the two positions in the SIMD register.
+      */
+     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+     int              jnrA,jnrB;
+     int              j_coord_offsetA,j_coord_offsetB;
+     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+     real             rcutoff_scalar;
+     real             *shiftvec,*fshift,*x,*f;
+     _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+     int              vdwioffset0;
+     _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+     int              vdwjidx0A,vdwjidx0B;
+     _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+     _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+     _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+     real             *charge;
+     int              nvdwtype;
+     _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+     int              *vdwtype;
+     real             *vdwparam;
+     _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+     _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+     _fjsp_v2r8       ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+     real             *ewtab;
+     _fjsp_v2r8       rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
+     real             rswitch_scalar,d_scalar;
+     _fjsp_v2r8       itab_tmp;
+     _fjsp_v2r8       dummy_mask,cutoff_mask;
+     _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+     _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+     union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+     x                = xx[0];
+     f                = ff[0];
+     nri              = nlist->nri;
+     iinr             = nlist->iinr;
+     jindex           = nlist->jindex;
+     jjnr             = nlist->jjnr;
+     shiftidx         = nlist->shift;
+     gid              = nlist->gid;
+     shiftvec         = fr->shift_vec[0];
+     fshift           = fr->fshift[0];
+     facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+     charge           = mdatoms->chargeA;
+     nvdwtype         = fr->ntype;
+     vdwparam         = fr->nbfp;
+     vdwtype          = mdatoms->typeA;
+     sh_ewald         = gmx_fjsp_set1_v2r8(fr->ic->sh_ewald);
+     ewtab            = fr->ic->tabq_coul_FDV0;
+     ewtabscale       = gmx_fjsp_set1_v2r8(fr->ic->tabq_scale);
+     ewtabhalfspace   = gmx_fjsp_set1_v2r8(0.5/fr->ic->tabq_scale);
+     /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+     rcutoff_scalar   = fr->rcoulomb;
+     rcutoff          = gmx_fjsp_set1_v2r8(rcutoff_scalar);
+     rcutoff2         = _fjsp_mul_v2r8(rcutoff,rcutoff);
+     rswitch_scalar   = fr->rcoulomb_switch;
+     rswitch          = gmx_fjsp_set1_v2r8(rswitch_scalar);
+     /* Setup switch parameters */
+     d_scalar         = rcutoff_scalar-rswitch_scalar;
+     d                = gmx_fjsp_set1_v2r8(d_scalar);
+     swV3             = gmx_fjsp_set1_v2r8(-10.0/(d_scalar*d_scalar*d_scalar));
+     swV4             = gmx_fjsp_set1_v2r8( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
+     swV5             = gmx_fjsp_set1_v2r8( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
+     swF2             = gmx_fjsp_set1_v2r8(-30.0/(d_scalar*d_scalar*d_scalar));
+     swF3             = gmx_fjsp_set1_v2r8( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
+     swF4             = gmx_fjsp_set1_v2r8(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
+     /* Avoid stupid compiler warnings */
+     jnrA = jnrB = 0;
+     j_coord_offsetA = 0;
+     j_coord_offsetB = 0;
+     outeriter        = 0;
+     inneriter        = 0;
+     /* Start outer loop over neighborlists */
+     for(iidx=0; iidx<nri; iidx++)
+     {
+         /* Load shift vector for this list */
+         i_shift_offset   = DIM*shiftidx[iidx];
+         /* Load limits for loop over neighbors */
+         j_index_start    = jindex[iidx];
+         j_index_end      = jindex[iidx+1];
+         /* Get outer coordinate index */
+         inr              = iinr[iidx];
+         i_coord_offset   = DIM*inr;
+         /* Load i particle coords and add shift vector */
+         gmx_fjsp_load_shift_and_1rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
+         fix0             = _fjsp_setzero_v2r8();
+         fiy0             = _fjsp_setzero_v2r8();
+         fiz0             = _fjsp_setzero_v2r8();
+         /* Load parameters for i particles */
+         iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_load1_v2r8(charge+inr+0));
+         vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+         /* Reset potential sums */
+         velecsum         = _fjsp_setzero_v2r8();
+         vvdwsum          = _fjsp_setzero_v2r8();
+         /* Start inner kernel loop */
+         for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+         {
+             /* Get j neighbor index, and coordinate index */
+             jnrA             = jjnr[jidx];
+             jnrB             = jjnr[jidx+1];
+             j_coord_offsetA  = DIM*jnrA;
+             j_coord_offsetB  = DIM*jnrB;
+             /* load j atom coordinates */
+             gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                               &jx0,&jy0,&jz0);
+             /* Calculate displacement vector */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             /* Calculate squared distance and things based on it */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+             rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+             /* Load parameters for j particles */
+             jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+             vdwjidx0A        = 2*vdwtype[jnrA+0];
+             vdwjidx0B        = 2*vdwtype[jnrB+0];
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+             {
+             r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+             /* Compute parameters for interactions between i and j atoms */
+             qq00             = _fjsp_mul_v2r8(iq0,jq0);
+             gmx_fjsp_load_2pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,
+                                          vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r00,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq00,_fjsp_sub_v2r8(rinv00,velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,rinv00),_fjsp_sub_v2r8(rinvsq00,felec));
+             /* LENNARD-JONES DISPERSION/REPULSION */
+             rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+             vvdw6            = _fjsp_mul_v2r8(c6_00,rinvsix);
+             vvdw12           = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
+             vvdw             = _fjsp_msub_v2r8( vvdw12,one_twelfth, _fjsp_mul_v2r8(vvdw6,one_sixth) );
+             fvdw             = _fjsp_mul_v2r8(_fjsp_sub_v2r8(vvdw12,vvdw6),rinvsq00);
+             d                = _fjsp_sub_v2r8(r00,rswitch);
+             d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+             d2               = _fjsp_mul_v2r8(d,d);
+             sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+             dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+             /* Evaluate switch function */
+             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+             felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv00,_fjsp_mul_v2r8(velec,dsw)) );
+             fvdw             = _fjsp_msub_v2r8( fvdw,sw , _fjsp_mul_v2r8(rinv00,_fjsp_mul_v2r8(vvdw,dsw)) );
+             velec            = _fjsp_mul_v2r8(velec,sw);
+             vvdw             = _fjsp_mul_v2r8(vvdw,sw);
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             vvdw             = _fjsp_and_v2r8(vvdw,cutoff_mask);
+             vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+             fscal            = _fjsp_add_v2r8(felec,fvdw);
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             
+             gmx_fjsp_decrement_fma_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fscal,dx00,dy00,dz00);
+             }
+             /* Inner loop uses 86 flops */
+         }
+         if(jidx<j_index_end)
+         {
+             jnrA             = jjnr[jidx];
+             j_coord_offsetA  = DIM*jnrA;
+             /* load j atom coordinates */
+             gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                               &jx0,&jy0,&jz0);
+             /* Calculate displacement vector */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             /* Calculate squared distance and things based on it */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+             rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+             /* Load parameters for j particles */
+             jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
+             vdwjidx0A        = 2*vdwtype[jnrA+0];
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+             {
+             r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+             /* Compute parameters for interactions between i and j atoms */
+             qq00             = _fjsp_mul_v2r8(iq0,jq0);
+             gmx_fjsp_load_1pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r00,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq00,_fjsp_sub_v2r8(rinv00,velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,rinv00),_fjsp_sub_v2r8(rinvsq00,felec));
+             /* LENNARD-JONES DISPERSION/REPULSION */
+             rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+             vvdw6            = _fjsp_mul_v2r8(c6_00,rinvsix);
+             vvdw12           = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
+             vvdw             = _fjsp_msub_v2r8( vvdw12,one_twelfth, _fjsp_mul_v2r8(vvdw6,one_sixth) );
+             fvdw             = _fjsp_mul_v2r8(_fjsp_sub_v2r8(vvdw12,vvdw6),rinvsq00);
+             d                = _fjsp_sub_v2r8(r00,rswitch);
+             d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+             d2               = _fjsp_mul_v2r8(d,d);
+             sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+             dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+             /* Evaluate switch function */
+             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+             felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv00,_fjsp_mul_v2r8(velec,dsw)) );
+             fvdw             = _fjsp_msub_v2r8( fvdw,sw , _fjsp_mul_v2r8(rinv00,_fjsp_mul_v2r8(vvdw,dsw)) );
+             velec            = _fjsp_mul_v2r8(velec,sw);
+             vvdw             = _fjsp_mul_v2r8(vvdw,sw);
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             vvdw             = _fjsp_and_v2r8(vvdw,cutoff_mask);
+             vvdw             = _fjsp_unpacklo_v2r8(vvdw,_fjsp_setzero_v2r8());
+             vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+             fscal            = _fjsp_add_v2r8(felec,fvdw);
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             
+             gmx_fjsp_decrement_fma_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fscal,dx00,dy00,dz00);
+             }
+             /* Inner loop uses 86 flops */
+         }
+         /* End of innermost loop */
+         gmx_fjsp_update_iforce_1atom_swizzle_v2r8(fix0,fiy0,fiz0,
+                                               f+i_coord_offset,fshift+i_shift_offset);
+         ggid                        = gid[iidx];
+         /* Update potential energies */
+         gmx_fjsp_update_1pot_v2r8(velecsum,kernel_data->energygrp_elec+ggid);
+         gmx_fjsp_update_1pot_v2r8(vvdwsum,kernel_data->energygrp_vdw+ggid);
+         /* Increment number of inner iterations */
+         inneriter                  += j_index_end - j_index_start;
+         /* Outer loop uses 9 flops */
+     }
+     /* Increment number of outer iterations */
+     outeriter        += nri;
+     /* Update outer/inner flops */
+     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_VF,outeriter*9 + inneriter*86);
+ }
+ /*
+  * Gromacs nonbonded kernel:   nb_kernel_ElecEwSw_VdwLJSw_GeomP1P1_F_sparc64_hpc_ace_double
+  * Electrostatics interaction: Ewald
+  * VdW interaction:            LennardJones
+  * Geometry:                   Particle-Particle
+  * Calculate force/pot:        Force
+  *
+  * For every i particle in the neighbor list this kernel walks its j
+  * neighbors two at a time (SIMD lanes A and B), treats a possible single
+  * leftover j particle in a separate block, and hands the switched
+  * Ewald + Lennard-Jones force to the helpers that update f and fshift.
+  * Nothing is written to the energy group arrays in this force-only variant.
+  *
+  * Parameters, as used by the code below:
+  *   nlist        neighbor list; nri, iinr, jindex, jjnr and shift are used
+  *                (gid is fetched but not read afterwards)
+  *   xx           particle coordinates, accessed as a flat real array via xx[0]
+  *   ff           forces, updated through the flat real array ff[0]
+  *   fr           supplies shift_vec, fshift, epsfac, ntype, nbfp, rcoulomb,
+  *                rcoulomb_switch and the Ewald table data reached through fr->ic
+  *   mdatoms      supplies per-atom charges (chargeA) and VdW types (typeA)
+  *   kernel_data  not referenced by this kernel
+  *   nrnb         flop accounting, updated once at the end through inc_nrnb()
+  *
+  * NOTE(review): this is generated code and a number of locals declared below
+  * (for example tx, ty, tz, two, dummy_mask, vfconv, gbconv) are never
+  * referenced in this function.
+  */
+ void
+ nb_kernel_ElecEwSw_VdwLJSw_GeomP1P1_F_sparc64_hpc_ace_double
+                     (t_nblist * gmx_restrict                nlist,
+                      rvec * gmx_restrict                    xx,
+                      rvec * gmx_restrict                    ff,
+                      t_forcerec * gmx_restrict              fr,
+                      t_mdatoms * gmx_restrict               mdatoms,
+                      nb_kernel_data_t * gmx_restrict        kernel_data,
+                      t_nrnb * gmx_restrict                  nrnb)
+ {
+     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+      * just 0 for non-waters.
+      * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+      * jnr indices corresponding to data put in the two positions in the SIMD register.
+      */
+     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+     int              jnrA,jnrB;
+     int              j_coord_offsetA,j_coord_offsetB;
+     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+     real             rcutoff_scalar;
+     real             *shiftvec,*fshift,*x,*f;
+     _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+     int              vdwioffset0;
+     _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+     int              vdwjidx0A,vdwjidx0B;
+     _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+     _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+     _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+     real             *charge;
+     int              nvdwtype;
+     _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+     int              *vdwtype;
+     real             *vdwparam;
+     _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+     _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+     _fjsp_v2r8       ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+     real             *ewtab;
+     _fjsp_v2r8       rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
+     real             rswitch_scalar,d_scalar;
+     _fjsp_v2r8       itab_tmp;
+     _fjsp_v2r8       dummy_mask,cutoff_mask;
+     _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+     _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+     union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv; /* only ewconv is used: it lets the two table indices be read back as integers */
+     x                = xx[0];
+     f                = ff[0];
+     nri              = nlist->nri;
+     iinr             = nlist->iinr;
+     jindex           = nlist->jindex;
+     jjnr             = nlist->jjnr;
+     shiftidx         = nlist->shift;
+     gid              = nlist->gid;
+     shiftvec         = fr->shift_vec[0];
+     fshift           = fr->fshift[0];
+     facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+     charge           = mdatoms->chargeA;
+     nvdwtype         = fr->ntype;
+     vdwparam         = fr->nbfp;
+     vdwtype          = mdatoms->typeA;
+     sh_ewald         = gmx_fjsp_set1_v2r8(fr->ic->sh_ewald); /* set here but not read again in this kernel */
+     ewtab            = fr->ic->tabq_coul_FDV0;
+     ewtabscale       = gmx_fjsp_set1_v2r8(fr->ic->tabq_scale);
+     ewtabhalfspace   = gmx_fjsp_set1_v2r8(0.5/fr->ic->tabq_scale);
+     /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+     rcutoff_scalar   = fr->rcoulomb;
+     rcutoff          = gmx_fjsp_set1_v2r8(rcutoff_scalar);
+     rcutoff2         = _fjsp_mul_v2r8(rcutoff,rcutoff);
+     rswitch_scalar   = fr->rcoulomb_switch;
+     rswitch          = gmx_fjsp_set1_v2r8(rswitch_scalar);
+     /* Setup switch parameters: d_scalar is the width of the switching region
+      * (rcutoff - rswitch); swV3..swV5 and swF2..swF4 are the polynomial
+      * coefficients that sw and dsw are built from in the inner loop. */
+     d_scalar         = rcutoff_scalar-rswitch_scalar;
+     d                = gmx_fjsp_set1_v2r8(d_scalar); /* reassigned inside the j loops before it is ever read */
+     swV3             = gmx_fjsp_set1_v2r8(-10.0/(d_scalar*d_scalar*d_scalar));
+     swV4             = gmx_fjsp_set1_v2r8( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
+     swV5             = gmx_fjsp_set1_v2r8( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
+     swF2             = gmx_fjsp_set1_v2r8(-30.0/(d_scalar*d_scalar*d_scalar));
+     swF3             = gmx_fjsp_set1_v2r8( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
+     swF4             = gmx_fjsp_set1_v2r8(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
+     /* Avoid stupid compiler warnings */
+     jnrA = jnrB = 0;
+     j_coord_offsetA = 0;
+     j_coord_offsetB = 0;
+     outeriter        = 0;
+     inneriter        = 0;
+     /* Start outer loop over neighborlists */
+     for(iidx=0; iidx<nri; iidx++)
+     {
+         /* Load shift vector for this list */
+         i_shift_offset   = DIM*shiftidx[iidx];
+         /* Load limits for loop over neighbors */
+         j_index_start    = jindex[iidx];
+         j_index_end      = jindex[iidx+1];
+         /* Get outer coordinate index */
+         inr              = iinr[iidx];
+         i_coord_offset   = DIM*inr;
+         /* Load i particle coords and add shift vector */
+         gmx_fjsp_load_shift_and_1rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
+         fix0             = _fjsp_setzero_v2r8();
+         fiy0             = _fjsp_setzero_v2r8();
+         fiz0             = _fjsp_setzero_v2r8();
+         /* Load parameters for i particles */
+         iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_load1_v2r8(charge+inr+0));
+         vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+         /* Start inner kernel loop: two j particles (A and B) are processed per
+          * iteration; a single remaining j particle is left for the block
+          * that follows the loop. */
+         for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+         {
+             /* Get j neighbor index, and coordinate index */
+             jnrA             = jjnr[jidx];
+             jnrB             = jjnr[jidx+1];
+             j_coord_offsetA  = DIM*jnrA;
+             j_coord_offsetB  = DIM*jnrB;
+             /* load j atom coordinates */
+             gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                               &jx0,&jy0,&jz0);
+             /* Calculate displacement vector */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             /* Calculate squared distance and things based on it */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+             rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+             /* Load parameters for j particles */
+             jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+             vdwjidx0A        = 2*vdwtype[jnrA+0];
+             vdwjidx0B        = 2*vdwtype[jnrB+0];
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2)) /* judging by the helper name: skip the work unless some lane is inside the cutoff */
+             {
+             r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+             /* Compute parameters for interactions between i and j atoms */
+             qq00             = _fjsp_mul_v2r8(iq0,jq0);
+             gmx_fjsp_load_2pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,
+                                          vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r00,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp); /* spill the indices so each lane can be used as a scalar offset */
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] ); /* table is addressed with a stride of 4 reals per index */
+             ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq00,_fjsp_sub_v2r8(rinv00,velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,rinv00),_fjsp_sub_v2r8(rinvsq00,felec));
+             /* LENNARD-JONES DISPERSION/REPULSION */
+             rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+             vvdw6            = _fjsp_mul_v2r8(c6_00,rinvsix);
+             vvdw12           = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
+             vvdw             = _fjsp_msub_v2r8( vvdw12,one_twelfth, _fjsp_mul_v2r8(vvdw6,one_sixth) );
+             fvdw             = _fjsp_mul_v2r8(_fjsp_sub_v2r8(vvdw12,vvdw6),rinvsq00);
+             d                = _fjsp_sub_v2r8(r00,rswitch);
+             d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8()); /* clamp at zero for distances below rswitch */
+             d2               = _fjsp_mul_v2r8(d,d);
+             sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+             dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+             /* Evaluate switch function. velec and vvdw are computed above even
+              * though this kernel outputs forces only, because the switched
+              * force expressions below read them. */
+             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+             felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv00,_fjsp_mul_v2r8(velec,dsw)) );
+             fvdw             = _fjsp_msub_v2r8( fvdw,sw , _fjsp_mul_v2r8(rinv00,_fjsp_mul_v2r8(vvdw,dsw)) );
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2); /* per-lane rsq < rcutoff^2 test, applied to fscal with the AND below */
+             fscal            = _fjsp_add_v2r8(felec,fvdw);
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             
+             gmx_fjsp_decrement_fma_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fscal,dx00,dy00,dz00);
+             }
+             /* Inner loop uses 80 flops */
+         }
+         if(jidx<j_index_end) /* exactly one j particle is left over after the pairwise loop */
+         {
+             jnrA             = jjnr[jidx];
+             j_coord_offsetA  = DIM*jnrA;
+             /* load j atom coordinates */
+             gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                               &jx0,&jy0,&jz0);
+             /* Calculate displacement vector */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             /* Calculate squared distance and things based on it */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+             rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+             /* Load parameters for j particles */
+             jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0); /* only particle A's charge is loaded; a zero vector supplies the rest */
+             vdwjidx0A        = 2*vdwtype[jnrA+0];
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+             {
+             r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+             /* Compute parameters for interactions between i and j atoms */
+             qq00             = _fjsp_mul_v2r8(iq0,jq0);
+             gmx_fjsp_load_1pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r00,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_setzero_v2r8(); /* no particle B here: zeros stand in for its table entry */
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq00,_fjsp_sub_v2r8(rinv00,velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,rinv00),_fjsp_sub_v2r8(rinvsq00,felec));
+             /* LENNARD-JONES DISPERSION/REPULSION */
+             rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+             vvdw6            = _fjsp_mul_v2r8(c6_00,rinvsix);
+             vvdw12           = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
+             vvdw             = _fjsp_msub_v2r8( vvdw12,one_twelfth, _fjsp_mul_v2r8(vvdw6,one_sixth) );
+             fvdw             = _fjsp_mul_v2r8(_fjsp_sub_v2r8(vvdw12,vvdw6),rinvsq00);
+             d                = _fjsp_sub_v2r8(r00,rswitch);
+             d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+             d2               = _fjsp_mul_v2r8(d,d);
+             sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+             dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+             /* Evaluate switch function */
+             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+             felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv00,_fjsp_mul_v2r8(velec,dsw)) );
+             fvdw             = _fjsp_msub_v2r8( fvdw,sw , _fjsp_mul_v2r8(rinv00,_fjsp_mul_v2r8(vvdw,dsw)) );
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+             fscal            = _fjsp_add_v2r8(felec,fvdw);
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8()); /* presumably keeps lane A and zeroes the unused lane so it adds no force - TODO confirm */
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             
+             gmx_fjsp_decrement_fma_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fscal,dx00,dy00,dz00);
+             }
+             /* Inner loop uses 80 flops */
+         }
+         /* End of innermost loop */
+         gmx_fjsp_update_iforce_1atom_swizzle_v2r8(fix0,fiy0,fiz0,
+                                               f+i_coord_offset,fshift+i_shift_offset);
+         /* Increment number of inner iterations */
+         inneriter                  += j_index_end - j_index_start;
+         /* Outer loop uses 7 flops */
+     }
+     /* Increment number of outer iterations */
+     outeriter        += nri;
+     /* Update outer/inner flops */
+     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_F,outeriter*7 + inneriter*80);
+ }
index 0000000000000000000000000000000000000000,9fd2aff6ecfd14bb8c14a4aec12ff02c6119b840..9fd2aff6ecfd14bb8c14a4aec12ff02c6119b840
mode 000000,100644..100644
--- /dev/null
@@@ -1,0 -1,1365 +1,1365 @@@
+ /*
+  * This file is part of the GROMACS molecular simulation package.
+  *
+  * Copyright (c) 2012, by the GROMACS development team, led by
+  * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+  * others, as listed in the AUTHORS file in the top-level source
+  * directory and at http://www.gromacs.org.
+  *
+  * GROMACS is free software; you can redistribute it and/or
+  * modify it under the terms of the GNU Lesser General Public License
+  * as published by the Free Software Foundation; either version 2.1
+  * of the License, or (at your option) any later version.
+  *
+  * GROMACS is distributed in the hope that it will be useful,
+  * but WITHOUT ANY WARRANTY; without even the implied warranty of
+  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  * Lesser General Public License for more details.
+  *
+  * You should have received a copy of the GNU Lesser General Public
+  * License along with GROMACS; if not, see
+  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+  *
+  * If you want to redistribute modifications to GROMACS, please
+  * consider that scientific software is very special. Version
+  * control is crucial - bugs must be traceable. We will be happy to
+  * consider code for inclusion in the official distribution, but
+  * derived work must not be called official GROMACS. Details are found
+  * in the README & COPYING files - if they are missing, get the
+  * official version at http://www.gromacs.org.
+  *
+  * To help us fund GROMACS development, we humbly ask that you cite
+  * the research papers on the package. Check out http://www.gromacs.org.
+  */
+ /*
+  * Note: this file was generated by the GROMACS sparc64_hpc_ace_double kernel generator.
+  */
+ #ifdef HAVE_CONFIG_H
+ #include <config.h>
+ #endif
+ #include <math.h>
+ #include "../nb_kernel.h"
+ #include "types/simple.h"
+ #include "vec.h"
+ #include "nrnb.h"
+ #include "kernelutil_sparc64_hpc_ace_double.h"
+ /*
+  * Gromacs nonbonded kernel:   nb_kernel_ElecEwSw_VdwLJSw_GeomW3P1_VF_sparc64_hpc_ace_double
+  * Electrostatics interaction: Ewald
+  * VdW interaction:            LennardJones
+  * Geometry:                   Water3-Particle
+  * Calculate force/pot:        PotentialAndForce
+  */
+ void
+ nb_kernel_ElecEwSw_VdwLJSw_GeomW3P1_VF_sparc64_hpc_ace_double
+                     (t_nblist * gmx_restrict                nlist,
+                      rvec * gmx_restrict                    xx,
+                      rvec * gmx_restrict                    ff,
+                      t_forcerec * gmx_restrict              fr,
+                      t_mdatoms * gmx_restrict               mdatoms,
+                      nb_kernel_data_t * gmx_restrict        kernel_data,
+                      t_nrnb * gmx_restrict                  nrnb)
+ {
+     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+      * just 0 for non-waters.
+      * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+      * jnr indices corresponding to data put in the two positions in the SIMD register.
+      */
+     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+     int              jnrA,jnrB;
+     int              j_coord_offsetA,j_coord_offsetB;
+     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+     real             rcutoff_scalar;
+     real             *shiftvec,*fshift,*x,*f;
+     _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+     int              vdwioffset0;
+     _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+     int              vdwioffset1;
+     _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+     int              vdwioffset2;
+     _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+     int              vdwjidx0A,vdwjidx0B;
+     _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+     _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+     _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+     _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+     _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+     real             *charge;
+     int              nvdwtype;
+     _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+     int              *vdwtype;
+     real             *vdwparam;
+     _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+     _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+     _fjsp_v2r8       ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+     real             *ewtab;
+     _fjsp_v2r8       rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
+     real             rswitch_scalar,d_scalar;
+     _fjsp_v2r8       itab_tmp;
+     _fjsp_v2r8       dummy_mask,cutoff_mask;
+     _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+     _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+     union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+     x                = xx[0];
+     f                = ff[0];
+     nri              = nlist->nri;
+     iinr             = nlist->iinr;
+     jindex           = nlist->jindex;
+     jjnr             = nlist->jjnr;
+     shiftidx         = nlist->shift;
+     gid              = nlist->gid;
+     shiftvec         = fr->shift_vec[0];
+     fshift           = fr->fshift[0];
+     facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+     charge           = mdatoms->chargeA;
+     nvdwtype         = fr->ntype;
+     vdwparam         = fr->nbfp;
+     vdwtype          = mdatoms->typeA;
+     sh_ewald         = gmx_fjsp_set1_v2r8(fr->ic->sh_ewald);
+     ewtab            = fr->ic->tabq_coul_FDV0;
+     ewtabscale       = gmx_fjsp_set1_v2r8(fr->ic->tabq_scale);
+     ewtabhalfspace   = gmx_fjsp_set1_v2r8(0.5/fr->ic->tabq_scale);
+     /* Setup water-specific parameters */
+     inr              = nlist->iinr[0];
+     iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+0]));
+     iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+     iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+     vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+     /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+     rcutoff_scalar   = fr->rcoulomb;
+     rcutoff          = gmx_fjsp_set1_v2r8(rcutoff_scalar);
+     rcutoff2         = _fjsp_mul_v2r8(rcutoff,rcutoff);
+     rswitch_scalar   = fr->rcoulomb_switch;
+     rswitch          = gmx_fjsp_set1_v2r8(rswitch_scalar);
+     /* Setup switch parameters */
+     d_scalar         = rcutoff_scalar-rswitch_scalar;
+     d                = gmx_fjsp_set1_v2r8(d_scalar);
+     swV3             = gmx_fjsp_set1_v2r8(-10.0/(d_scalar*d_scalar*d_scalar));
+     swV4             = gmx_fjsp_set1_v2r8( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
+     swV5             = gmx_fjsp_set1_v2r8( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
+     swF2             = gmx_fjsp_set1_v2r8(-30.0/(d_scalar*d_scalar*d_scalar));
+     swF3             = gmx_fjsp_set1_v2r8( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
+     swF4             = gmx_fjsp_set1_v2r8(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
+     /* Avoid stupid compiler warnings */
+     jnrA = jnrB = 0;
+     j_coord_offsetA = 0;
+     j_coord_offsetB = 0;
+     outeriter        = 0;
+     inneriter        = 0;
+     /* Start outer loop over neighborlists */
+     for(iidx=0; iidx<nri; iidx++)
+     {
+         /* Load shift vector for this list */
+         i_shift_offset   = DIM*shiftidx[iidx];
+         /* Load limits for loop over neighbors */
+         j_index_start    = jindex[iidx];
+         j_index_end      = jindex[iidx+1];
+         /* Get outer coordinate index */
+         inr              = iinr[iidx];
+         i_coord_offset   = DIM*inr;
+         /* Load i particle coords and add shift vector */
+         gmx_fjsp_load_shift_and_3rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                  &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+         fix0             = _fjsp_setzero_v2r8();
+         fiy0             = _fjsp_setzero_v2r8();
+         fiz0             = _fjsp_setzero_v2r8();
+         fix1             = _fjsp_setzero_v2r8();
+         fiy1             = _fjsp_setzero_v2r8();
+         fiz1             = _fjsp_setzero_v2r8();
+         fix2             = _fjsp_setzero_v2r8();
+         fiy2             = _fjsp_setzero_v2r8();
+         fiz2             = _fjsp_setzero_v2r8();
+         /* Reset potential sums */
+         velecsum         = _fjsp_setzero_v2r8();
+         vvdwsum          = _fjsp_setzero_v2r8();
+         /* Start inner kernel loop */
+         for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+         {
+             /* Get j neighbor index, and coordinate index */
+             jnrA             = jjnr[jidx];
+             jnrB             = jjnr[jidx+1];
+             j_coord_offsetA  = DIM*jnrA;
+             j_coord_offsetB  = DIM*jnrB;
+             /* load j atom coordinates */
+             gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                               &jx0,&jy0,&jz0);
+             /* Calculate displacement vector */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             dx10             = _fjsp_sub_v2r8(ix1,jx0);
+             dy10             = _fjsp_sub_v2r8(iy1,jy0);
+             dz10             = _fjsp_sub_v2r8(iz1,jz0);
+             dx20             = _fjsp_sub_v2r8(ix2,jx0);
+             dy20             = _fjsp_sub_v2r8(iy2,jy0);
+             dz20             = _fjsp_sub_v2r8(iz2,jz0);
+             /* Calculate squared distance and things based on it */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+             rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+             rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+             rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+             rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+             rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+             rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+             rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+             /* Load parameters for j particles */
+             jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+             vdwjidx0A        = 2*vdwtype[jnrA+0];
+             vdwjidx0B        = 2*vdwtype[jnrB+0];
+             fjx0             = _fjsp_setzero_v2r8();
+             fjy0             = _fjsp_setzero_v2r8();
+             fjz0             = _fjsp_setzero_v2r8();
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+             {
+             r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+             /* Compute parameters for interactions between i and j atoms */
+             qq00             = _fjsp_mul_v2r8(iq0,jq0);
+             gmx_fjsp_load_2pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,
+                                          vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r00,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq00,_fjsp_sub_v2r8(rinv00,velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,rinv00),_fjsp_sub_v2r8(rinvsq00,felec));
+             /* LENNARD-JONES DISPERSION/REPULSION */
+             rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+             vvdw6            = _fjsp_mul_v2r8(c6_00,rinvsix);
+             vvdw12           = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
+             vvdw             = _fjsp_msub_v2r8( vvdw12,one_twelfth, _fjsp_mul_v2r8(vvdw6,one_sixth) );
+             fvdw             = _fjsp_mul_v2r8(_fjsp_sub_v2r8(vvdw12,vvdw6),rinvsq00);
+             d                = _fjsp_sub_v2r8(r00,rswitch);
+             d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+             d2               = _fjsp_mul_v2r8(d,d);
+             sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+             dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+             /* Evaluate switch function */
+             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+             felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv00,_fjsp_mul_v2r8(velec,dsw)) );
+             fvdw             = _fjsp_msub_v2r8( fvdw,sw , _fjsp_mul_v2r8(rinv00,_fjsp_mul_v2r8(vvdw,dsw)) );
+             velec            = _fjsp_mul_v2r8(velec,sw);
+             vvdw             = _fjsp_mul_v2r8(vvdw,sw);
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             vvdw             = _fjsp_and_v2r8(vvdw,cutoff_mask);
+             vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+             fscal            = _fjsp_add_v2r8(felec,fvdw);
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             
+             fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq10,rcutoff2))
+             {
+             r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+             /* Compute parameters for interactions between i and j atoms */
+             qq10             = _fjsp_mul_v2r8(iq1,jq0);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r10,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq10,_fjsp_sub_v2r8(rinv10,velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,rinv10),_fjsp_sub_v2r8(rinvsq10,felec));
+             d                = _fjsp_sub_v2r8(r10,rswitch);
+             d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+             d2               = _fjsp_mul_v2r8(d,d);
+             sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+             dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+             /* Evaluate switch function */
+             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+             felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv10,_fjsp_mul_v2r8(velec,dsw)) );
+             velec            = _fjsp_mul_v2r8(velec,sw);
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq10,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+             
+             fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq20,rcutoff2))
+             {
+             r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+             /* Compute parameters for interactions between i and j atoms */
+             qq20             = _fjsp_mul_v2r8(iq2,jq0);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r20,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq20,_fjsp_sub_v2r8(rinv20,velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,rinv20),_fjsp_sub_v2r8(rinvsq20,felec));
+             d                = _fjsp_sub_v2r8(r20,rswitch);
+             d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+             d2               = _fjsp_mul_v2r8(d,d);
+             sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+             dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+             /* Evaluate switch function */
+             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+             felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv20,_fjsp_mul_v2r8(velec,dsw)) );
+             velec            = _fjsp_mul_v2r8(velec,sw);
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq20,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+             
+             fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+             }
+             gmx_fjsp_decrement_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0);
+             /* Inner loop uses 225 flops */
+         }
+         if(jidx<j_index_end)
+         {
+             jnrA             = jjnr[jidx];
+             j_coord_offsetA  = DIM*jnrA;
+             /* load j atom coordinates */
+             gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                               &jx0,&jy0,&jz0);
+             /* Calculate displacement vector */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             dx10             = _fjsp_sub_v2r8(ix1,jx0);
+             dy10             = _fjsp_sub_v2r8(iy1,jy0);
+             dz10             = _fjsp_sub_v2r8(iz1,jz0);
+             dx20             = _fjsp_sub_v2r8(ix2,jx0);
+             dy20             = _fjsp_sub_v2r8(iy2,jy0);
+             dz20             = _fjsp_sub_v2r8(iz2,jz0);
+             /* Calculate squared distance and things based on it */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+             rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+             rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+             rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+             rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+             rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+             rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+             rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+             /* Load parameters for j particles */
+             jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
+             vdwjidx0A        = 2*vdwtype[jnrA+0];
+             fjx0             = _fjsp_setzero_v2r8();
+             fjy0             = _fjsp_setzero_v2r8();
+             fjz0             = _fjsp_setzero_v2r8();
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+             {
+             r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+             /* Compute parameters for interactions between i and j atoms */
+             qq00             = _fjsp_mul_v2r8(iq0,jq0);
+             gmx_fjsp_load_1pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r00,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq00,_fjsp_sub_v2r8(rinv00,velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,rinv00),_fjsp_sub_v2r8(rinvsq00,felec));
+             /* LENNARD-JONES DISPERSION/REPULSION */
+             rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+             vvdw6            = _fjsp_mul_v2r8(c6_00,rinvsix);
+             vvdw12           = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
+             vvdw             = _fjsp_msub_v2r8( vvdw12,one_twelfth, _fjsp_mul_v2r8(vvdw6,one_sixth) );
+             fvdw             = _fjsp_mul_v2r8(_fjsp_sub_v2r8(vvdw12,vvdw6),rinvsq00);
+             d                = _fjsp_sub_v2r8(r00,rswitch);
+             d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+             d2               = _fjsp_mul_v2r8(d,d);
+             sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+             dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+             /* Evaluate switch function */
+             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+             felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv00,_fjsp_mul_v2r8(velec,dsw)) );
+             fvdw             = _fjsp_msub_v2r8( fvdw,sw , _fjsp_mul_v2r8(rinv00,_fjsp_mul_v2r8(vvdw,dsw)) );
+             velec            = _fjsp_mul_v2r8(velec,sw);
+             vvdw             = _fjsp_mul_v2r8(vvdw,sw);
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             vvdw             = _fjsp_and_v2r8(vvdw,cutoff_mask);
+             vvdw             = _fjsp_unpacklo_v2r8(vvdw,_fjsp_setzero_v2r8());
+             vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+             fscal            = _fjsp_add_v2r8(felec,fvdw);
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             
+             fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq10,rcutoff2))
+             {
+             r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+             /* Compute parameters for interactions between i and j atoms */
+             qq10             = _fjsp_mul_v2r8(iq1,jq0);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r10,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq10,_fjsp_sub_v2r8(rinv10,velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,rinv10),_fjsp_sub_v2r8(rinvsq10,felec));
+             d                = _fjsp_sub_v2r8(r10,rswitch);
+             d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+             d2               = _fjsp_mul_v2r8(d,d);
+             sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+             dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+             /* Evaluate switch function */
+             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+             felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv10,_fjsp_mul_v2r8(velec,dsw)) );
+             velec            = _fjsp_mul_v2r8(velec,sw);
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq10,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+             
+             fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq20,rcutoff2))
+             {
+             r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+             /* Compute parameters for interactions between i and j atoms */
+             qq20             = _fjsp_mul_v2r8(iq2,jq0);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r20,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq20,_fjsp_sub_v2r8(rinv20,velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,rinv20),_fjsp_sub_v2r8(rinvsq20,felec));
+             d                = _fjsp_sub_v2r8(r20,rswitch);
+             d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+             d2               = _fjsp_mul_v2r8(d,d);
+             sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+             dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+             /* Evaluate switch function */
+             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+             felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv20,_fjsp_mul_v2r8(velec,dsw)) );
+             velec            = _fjsp_mul_v2r8(velec,sw);
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq20,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+             
+             fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+             }
+             gmx_fjsp_decrement_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0);
+             /* Inner loop uses 225 flops */
+         }
+         /* End of innermost loop */
+         gmx_fjsp_update_iforce_3atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                               f+i_coord_offset,fshift+i_shift_offset);
+         ggid                        = gid[iidx];
+         /* Update potential energies */
+         gmx_fjsp_update_1pot_v2r8(velecsum,kernel_data->energygrp_elec+ggid);
+         gmx_fjsp_update_1pot_v2r8(vvdwsum,kernel_data->energygrp_vdw+ggid);
+         /* Increment number of inner iterations */
+         inneriter                  += j_index_end - j_index_start;
+         /* Outer loop uses 20 flops */
+     }
+     /* Increment number of outer iterations */
+     outeriter        += nri;
+     /* Update outer/inner flops */
+     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3_VF,outeriter*20 + inneriter*225);
+ }
+ /*
+  * Gromacs nonbonded kernel:   nb_kernel_ElecEwSw_VdwLJSw_GeomW3P1_F_sparc64_hpc_ace_double
+  * Electrostatics interaction: Ewald
+  * VdW interaction:            LennardJones
+  * Geometry:                   Water3-Particle
+  * Calculate force/pot:        Force
+  *
+  * Force-only kernel. For every entry of the neighbor list it loads three
+  * i atoms (suffixes 0,1,2) and loops over single j atoms, two per SIMD
+  * iteration (A,B) plus a one-atom remainder block. i atom 0 gets tabulated
+  * Ewald electrostatics and Lennard-Jones; i atoms 1 and 2 get Ewald
+  * electrostatics only. Each force is combined with the switch polynomials
+  * sw/dsw evaluated at d=max(r-rswitch,0) and then masked with the result of
+  * comparing rsq against rcutoff2. No energy terms are accumulated here.
+  *
+  * Parameters:
+  *   nlist       - neighbor list; nri, iinr, jindex, jjnr, shift and gid are read
+  *                 (gid is fetched into a local that this kernel never uses).
+  *   xx          - coordinates, read through x = xx[0].
+  *   ff          - forces, updated through f = ff[0].
+  *   fr          - force record; shift_vec, fshift, epsfac, ntype, nbfp, rcoulomb,
+  *                 rcoulomb_switch and the ic table fields (sh_ewald,
+  *                 tabq_coul_FDV0, tabq_scale) are read.
+  *   mdatoms     - chargeA and typeA are read.
+  *   kernel_data - not referenced in this force-only kernel.
+  *   nrnb        - flop accounting, updated through inc_nrnb() at the end.
+  */
+ void
+ nb_kernel_ElecEwSw_VdwLJSw_GeomW3P1_F_sparc64_hpc_ace_double
+                     (t_nblist * gmx_restrict                nlist,
+                      rvec * gmx_restrict                    xx,
+                      rvec * gmx_restrict                    ff,
+                      t_forcerec * gmx_restrict              fr,
+                      t_mdatoms * gmx_restrict               mdatoms,
+                      nb_kernel_data_t * gmx_restrict        kernel_data,
+                      t_nrnb * gmx_restrict                  nrnb)
+ {
+     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+      * just 0 for non-waters.
+      * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+      * jnr indices corresponding to data put in the two positions in the SIMD register.
+      * In this kernel the i water uses suffixes 0,1,2 and the j particle uses suffix 0, so a
+      * two-digit suffix such as 10 denotes the pair (i atom 1, j atom 0).
+      * Several of the locals declared below (e.g. tx, ty, tz, jidxall) are not referenced
+      * anywhere in this kernel.
+      */
+     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+     int              jnrA,jnrB;
+     int              j_coord_offsetA,j_coord_offsetB;
+     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+     real             rcutoff_scalar;
+     real             *shiftvec,*fshift,*x,*f;
+     _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+     int              vdwioffset0;
+     _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+     int              vdwioffset1;
+     _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+     int              vdwioffset2;
+     _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+     int              vdwjidx0A,vdwjidx0B;
+     _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+     _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+     _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+     _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+     _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+     real             *charge;
+     int              nvdwtype;
+     _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+     int              *vdwtype;
+     real             *vdwparam;
+     _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+     _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+     _fjsp_v2r8       ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+     real             *ewtab;
+     _fjsp_v2r8       rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
+     real             rswitch_scalar,d_scalar;
+     _fjsp_v2r8       itab_tmp;
+     _fjsp_v2r8       dummy_mask,cutoff_mask;
+     _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+     _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+     union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+     x                = xx[0];
+     f                = ff[0];
+     nri              = nlist->nri;
+     iinr             = nlist->iinr;
+     jindex           = nlist->jindex;
+     jjnr             = nlist->jjnr;
+     shiftidx         = nlist->shift;
+     gid              = nlist->gid;
+     shiftvec         = fr->shift_vec[0];
+     fshift           = fr->fshift[0];
+     facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+     charge           = mdatoms->chargeA;
+     nvdwtype         = fr->ntype;
+     vdwparam         = fr->nbfp;
+     vdwtype          = mdatoms->typeA;
+     sh_ewald         = gmx_fjsp_set1_v2r8(fr->ic->sh_ewald);
+     ewtab            = fr->ic->tabq_coul_FDV0;
+     ewtabscale       = gmx_fjsp_set1_v2r8(fr->ic->tabq_scale);
+     ewtabhalfspace   = gmx_fjsp_set1_v2r8(0.5/fr->ic->tabq_scale);
+     /* Setup water-specific parameters.
+      * The three charges (pre-multiplied by facel) and the VdW parameter offset of atom 0
+      * are read once, from the first i entry of the list, and reused for every i water
+      * in the outer loop. This presumably relies on all waters in one list sharing the
+      * same parameters - TODO confirm against the neighbor list construction.
+      * NOTE(review): nlist->iinr[0] is read here without checking nri.
+      */
+     inr              = nlist->iinr[0];
+     iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+0]));
+     iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+     iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+     vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+     /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+     rcutoff_scalar   = fr->rcoulomb;
+     rcutoff          = gmx_fjsp_set1_v2r8(rcutoff_scalar);
+     rcutoff2         = _fjsp_mul_v2r8(rcutoff,rcutoff);
+     rswitch_scalar   = fr->rcoulomb_switch;
+     rswitch          = gmx_fjsp_set1_v2r8(rswitch_scalar);
+     /* Setup switch parameters.
+      * With D = rcutoff - rswitch (d_scalar) the coefficients used by the sw and dsw
+      * polynomials in the loops are
+      *   swV3 = -10/D^3, swV4 =  15/D^4, swV5 =  -6/D^5
+      *   swF2 = -30/D^3, swF3 =  60/D^4, swF4 = -30/D^5
+      * The SIMD variable d set here is reassigned inside the loops before it is read.
+      */
+     d_scalar         = rcutoff_scalar-rswitch_scalar;
+     d                = gmx_fjsp_set1_v2r8(d_scalar);
+     swV3             = gmx_fjsp_set1_v2r8(-10.0/(d_scalar*d_scalar*d_scalar));
+     swV4             = gmx_fjsp_set1_v2r8( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
+     swV5             = gmx_fjsp_set1_v2r8( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
+     swF2             = gmx_fjsp_set1_v2r8(-30.0/(d_scalar*d_scalar*d_scalar));
+     swF3             = gmx_fjsp_set1_v2r8( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
+     swF4             = gmx_fjsp_set1_v2r8(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
+     /* Give the j indices and offsets a defined value up front to avoid compiler warnings */
+     jnrA = jnrB = 0;
+     j_coord_offsetA = 0;
+     j_coord_offsetB = 0;
+     outeriter        = 0;
+     inneriter        = 0;
+     /* Start outer loop over neighborlists */
+     for(iidx=0; iidx<nri; iidx++)
+     {
+         /* Load shift vector for this list */
+         i_shift_offset   = DIM*shiftidx[iidx];
+         /* Load limits for loop over neighbors */
+         j_index_start    = jindex[iidx];
+         j_index_end      = jindex[iidx+1];
+         /* Get outer coordinate index */
+         inr              = iinr[iidx];
+         i_coord_offset   = DIM*inr;
+         /* Load i particle coords and add shift vector */
+         gmx_fjsp_load_shift_and_3rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                  &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+         fix0             = _fjsp_setzero_v2r8();
+         fiy0             = _fjsp_setzero_v2r8();
+         fiz0             = _fjsp_setzero_v2r8();
+         fix1             = _fjsp_setzero_v2r8();
+         fiy1             = _fjsp_setzero_v2r8();
+         fiz1             = _fjsp_setzero_v2r8();
+         fix2             = _fjsp_setzero_v2r8();
+         fiy2             = _fjsp_setzero_v2r8();
+         fiz2             = _fjsp_setzero_v2r8();
+         /* Start inner kernel loop: j atoms are handled in pairs (A,B); a trailing
+          * odd j atom, if any, is handled by the block after this loop.
+          */
+         for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+         {
+             /* Get j neighbor index, and coordinate index */
+             jnrA             = jjnr[jidx];
+             jnrB             = jjnr[jidx+1];
+             j_coord_offsetA  = DIM*jnrA;
+             j_coord_offsetB  = DIM*jnrB;
+             /* load j atom coordinates */
+             gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                               &jx0,&jy0,&jz0);
+             /* Calculate displacement vector */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             dx10             = _fjsp_sub_v2r8(ix1,jx0);
+             dy10             = _fjsp_sub_v2r8(iy1,jy0);
+             dz10             = _fjsp_sub_v2r8(iz1,jz0);
+             dx20             = _fjsp_sub_v2r8(ix2,jx0);
+             dy20             = _fjsp_sub_v2r8(iy2,jy0);
+             dz20             = _fjsp_sub_v2r8(iz2,jz0);
+             /* Calculate squared distance and things based on it */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+             rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+             rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+             rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+             rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+             rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+             rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+             rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+             /* Load parameters for j particles */
+             jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+             vdwjidx0A        = 2*vdwtype[jnrA+0];
+             vdwjidx0B        = 2*vdwtype[jnrB+0];
+             fjx0             = _fjsp_setzero_v2r8();
+             fjy0             = _fjsp_setzero_v2r8();
+             fjz0             = _fjsp_setzero_v2r8();
+             /**************************
+              * CALCULATE INTERACTIONS *
+              * i atom 0 with j atom 0 *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+             {
+             r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+             /* Compute parameters for interactions between i and j atoms */
+             qq00             = _fjsp_mul_v2r8(iq0,jq0);
+             gmx_fjsp_load_2pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,
+                                          vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r00,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq00,_fjsp_sub_v2r8(rinv00,velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,rinv00),_fjsp_sub_v2r8(rinvsq00,felec));
+             /* LENNARD-JONES DISPERSION/REPULSION */
+             rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+             vvdw6            = _fjsp_mul_v2r8(c6_00,rinvsix);
+             vvdw12           = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
+             vvdw             = _fjsp_msub_v2r8( vvdw12,one_twelfth, _fjsp_mul_v2r8(vvdw6,one_sixth) );
+             fvdw             = _fjsp_mul_v2r8(_fjsp_sub_v2r8(vvdw12,vvdw6),rinvsq00);
+             d                = _fjsp_sub_v2r8(r00,rswitch);
+             d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+             d2               = _fjsp_mul_v2r8(d,d);
+             sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+             dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+             /* Evaluate switch function */
+             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+             felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv00,_fjsp_mul_v2r8(velec,dsw)) );
+             fvdw             = _fjsp_msub_v2r8( fvdw,sw , _fjsp_mul_v2r8(rinv00,_fjsp_mul_v2r8(vvdw,dsw)) );
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+             fscal            = _fjsp_add_v2r8(felec,fvdw);
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             
+             fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              * i atom 1 with j atom 0 *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq10,rcutoff2))
+             {
+             r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+             /* Compute parameters for interactions between i and j atoms */
+             qq10             = _fjsp_mul_v2r8(iq1,jq0);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r10,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq10,_fjsp_sub_v2r8(rinv10,velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,rinv10),_fjsp_sub_v2r8(rinvsq10,felec));
+             d                = _fjsp_sub_v2r8(r10,rswitch);
+             d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+             d2               = _fjsp_mul_v2r8(d,d);
+             sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+             dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+             /* Evaluate switch function */
+             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+             felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv10,_fjsp_mul_v2r8(velec,dsw)) );
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq10,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+             
+             fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              * i atom 2 with j atom 0 *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq20,rcutoff2))
+             {
+             r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+             /* Compute parameters for interactions between i and j atoms */
+             qq20             = _fjsp_mul_v2r8(iq2,jq0);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r20,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq20,_fjsp_sub_v2r8(rinv20,velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,rinv20),_fjsp_sub_v2r8(rinvsq20,felec));
+             d                = _fjsp_sub_v2r8(r20,rswitch);
+             d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+             d2               = _fjsp_mul_v2r8(d,d);
+             sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+             dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+             /* Evaluate switch function */
+             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+             felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv20,_fjsp_mul_v2r8(velec,dsw)) );
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq20,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+             
+             fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+             }
+             gmx_fjsp_decrement_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0);
+             /* Inner loop uses 213 flops */
+         }
+         if(jidx<j_index_end) /* odd number of j atoms: exactly one is left over */
+         {
+             jnrA             = jjnr[jidx];
+             j_coord_offsetA  = DIM*jnrA;
+             /* load j atom coordinates (single-pointer variant, only jnrA is valid here) */
+             gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                               &jx0,&jy0,&jz0);
+             /* Calculate displacement vector */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             dx10             = _fjsp_sub_v2r8(ix1,jx0);
+             dy10             = _fjsp_sub_v2r8(iy1,jy0);
+             dz10             = _fjsp_sub_v2r8(iz1,jz0);
+             dx20             = _fjsp_sub_v2r8(ix2,jx0);
+             dy20             = _fjsp_sub_v2r8(iy2,jy0);
+             dz20             = _fjsp_sub_v2r8(iz2,jz0);
+             /* Calculate squared distance and things based on it */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+             rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+             rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+             rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+             rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+             rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+             rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+             rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+             /* Load parameters for j particles */
+             jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
+             vdwjidx0A        = 2*vdwtype[jnrA+0];
+             fjx0             = _fjsp_setzero_v2r8();
+             fjy0             = _fjsp_setzero_v2r8();
+             fjz0             = _fjsp_setzero_v2r8();
+             /**************************
+              * CALCULATE INTERACTIONS *
+              * i atom 0 with j atom 0 *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+             {
+             r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+             /* Compute parameters for interactions between i and j atoms */
+             qq00             = _fjsp_mul_v2r8(iq0,jq0);
+             gmx_fjsp_load_1pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r00,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq00,_fjsp_sub_v2r8(rinv00,velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,rinv00),_fjsp_sub_v2r8(rinvsq00,felec));
+             /* LENNARD-JONES DISPERSION/REPULSION */
+             rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+             vvdw6            = _fjsp_mul_v2r8(c6_00,rinvsix);
+             vvdw12           = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
+             vvdw             = _fjsp_msub_v2r8( vvdw12,one_twelfth, _fjsp_mul_v2r8(vvdw6,one_sixth) );
+             fvdw             = _fjsp_mul_v2r8(_fjsp_sub_v2r8(vvdw12,vvdw6),rinvsq00);
+             d                = _fjsp_sub_v2r8(r00,rswitch);
+             d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+             d2               = _fjsp_mul_v2r8(d,d);
+             sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+             dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+             /* Evaluate switch function */
+             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+             felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv00,_fjsp_mul_v2r8(velec,dsw)) );
+             fvdw             = _fjsp_msub_v2r8( fvdw,sw , _fjsp_mul_v2r8(rinv00,_fjsp_mul_v2r8(vvdw,dsw)) );
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+             fscal            = _fjsp_add_v2r8(felec,fvdw);
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force (fscal was combined with a zero vector via unpacklo
+              * above, presumably to clear the lane that has no j atom - TODO confirm) */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             
+             fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              * i atom 1 with j atom 0 *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq10,rcutoff2))
+             {
+             r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+             /* Compute parameters for interactions between i and j atoms */
+             qq10             = _fjsp_mul_v2r8(iq1,jq0);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r10,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq10,_fjsp_sub_v2r8(rinv10,velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,rinv10),_fjsp_sub_v2r8(rinvsq10,felec));
+             d                = _fjsp_sub_v2r8(r10,rswitch);
+             d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+             d2               = _fjsp_mul_v2r8(d,d);
+             sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+             dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+             /* Evaluate switch function */
+             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+             felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv10,_fjsp_mul_v2r8(velec,dsw)) );
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq10,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+             
+             fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              * i atom 2 with j atom 0 *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq20,rcutoff2))
+             {
+             r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+             /* Compute parameters for interactions between i and j atoms */
+             qq20             = _fjsp_mul_v2r8(iq2,jq0);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r20,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq20,_fjsp_sub_v2r8(rinv20,velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,rinv20),_fjsp_sub_v2r8(rinvsq20,felec));
+             d                = _fjsp_sub_v2r8(r20,rswitch);
+             d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+             d2               = _fjsp_mul_v2r8(d,d);
+             sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+             dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+             /* Evaluate switch function */
+             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+             felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv20,_fjsp_mul_v2r8(velec,dsw)) );
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq20,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+             
+             fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+             }
+             gmx_fjsp_decrement_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0);
+             /* Inner loop uses 213 flops */
+         }
+         /* End of innermost loop */
+         gmx_fjsp_update_iforce_3atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                               f+i_coord_offset,fshift+i_shift_offset);
+         /* Increment number of inner iterations */
+         inneriter                  += j_index_end - j_index_start;
+         /* Outer loop uses 18 flops */
+     }
+     /* Increment number of outer iterations */
+     outeriter        += nri;
+     /* Update outer/inner flops */
+     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3_F,outeriter*18 + inneriter*213);
+ }
index 0000000000000000000000000000000000000000,4f7cf4cccf40f808dca9ecf5a1489e4616d3ebef..4f7cf4cccf40f808dca9ecf5a1489e4616d3ebef
mode 000000,100644..100644
--- /dev/null
@@@ -1,0 -1,2935 +1,2935 @@@
+ /*
+  * This file is part of the GROMACS molecular simulation package.
+  *
+  * Copyright (c) 2012, by the GROMACS development team, led by
+  * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+  * others, as listed in the AUTHORS file in the top-level source
+  * directory and at http://www.gromacs.org.
+  *
+  * GROMACS is free software; you can redistribute it and/or
+  * modify it under the terms of the GNU Lesser General Public License
+  * as published by the Free Software Foundation; either version 2.1
+  * of the License, or (at your option) any later version.
+  *
+  * GROMACS is distributed in the hope that it will be useful,
+  * but WITHOUT ANY WARRANTY; without even the implied warranty of
+  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  * Lesser General Public License for more details.
+  *
+  * You should have received a copy of the GNU Lesser General Public
+  * License along with GROMACS; if not, see
+  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+  *
+  * If you want to redistribute modifications to GROMACS, please
+  * consider that scientific software is very special. Version
+  * control is crucial - bugs must be traceable. We will be happy to
+  * consider code for inclusion in the official distribution, but
+  * derived work must not be called official GROMACS. Details are found
+  * in the README & COPYING files - if they are missing, get the
+  * official version at http://www.gromacs.org.
+  *
+  * To help us fund GROMACS development, we humbly ask that you cite
+  * the research papers on the package. Check out http://www.gromacs.org.
+  */
+ /*
+  * Note: this file was generated by the GROMACS sparc64_hpc_ace_double kernel generator.
+  */
+ #ifdef HAVE_CONFIG_H
+ #include <config.h>
+ #endif
+ #include <math.h>
+ #include "../nb_kernel.h"
+ #include "types/simple.h"
+ #include "vec.h"
+ #include "nrnb.h"
+ #include "kernelutil_sparc64_hpc_ace_double.h"
+ /*
+  * Gromacs nonbonded kernel:   nb_kernel_ElecEwSw_VdwLJSw_GeomW3W3_VF_sparc64_hpc_ace_double
+  * Electrostatics interaction: Ewald
+  * VdW interaction:            LennardJones
+  * Geometry:                   Water3-Water3
+  * Calculate force/pot:        PotentialAndForce
+  */
+ void
+ nb_kernel_ElecEwSw_VdwLJSw_GeomW3W3_VF_sparc64_hpc_ace_double
+                     (t_nblist * gmx_restrict                nlist,
+                      rvec * gmx_restrict                    xx,
+                      rvec * gmx_restrict                    ff,
+                      t_forcerec * gmx_restrict              fr,
+                      t_mdatoms * gmx_restrict               mdatoms,
+                      nb_kernel_data_t * gmx_restrict        kernel_data,
+                      t_nrnb * gmx_restrict                  nrnb)
+ {
+     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+      * just 0 for non-waters.
+      * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+      * jnr indices corresponding to data put in the two positions in the SIMD register.
+      */
+     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+     int              jnrA,jnrB;
+     int              j_coord_offsetA,j_coord_offsetB;
+     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+     real             rcutoff_scalar;
+     real             *shiftvec,*fshift,*x,*f;
+     _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+     int              vdwioffset0;
+     _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+     int              vdwioffset1;
+     _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+     int              vdwioffset2;
+     _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+     int              vdwjidx0A,vdwjidx0B;
+     _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+     int              vdwjidx1A,vdwjidx1B;
+     _fjsp_v2r8       jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+     int              vdwjidx2A,vdwjidx2B;
+     _fjsp_v2r8       jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+     _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+     _fjsp_v2r8       dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
+     _fjsp_v2r8       dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
+     _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+     _fjsp_v2r8       dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+     _fjsp_v2r8       dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+     _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+     _fjsp_v2r8       dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+     _fjsp_v2r8       dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+     _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+     real             *charge;
+     int              nvdwtype;
+     _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+     int              *vdwtype;
+     real             *vdwparam;
+     _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+     _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+     _fjsp_v2r8       ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+     real             *ewtab;
+     _fjsp_v2r8       rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
+     real             rswitch_scalar,d_scalar;
+     _fjsp_v2r8       itab_tmp;
+     _fjsp_v2r8       dummy_mask,cutoff_mask;
+     _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+     _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+     union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+     x                = xx[0];
+     f                = ff[0];
+     nri              = nlist->nri;
+     iinr             = nlist->iinr;
+     jindex           = nlist->jindex;
+     jjnr             = nlist->jjnr;
+     shiftidx         = nlist->shift;
+     gid              = nlist->gid;
+     shiftvec         = fr->shift_vec[0];
+     fshift           = fr->fshift[0];
+     facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+     charge           = mdatoms->chargeA;
+     nvdwtype         = fr->ntype;
+     vdwparam         = fr->nbfp;
+     vdwtype          = mdatoms->typeA;
+     sh_ewald         = gmx_fjsp_set1_v2r8(fr->ic->sh_ewald);
+     ewtab            = fr->ic->tabq_coul_FDV0;
+     ewtabscale       = gmx_fjsp_set1_v2r8(fr->ic->tabq_scale);
+     ewtabhalfspace   = gmx_fjsp_set1_v2r8(0.5/fr->ic->tabq_scale);
+     /* Setup water-specific parameters */
+     inr              = nlist->iinr[0];
+     iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+0]));
+     iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+     iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+     vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+     jq0              = gmx_fjsp_set1_v2r8(charge[inr+0]);
+     jq1              = gmx_fjsp_set1_v2r8(charge[inr+1]);
+     jq2              = gmx_fjsp_set1_v2r8(charge[inr+2]);
+     vdwjidx0A        = 2*vdwtype[inr+0];
+     qq00             = _fjsp_mul_v2r8(iq0,jq0);
+     c6_00            = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset0+vdwjidx0A]);
+     c12_00           = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset0+vdwjidx0A+1]);
+     qq01             = _fjsp_mul_v2r8(iq0,jq1);
+     qq02             = _fjsp_mul_v2r8(iq0,jq2);
+     qq10             = _fjsp_mul_v2r8(iq1,jq0);
+     qq11             = _fjsp_mul_v2r8(iq1,jq1);
+     qq12             = _fjsp_mul_v2r8(iq1,jq2);
+     qq20             = _fjsp_mul_v2r8(iq2,jq0);
+     qq21             = _fjsp_mul_v2r8(iq2,jq1);
+     qq22             = _fjsp_mul_v2r8(iq2,jq2);
+     /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+     rcutoff_scalar   = fr->rcoulomb;
+     rcutoff          = gmx_fjsp_set1_v2r8(rcutoff_scalar);
+     rcutoff2         = _fjsp_mul_v2r8(rcutoff,rcutoff);
+     rswitch_scalar   = fr->rcoulomb_switch;
+     rswitch          = gmx_fjsp_set1_v2r8(rswitch_scalar);
+     /* Setup switch parameters */
+     d_scalar         = rcutoff_scalar-rswitch_scalar;
+     d                = gmx_fjsp_set1_v2r8(d_scalar);
+     swV3             = gmx_fjsp_set1_v2r8(-10.0/(d_scalar*d_scalar*d_scalar));
+     swV4             = gmx_fjsp_set1_v2r8( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
+     swV5             = gmx_fjsp_set1_v2r8( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
+     swF2             = gmx_fjsp_set1_v2r8(-30.0/(d_scalar*d_scalar*d_scalar));
+     swF3             = gmx_fjsp_set1_v2r8( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
+     swF4             = gmx_fjsp_set1_v2r8(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
+     /* Avoid stupid compiler warnings */
+     jnrA = jnrB = 0;
+     j_coord_offsetA = 0;
+     j_coord_offsetB = 0;
+     outeriter        = 0;
+     inneriter        = 0;
+     /* Start outer loop over neighborlists */
+     for(iidx=0; iidx<nri; iidx++)
+     {
+         /* Load shift vector for this list */
+         i_shift_offset   = DIM*shiftidx[iidx];
+         /* Load limits for loop over neighbors */
+         j_index_start    = jindex[iidx];
+         j_index_end      = jindex[iidx+1];
+         /* Get outer coordinate index */
+         inr              = iinr[iidx];
+         i_coord_offset   = DIM*inr;
+         /* Load i particle coords and add shift vector */
+         gmx_fjsp_load_shift_and_3rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                  &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+         fix0             = _fjsp_setzero_v2r8();
+         fiy0             = _fjsp_setzero_v2r8();
+         fiz0             = _fjsp_setzero_v2r8();
+         fix1             = _fjsp_setzero_v2r8();
+         fiy1             = _fjsp_setzero_v2r8();
+         fiz1             = _fjsp_setzero_v2r8();
+         fix2             = _fjsp_setzero_v2r8();
+         fiy2             = _fjsp_setzero_v2r8();
+         fiz2             = _fjsp_setzero_v2r8();
+         /* Reset potential sums */
+         velecsum         = _fjsp_setzero_v2r8();
+         vvdwsum          = _fjsp_setzero_v2r8();
+         /* Start inner kernel loop */
+         for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+         {
+             /* Get j neighbor index, and coordinate index */
+             jnrA             = jjnr[jidx];
+             jnrB             = jjnr[jidx+1];
+             j_coord_offsetA  = DIM*jnrA;
+             j_coord_offsetB  = DIM*jnrB;
+             /* load j atom coordinates */
+             gmx_fjsp_load_3rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+             /* Calculate displacement vector */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             dx01             = _fjsp_sub_v2r8(ix0,jx1);
+             dy01             = _fjsp_sub_v2r8(iy0,jy1);
+             dz01             = _fjsp_sub_v2r8(iz0,jz1);
+             dx02             = _fjsp_sub_v2r8(ix0,jx2);
+             dy02             = _fjsp_sub_v2r8(iy0,jy2);
+             dz02             = _fjsp_sub_v2r8(iz0,jz2);
+             dx10             = _fjsp_sub_v2r8(ix1,jx0);
+             dy10             = _fjsp_sub_v2r8(iy1,jy0);
+             dz10             = _fjsp_sub_v2r8(iz1,jz0);
+             dx11             = _fjsp_sub_v2r8(ix1,jx1);
+             dy11             = _fjsp_sub_v2r8(iy1,jy1);
+             dz11             = _fjsp_sub_v2r8(iz1,jz1);
+             dx12             = _fjsp_sub_v2r8(ix1,jx2);
+             dy12             = _fjsp_sub_v2r8(iy1,jy2);
+             dz12             = _fjsp_sub_v2r8(iz1,jz2);
+             dx20             = _fjsp_sub_v2r8(ix2,jx0);
+             dy20             = _fjsp_sub_v2r8(iy2,jy0);
+             dz20             = _fjsp_sub_v2r8(iz2,jz0);
+             dx21             = _fjsp_sub_v2r8(ix2,jx1);
+             dy21             = _fjsp_sub_v2r8(iy2,jy1);
+             dz21             = _fjsp_sub_v2r8(iz2,jz1);
+             dx22             = _fjsp_sub_v2r8(ix2,jx2);
+             dy22             = _fjsp_sub_v2r8(iy2,jy2);
+             dz22             = _fjsp_sub_v2r8(iz2,jz2);
+             /* Calculate squared distance and things based on it */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rsq01            = gmx_fjsp_calc_rsq_v2r8(dx01,dy01,dz01);
+             rsq02            = gmx_fjsp_calc_rsq_v2r8(dx02,dy02,dz02);
+             rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+             rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+             rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+             rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+             rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+             rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+             rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+             rinv01           = gmx_fjsp_invsqrt_v2r8(rsq01);
+             rinv02           = gmx_fjsp_invsqrt_v2r8(rsq02);
+             rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+             rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+             rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+             rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+             rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+             rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+             rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+             rinvsq01         = _fjsp_mul_v2r8(rinv01,rinv01);
+             rinvsq02         = _fjsp_mul_v2r8(rinv02,rinv02);
+             rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+             rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+             rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+             rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+             rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+             rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+             fjx0             = _fjsp_setzero_v2r8();
+             fjy0             = _fjsp_setzero_v2r8();
+             fjz0             = _fjsp_setzero_v2r8();
+             fjx1             = _fjsp_setzero_v2r8();
+             fjy1             = _fjsp_setzero_v2r8();
+             fjz1             = _fjsp_setzero_v2r8();
+             fjx2             = _fjsp_setzero_v2r8();
+             fjy2             = _fjsp_setzero_v2r8();
+             fjz2             = _fjsp_setzero_v2r8();
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+             {
+             r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r00,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq00,_fjsp_sub_v2r8(rinv00,velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,rinv00),_fjsp_sub_v2r8(rinvsq00,felec));
+             /* LENNARD-JONES DISPERSION/REPULSION */
+             rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+             vvdw6            = _fjsp_mul_v2r8(c6_00,rinvsix);
+             vvdw12           = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
+             vvdw             = _fjsp_msub_v2r8( vvdw12,one_twelfth, _fjsp_mul_v2r8(vvdw6,one_sixth) );
+             fvdw             = _fjsp_mul_v2r8(_fjsp_sub_v2r8(vvdw12,vvdw6),rinvsq00);
+             d                = _fjsp_sub_v2r8(r00,rswitch);
+             d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+             d2               = _fjsp_mul_v2r8(d,d);
+             sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+             dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+             /* Evaluate switch function */
+             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+             felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv00,_fjsp_mul_v2r8(velec,dsw)) );
+             fvdw             = _fjsp_msub_v2r8( fvdw,sw , _fjsp_mul_v2r8(rinv00,_fjsp_mul_v2r8(vvdw,dsw)) );
+             velec            = _fjsp_mul_v2r8(velec,sw);
+             vvdw             = _fjsp_mul_v2r8(vvdw,sw);
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             vvdw             = _fjsp_and_v2r8(vvdw,cutoff_mask);
+             vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+             fscal            = _fjsp_add_v2r8(felec,fvdw);
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             
+             fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq01,rcutoff2))
+             {
+             r01              = _fjsp_mul_v2r8(rsq01,rinv01);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r01,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq01,_fjsp_sub_v2r8(rinv01,velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq01,rinv01),_fjsp_sub_v2r8(rinvsq01,felec));
+             d                = _fjsp_sub_v2r8(r01,rswitch);
+             d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+             d2               = _fjsp_mul_v2r8(d,d);
+             sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+             dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+             /* Evaluate switch function */
+             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+             felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv01,_fjsp_mul_v2r8(velec,dsw)) );
+             velec            = _fjsp_mul_v2r8(velec,sw);
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq01,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx01,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy01,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz01,fscal,fiz0);
+             
+             fjx1             = _fjsp_madd_v2r8(dx01,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy01,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz01,fscal,fjz1);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq02,rcutoff2))
+             {
+             r02              = _fjsp_mul_v2r8(rsq02,rinv02);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r02,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq02,_fjsp_sub_v2r8(rinv02,velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq02,rinv02),_fjsp_sub_v2r8(rinvsq02,felec));
+             d                = _fjsp_sub_v2r8(r02,rswitch);
+             d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+             d2               = _fjsp_mul_v2r8(d,d);
+             sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+             dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+             /* Evaluate switch function */
+             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+             felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv02,_fjsp_mul_v2r8(velec,dsw)) );
+             velec            = _fjsp_mul_v2r8(velec,sw);
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq02,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx02,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy02,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz02,fscal,fiz0);
+             
+             fjx2             = _fjsp_madd_v2r8(dx02,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy02,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz02,fscal,fjz2);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq10,rcutoff2))
+             {
+             r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r10,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq10,_fjsp_sub_v2r8(rinv10,velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,rinv10),_fjsp_sub_v2r8(rinvsq10,felec));
+             d                = _fjsp_sub_v2r8(r10,rswitch);
+             d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+             d2               = _fjsp_mul_v2r8(d,d);
+             sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+             dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+             /* Evaluate switch function */
+             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+             felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv10,_fjsp_mul_v2r8(velec,dsw)) );
+             velec            = _fjsp_mul_v2r8(velec,sw);
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq10,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+             
+             fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq11,rcutoff2))
+             {
+             r11              = _fjsp_mul_v2r8(rsq11,rinv11);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r11,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq11,_fjsp_sub_v2r8(rinv11,velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq11,rinv11),_fjsp_sub_v2r8(rinvsq11,felec));
+             d                = _fjsp_sub_v2r8(r11,rswitch);
+             d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+             d2               = _fjsp_mul_v2r8(d,d);
+             sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+             dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+             /* Evaluate switch function */
+             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+             felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv11,_fjsp_mul_v2r8(velec,dsw)) );
+             velec            = _fjsp_mul_v2r8(velec,sw);
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq11,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+             
+             fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq12,rcutoff2))
+             {
+             r12              = _fjsp_mul_v2r8(rsq12,rinv12);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r12,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq12,_fjsp_sub_v2r8(rinv12,velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq12,rinv12),_fjsp_sub_v2r8(rinvsq12,felec));
+             d                = _fjsp_sub_v2r8(r12,rswitch);
+             d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+             d2               = _fjsp_mul_v2r8(d,d);
+             sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+             dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+             /* Evaluate switch function */
+             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+             felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv12,_fjsp_mul_v2r8(velec,dsw)) );
+             velec            = _fjsp_mul_v2r8(velec,sw);
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq12,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+             
+             fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq20,rcutoff2))
+             {
+             r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r20,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq20,_fjsp_sub_v2r8(rinv20,velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,rinv20),_fjsp_sub_v2r8(rinvsq20,felec));
+             d                = _fjsp_sub_v2r8(r20,rswitch);
+             d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+             d2               = _fjsp_mul_v2r8(d,d);
+             sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+             dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+             /* Evaluate switch function */
+             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+             felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv20,_fjsp_mul_v2r8(velec,dsw)) );
+             velec            = _fjsp_mul_v2r8(velec,sw);
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq20,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+             
+             fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq21,rcutoff2))
+             {
+             r21              = _fjsp_mul_v2r8(rsq21,rinv21);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r21,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq21,_fjsp_sub_v2r8(rinv21,velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq21,rinv21),_fjsp_sub_v2r8(rinvsq21,felec));
+             d                = _fjsp_sub_v2r8(r21,rswitch);
+             d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+             d2               = _fjsp_mul_v2r8(d,d);
+             sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+             dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+             /* Evaluate switch function */
+             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+             felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv21,_fjsp_mul_v2r8(velec,dsw)) );
+             velec            = _fjsp_mul_v2r8(velec,sw);
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq21,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+             
+             fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq22,rcutoff2))
+             {
+             r22              = _fjsp_mul_v2r8(rsq22,rinv22);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r22,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq22,_fjsp_sub_v2r8(rinv22,velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq22,rinv22),_fjsp_sub_v2r8(rinvsq22,felec));
+             d                = _fjsp_sub_v2r8(r22,rswitch);
+             d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+             d2               = _fjsp_mul_v2r8(d,d);
+             sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+             dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+             /* Evaluate switch function */
+             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+             felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv22,_fjsp_mul_v2r8(velec,dsw)) );
+             velec            = _fjsp_mul_v2r8(velec,sw);
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq22,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+             
+             fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+             }
+             gmx_fjsp_decrement_3rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+             /* Inner loop uses 630 flops */
+         }
+         if(jidx<j_index_end)
+         {
+             jnrA             = jjnr[jidx];
+             j_coord_offsetA  = DIM*jnrA;
+             /* load j atom coordinates */
+             gmx_fjsp_load_3rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+             /* Calculate displacement vector */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             dx01             = _fjsp_sub_v2r8(ix0,jx1);
+             dy01             = _fjsp_sub_v2r8(iy0,jy1);
+             dz01             = _fjsp_sub_v2r8(iz0,jz1);
+             dx02             = _fjsp_sub_v2r8(ix0,jx2);
+             dy02             = _fjsp_sub_v2r8(iy0,jy2);
+             dz02             = _fjsp_sub_v2r8(iz0,jz2);
+             dx10             = _fjsp_sub_v2r8(ix1,jx0);
+             dy10             = _fjsp_sub_v2r8(iy1,jy0);
+             dz10             = _fjsp_sub_v2r8(iz1,jz0);
+             dx11             = _fjsp_sub_v2r8(ix1,jx1);
+             dy11             = _fjsp_sub_v2r8(iy1,jy1);
+             dz11             = _fjsp_sub_v2r8(iz1,jz1);
+             dx12             = _fjsp_sub_v2r8(ix1,jx2);
+             dy12             = _fjsp_sub_v2r8(iy1,jy2);
+             dz12             = _fjsp_sub_v2r8(iz1,jz2);
+             dx20             = _fjsp_sub_v2r8(ix2,jx0);
+             dy20             = _fjsp_sub_v2r8(iy2,jy0);
+             dz20             = _fjsp_sub_v2r8(iz2,jz0);
+             dx21             = _fjsp_sub_v2r8(ix2,jx1);
+             dy21             = _fjsp_sub_v2r8(iy2,jy1);
+             dz21             = _fjsp_sub_v2r8(iz2,jz1);
+             dx22             = _fjsp_sub_v2r8(ix2,jx2);
+             dy22             = _fjsp_sub_v2r8(iy2,jy2);
+             dz22             = _fjsp_sub_v2r8(iz2,jz2);
+             /* Calculate squared distance and things based on it */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rsq01            = gmx_fjsp_calc_rsq_v2r8(dx01,dy01,dz01);
+             rsq02            = gmx_fjsp_calc_rsq_v2r8(dx02,dy02,dz02);
+             rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+             rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+             rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+             rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+             rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+             rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+             rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+             rinv01           = gmx_fjsp_invsqrt_v2r8(rsq01);
+             rinv02           = gmx_fjsp_invsqrt_v2r8(rsq02);
+             rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+             rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+             rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+             rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+             rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+             rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+             rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+             rinvsq01         = _fjsp_mul_v2r8(rinv01,rinv01);
+             rinvsq02         = _fjsp_mul_v2r8(rinv02,rinv02);
+             rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+             rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+             rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+             rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+             rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+             rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+             fjx0             = _fjsp_setzero_v2r8();
+             fjy0             = _fjsp_setzero_v2r8();
+             fjz0             = _fjsp_setzero_v2r8();
+             fjx1             = _fjsp_setzero_v2r8();
+             fjy1             = _fjsp_setzero_v2r8();
+             fjz1             = _fjsp_setzero_v2r8();
+             fjx2             = _fjsp_setzero_v2r8();
+             fjy2             = _fjsp_setzero_v2r8();
+             fjz2             = _fjsp_setzero_v2r8();
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+             {
+             r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r00,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq00,_fjsp_sub_v2r8(rinv00,velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,rinv00),_fjsp_sub_v2r8(rinvsq00,felec));
+             /* LENNARD-JONES DISPERSION/REPULSION */
+             rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+             vvdw6            = _fjsp_mul_v2r8(c6_00,rinvsix);
+             vvdw12           = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
+             vvdw             = _fjsp_msub_v2r8( vvdw12,one_twelfth, _fjsp_mul_v2r8(vvdw6,one_sixth) );
+             fvdw             = _fjsp_mul_v2r8(_fjsp_sub_v2r8(vvdw12,vvdw6),rinvsq00);
+             d                = _fjsp_sub_v2r8(r00,rswitch);
+             d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+             d2               = _fjsp_mul_v2r8(d,d);
+             sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+             dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+             /* Evaluate switch function */
+             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+             felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv00,_fjsp_mul_v2r8(velec,dsw)) );
+             fvdw             = _fjsp_msub_v2r8( fvdw,sw , _fjsp_mul_v2r8(rinv00,_fjsp_mul_v2r8(vvdw,dsw)) );
+             velec            = _fjsp_mul_v2r8(velec,sw);
+             vvdw             = _fjsp_mul_v2r8(vvdw,sw);
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             vvdw             = _fjsp_and_v2r8(vvdw,cutoff_mask);
+             vvdw             = _fjsp_unpacklo_v2r8(vvdw,_fjsp_setzero_v2r8());
+             vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+             fscal            = _fjsp_add_v2r8(felec,fvdw);
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             
+             fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq01,rcutoff2))
+             {
+             r01              = _fjsp_mul_v2r8(rsq01,rinv01);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r01,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq01,_fjsp_sub_v2r8(rinv01,velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq01,rinv01),_fjsp_sub_v2r8(rinvsq01,felec));
+             d                = _fjsp_sub_v2r8(r01,rswitch);
+             d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+             d2               = _fjsp_mul_v2r8(d,d);
+             sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+             dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+             /* Evaluate switch function */
+             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+             felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv01,_fjsp_mul_v2r8(velec,dsw)) );
+             velec            = _fjsp_mul_v2r8(velec,sw);
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq01,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx01,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy01,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz01,fscal,fiz0);
+             
+             fjx1             = _fjsp_madd_v2r8(dx01,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy01,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz01,fscal,fjz1);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq02,rcutoff2))
+             {
+             r02              = _fjsp_mul_v2r8(rsq02,rinv02);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r02,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq02,_fjsp_sub_v2r8(rinv02,velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq02,rinv02),_fjsp_sub_v2r8(rinvsq02,felec));
+             d                = _fjsp_sub_v2r8(r02,rswitch);
+             d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+             d2               = _fjsp_mul_v2r8(d,d);
+             sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+             dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+             /* Evaluate switch function */
+             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+             felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv02,_fjsp_mul_v2r8(velec,dsw)) );
+             velec            = _fjsp_mul_v2r8(velec,sw);
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq02,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx02,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy02,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz02,fscal,fiz0);
+             
+             fjx2             = _fjsp_madd_v2r8(dx02,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy02,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz02,fscal,fjz2);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq10,rcutoff2))
+             {
+             r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r10,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq10,_fjsp_sub_v2r8(rinv10,velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,rinv10),_fjsp_sub_v2r8(rinvsq10,felec));
+             d                = _fjsp_sub_v2r8(r10,rswitch);
+             d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+             d2               = _fjsp_mul_v2r8(d,d);
+             sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+             dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+             /* Evaluate switch function */
+             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+             felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv10,_fjsp_mul_v2r8(velec,dsw)) );
+             velec            = _fjsp_mul_v2r8(velec,sw);
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq10,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+             
+             fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq11,rcutoff2))
+             {
+             r11              = _fjsp_mul_v2r8(rsq11,rinv11);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r11,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq11,_fjsp_sub_v2r8(rinv11,velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq11,rinv11),_fjsp_sub_v2r8(rinvsq11,felec));
+             d                = _fjsp_sub_v2r8(r11,rswitch);
+             d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+             d2               = _fjsp_mul_v2r8(d,d);
+             sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+             dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+             /* Evaluate switch function */
+             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+             felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv11,_fjsp_mul_v2r8(velec,dsw)) );
+             velec            = _fjsp_mul_v2r8(velec,sw);
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq11,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+             
+             fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq12,rcutoff2))
+             {
+             r12              = _fjsp_mul_v2r8(rsq12,rinv12);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r12,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq12,_fjsp_sub_v2r8(rinv12,velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq12,rinv12),_fjsp_sub_v2r8(rinvsq12,felec));
+             d                = _fjsp_sub_v2r8(r12,rswitch);
+             d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+             d2               = _fjsp_mul_v2r8(d,d);
+             sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+             dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+             /* Evaluate switch function */
+             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+             felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv12,_fjsp_mul_v2r8(velec,dsw)) );
+             velec            = _fjsp_mul_v2r8(velec,sw);
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq12,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+             
+             fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq20,rcutoff2))
+             {
+             r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r20,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq20,_fjsp_sub_v2r8(rinv20,velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,rinv20),_fjsp_sub_v2r8(rinvsq20,felec));
+             d                = _fjsp_sub_v2r8(r20,rswitch);
+             d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+             d2               = _fjsp_mul_v2r8(d,d);
+             sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+             dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+             /* Evaluate switch function */
+             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+             felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv20,_fjsp_mul_v2r8(velec,dsw)) );
+             velec            = _fjsp_mul_v2r8(velec,sw);
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq20,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+             
+             fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq21,rcutoff2))
+             {
+             r21              = _fjsp_mul_v2r8(rsq21,rinv21);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r21,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq21,_fjsp_sub_v2r8(rinv21,velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq21,rinv21),_fjsp_sub_v2r8(rinvsq21,felec));
+             d                = _fjsp_sub_v2r8(r21,rswitch);
+             d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+             d2               = _fjsp_mul_v2r8(d,d);
+             sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+             dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+             /* Evaluate switch function */
+             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+             felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv21,_fjsp_mul_v2r8(velec,dsw)) );
+             velec            = _fjsp_mul_v2r8(velec,sw);
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq21,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+             
+             fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq22,rcutoff2))
+             {
+             r22              = _fjsp_mul_v2r8(rsq22,rinv22);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r22,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq22,_fjsp_sub_v2r8(rinv22,velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq22,rinv22),_fjsp_sub_v2r8(rinvsq22,felec));
+             d                = _fjsp_sub_v2r8(r22,rswitch);
+             d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+             d2               = _fjsp_mul_v2r8(d,d);
+             sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+             dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+             /* Evaluate switch function */
+             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+             felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv22,_fjsp_mul_v2r8(velec,dsw)) );
+             velec            = _fjsp_mul_v2r8(velec,sw);
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq22,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+             
+             fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+             }
+             gmx_fjsp_decrement_3rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+             /* Inner loop uses 630 flops */
+         }
+         /* End of innermost loop */
+         gmx_fjsp_update_iforce_3atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                               f+i_coord_offset,fshift+i_shift_offset);
+         ggid                        = gid[iidx];
+         /* Update potential energies */
+         gmx_fjsp_update_1pot_v2r8(velecsum,kernel_data->energygrp_elec+ggid);
+         gmx_fjsp_update_1pot_v2r8(vvdwsum,kernel_data->energygrp_vdw+ggid);
+         /* Increment number of inner iterations */
+         inneriter                  += j_index_end - j_index_start;
+         /* Outer loop uses 20 flops */
+     }
+     /* Increment number of outer iterations */
+     outeriter        += nri;
+     /* Update outer/inner flops */
+     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_VF,outeriter*20 + inneriter*630);
+ }
+ /*
+  * Gromacs nonbonded kernel:   nb_kernel_ElecEwSw_VdwLJSw_GeomW3W3_F_sparc64_hpc_ace_double
+  * Electrostatics interaction: Ewald
+  * VdW interaction:            LennardJones
+  * Geometry:                   Water3-Water3
+  * Calculate force/pot:        Force
+  */
+ void
+ nb_kernel_ElecEwSw_VdwLJSw_GeomW3W3_F_sparc64_hpc_ace_double
+                     (t_nblist * gmx_restrict                nlist,
+                      rvec * gmx_restrict                    xx,
+                      rvec * gmx_restrict                    ff,
+                      t_forcerec * gmx_restrict              fr,
+                      t_mdatoms * gmx_restrict               mdatoms,
+                      nb_kernel_data_t * gmx_restrict        kernel_data,
+                      t_nrnb * gmx_restrict                  nrnb)
+ {
+     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+      * just 0 for non-waters.
+      * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+      * jnr indices corresponding to data put in the two positions in the SIMD register.
+      */
+     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+     int              jnrA,jnrB;
+     int              j_coord_offsetA,j_coord_offsetB;
+     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+     real             rcutoff_scalar;
+     real             *shiftvec,*fshift,*x,*f;
+     _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+     int              vdwioffset0;
+     _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+     int              vdwioffset1;
+     _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+     int              vdwioffset2;
+     _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+     int              vdwjidx0A,vdwjidx0B;
+     _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+     int              vdwjidx1A,vdwjidx1B;
+     _fjsp_v2r8       jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+     int              vdwjidx2A,vdwjidx2B;
+     _fjsp_v2r8       jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+     _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+     _fjsp_v2r8       dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
+     _fjsp_v2r8       dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
+     _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+     _fjsp_v2r8       dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+     _fjsp_v2r8       dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+     _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+     _fjsp_v2r8       dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+     _fjsp_v2r8       dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+     _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+     real             *charge;
+     int              nvdwtype;
+     _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+     int              *vdwtype;
+     real             *vdwparam;
+     _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+     _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+     _fjsp_v2r8       ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+     real             *ewtab;
+     _fjsp_v2r8       rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
+     real             rswitch_scalar,d_scalar;
+     _fjsp_v2r8       itab_tmp;
+     _fjsp_v2r8       dummy_mask,cutoff_mask;
+     _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+     _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+     union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+     x                = xx[0];
+     f                = ff[0];
+     nri              = nlist->nri;
+     iinr             = nlist->iinr;
+     jindex           = nlist->jindex;
+     jjnr             = nlist->jjnr;
+     shiftidx         = nlist->shift;
+     gid              = nlist->gid;
+     shiftvec         = fr->shift_vec[0];
+     fshift           = fr->fshift[0];
+     facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+     charge           = mdatoms->chargeA;
+     nvdwtype         = fr->ntype;
+     vdwparam         = fr->nbfp;
+     vdwtype          = mdatoms->typeA;
+     sh_ewald         = gmx_fjsp_set1_v2r8(fr->ic->sh_ewald);
+     ewtab            = fr->ic->tabq_coul_FDV0;
+     ewtabscale       = gmx_fjsp_set1_v2r8(fr->ic->tabq_scale);
+     ewtabhalfspace   = gmx_fjsp_set1_v2r8(0.5/fr->ic->tabq_scale);
+     /* Setup water-specific parameters */
+     inr              = nlist->iinr[0];
+     iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+0]));
+     iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+     iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+     vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+     jq0              = gmx_fjsp_set1_v2r8(charge[inr+0]);
+     jq1              = gmx_fjsp_set1_v2r8(charge[inr+1]);
+     jq2              = gmx_fjsp_set1_v2r8(charge[inr+2]);
+     vdwjidx0A        = 2*vdwtype[inr+0];
+     qq00             = _fjsp_mul_v2r8(iq0,jq0);
+     c6_00            = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset0+vdwjidx0A]);
+     c12_00           = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset0+vdwjidx0A+1]);
+     qq01             = _fjsp_mul_v2r8(iq0,jq1);
+     qq02             = _fjsp_mul_v2r8(iq0,jq2);
+     qq10             = _fjsp_mul_v2r8(iq1,jq0);
+     qq11             = _fjsp_mul_v2r8(iq1,jq1);
+     qq12             = _fjsp_mul_v2r8(iq1,jq2);
+     qq20             = _fjsp_mul_v2r8(iq2,jq0);
+     qq21             = _fjsp_mul_v2r8(iq2,jq1);
+     qq22             = _fjsp_mul_v2r8(iq2,jq2);
+     /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+     rcutoff_scalar   = fr->rcoulomb;
+     rcutoff          = gmx_fjsp_set1_v2r8(rcutoff_scalar);
+     rcutoff2         = _fjsp_mul_v2r8(rcutoff,rcutoff);
+     rswitch_scalar   = fr->rcoulomb_switch;
+     rswitch          = gmx_fjsp_set1_v2r8(rswitch_scalar);
+     /* Setup switch parameters */
+     d_scalar         = rcutoff_scalar-rswitch_scalar;
+     d                = gmx_fjsp_set1_v2r8(d_scalar);
+     swV3             = gmx_fjsp_set1_v2r8(-10.0/(d_scalar*d_scalar*d_scalar));
+     swV4             = gmx_fjsp_set1_v2r8( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
+     swV5             = gmx_fjsp_set1_v2r8( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
+     swF2             = gmx_fjsp_set1_v2r8(-30.0/(d_scalar*d_scalar*d_scalar));
+     swF3             = gmx_fjsp_set1_v2r8( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
+     swF4             = gmx_fjsp_set1_v2r8(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
+     /* Avoid stupid compiler warnings */
+     jnrA = jnrB = 0;
+     j_coord_offsetA = 0;
+     j_coord_offsetB = 0;
+     outeriter        = 0;
+     inneriter        = 0;
+     /* Start outer loop over neighborlists */
+     for(iidx=0; iidx<nri; iidx++)
+     {
+         /* Load shift vector for this list */
+         i_shift_offset   = DIM*shiftidx[iidx];
+         /* Load limits for loop over neighbors */
+         j_index_start    = jindex[iidx];
+         j_index_end      = jindex[iidx+1];
+         /* Get outer coordinate index */
+         inr              = iinr[iidx];
+         i_coord_offset   = DIM*inr;
+         /* Load i particle coords and add shift vector */
+         gmx_fjsp_load_shift_and_3rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                  &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+         fix0             = _fjsp_setzero_v2r8();
+         fiy0             = _fjsp_setzero_v2r8();
+         fiz0             = _fjsp_setzero_v2r8();
+         fix1             = _fjsp_setzero_v2r8();
+         fiy1             = _fjsp_setzero_v2r8();
+         fiz1             = _fjsp_setzero_v2r8();
+         fix2             = _fjsp_setzero_v2r8();
+         fiy2             = _fjsp_setzero_v2r8();
+         fiz2             = _fjsp_setzero_v2r8();
+         /* Start inner kernel loop */
+         for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+         {
+             /* Get j neighbor index, and coordinate index */
+             jnrA             = jjnr[jidx];
+             jnrB             = jjnr[jidx+1];
+             j_coord_offsetA  = DIM*jnrA;
+             j_coord_offsetB  = DIM*jnrB;
+             /* load j atom coordinates */
+             gmx_fjsp_load_3rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+             /* Calculate displacement vector */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             dx01             = _fjsp_sub_v2r8(ix0,jx1);
+             dy01             = _fjsp_sub_v2r8(iy0,jy1);
+             dz01             = _fjsp_sub_v2r8(iz0,jz1);
+             dx02             = _fjsp_sub_v2r8(ix0,jx2);
+             dy02             = _fjsp_sub_v2r8(iy0,jy2);
+             dz02             = _fjsp_sub_v2r8(iz0,jz2);
+             dx10             = _fjsp_sub_v2r8(ix1,jx0);
+             dy10             = _fjsp_sub_v2r8(iy1,jy0);
+             dz10             = _fjsp_sub_v2r8(iz1,jz0);
+             dx11             = _fjsp_sub_v2r8(ix1,jx1);
+             dy11             = _fjsp_sub_v2r8(iy1,jy1);
+             dz11             = _fjsp_sub_v2r8(iz1,jz1);
+             dx12             = _fjsp_sub_v2r8(ix1,jx2);
+             dy12             = _fjsp_sub_v2r8(iy1,jy2);
+             dz12             = _fjsp_sub_v2r8(iz1,jz2);
+             dx20             = _fjsp_sub_v2r8(ix2,jx0);
+             dy20             = _fjsp_sub_v2r8(iy2,jy0);
+             dz20             = _fjsp_sub_v2r8(iz2,jz0);
+             dx21             = _fjsp_sub_v2r8(ix2,jx1);
+             dy21             = _fjsp_sub_v2r8(iy2,jy1);
+             dz21             = _fjsp_sub_v2r8(iz2,jz1);
+             dx22             = _fjsp_sub_v2r8(ix2,jx2);
+             dy22             = _fjsp_sub_v2r8(iy2,jy2);
+             dz22             = _fjsp_sub_v2r8(iz2,jz2);
+             /* Calculate squared distance and things based on it */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rsq01            = gmx_fjsp_calc_rsq_v2r8(dx01,dy01,dz01);
+             rsq02            = gmx_fjsp_calc_rsq_v2r8(dx02,dy02,dz02);
+             rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+             rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+             rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+             rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+             rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+             rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+             rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+             rinv01           = gmx_fjsp_invsqrt_v2r8(rsq01);
+             rinv02           = gmx_fjsp_invsqrt_v2r8(rsq02);
+             rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+             rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+             rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+             rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+             rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+             rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+             rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+             rinvsq01         = _fjsp_mul_v2r8(rinv01,rinv01);
+             rinvsq02         = _fjsp_mul_v2r8(rinv02,rinv02);
+             rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+             rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+             rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+             rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+             rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+             rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+             fjx0             = _fjsp_setzero_v2r8();
+             fjy0             = _fjsp_setzero_v2r8();
+             fjz0             = _fjsp_setzero_v2r8();
+             fjx1             = _fjsp_setzero_v2r8();
+             fjy1             = _fjsp_setzero_v2r8();
+             fjz1             = _fjsp_setzero_v2r8();
+             fjx2             = _fjsp_setzero_v2r8();
+             fjy2             = _fjsp_setzero_v2r8();
+             fjz2             = _fjsp_setzero_v2r8();
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+             {
+             r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r00,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq00,_fjsp_sub_v2r8(rinv00,velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,rinv00),_fjsp_sub_v2r8(rinvsq00,felec));
+             /* LENNARD-JONES DISPERSION/REPULSION */
+             rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+             vvdw6            = _fjsp_mul_v2r8(c6_00,rinvsix);
+             vvdw12           = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
+             vvdw             = _fjsp_msub_v2r8( vvdw12,one_twelfth, _fjsp_mul_v2r8(vvdw6,one_sixth) );
+             fvdw             = _fjsp_mul_v2r8(_fjsp_sub_v2r8(vvdw12,vvdw6),rinvsq00);
+             d                = _fjsp_sub_v2r8(r00,rswitch);
+             d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+             d2               = _fjsp_mul_v2r8(d,d);
+             sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+             dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+             /* Evaluate switch function */
+             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+             felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv00,_fjsp_mul_v2r8(velec,dsw)) );
+             fvdw             = _fjsp_msub_v2r8( fvdw,sw , _fjsp_mul_v2r8(rinv00,_fjsp_mul_v2r8(vvdw,dsw)) );
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+             fscal            = _fjsp_add_v2r8(felec,fvdw);
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             
+             fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq01,rcutoff2))
+             {
+             r01              = _fjsp_mul_v2r8(rsq01,rinv01);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r01,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq01,_fjsp_sub_v2r8(rinv01,velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq01,rinv01),_fjsp_sub_v2r8(rinvsq01,felec));
+             d                = _fjsp_sub_v2r8(r01,rswitch);
+             d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+             d2               = _fjsp_mul_v2r8(d,d);
+             sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+             dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+             /* Evaluate switch function */
+             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+             felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv01,_fjsp_mul_v2r8(velec,dsw)) );
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq01,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx01,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy01,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz01,fscal,fiz0);
+             
+             fjx1             = _fjsp_madd_v2r8(dx01,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy01,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz01,fscal,fjz1);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq02,rcutoff2))
+             {
+             r02              = _fjsp_mul_v2r8(rsq02,rinv02);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r02,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq02,_fjsp_sub_v2r8(rinv02,velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq02,rinv02),_fjsp_sub_v2r8(rinvsq02,felec));
+             d                = _fjsp_sub_v2r8(r02,rswitch);
+             d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+             d2               = _fjsp_mul_v2r8(d,d);
+             sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+             dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+             /* Evaluate switch function */
+             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+             felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv02,_fjsp_mul_v2r8(velec,dsw)) );
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq02,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx02,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy02,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz02,fscal,fiz0);
+             
+             fjx2             = _fjsp_madd_v2r8(dx02,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy02,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz02,fscal,fjz2);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq10,rcutoff2))
+             {
+             r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r10,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq10,_fjsp_sub_v2r8(rinv10,velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,rinv10),_fjsp_sub_v2r8(rinvsq10,felec));
+             d                = _fjsp_sub_v2r8(r10,rswitch);
+             d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+             d2               = _fjsp_mul_v2r8(d,d);
+             sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+             dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+             /* Evaluate switch function */
+             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+             felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv10,_fjsp_mul_v2r8(velec,dsw)) );
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq10,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+             
+             fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq11,rcutoff2))
+             {
+             r11              = _fjsp_mul_v2r8(rsq11,rinv11);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r11,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq11,_fjsp_sub_v2r8(rinv11,velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq11,rinv11),_fjsp_sub_v2r8(rinvsq11,felec));
+             d                = _fjsp_sub_v2r8(r11,rswitch);
+             d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+             d2               = _fjsp_mul_v2r8(d,d);
+             sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+             dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+             /* Evaluate switch function */
+             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+             felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv11,_fjsp_mul_v2r8(velec,dsw)) );
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq11,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+             
+             fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq12,rcutoff2))
+             {
+             r12              = _fjsp_mul_v2r8(rsq12,rinv12);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r12,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq12,_fjsp_sub_v2r8(rinv12,velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq12,rinv12),_fjsp_sub_v2r8(rinvsq12,felec));
+             d                = _fjsp_sub_v2r8(r12,rswitch);
+             d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+             d2               = _fjsp_mul_v2r8(d,d);
+             sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+             dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+             /* Evaluate switch function */
+             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+             felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv12,_fjsp_mul_v2r8(velec,dsw)) );
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq12,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+             
+             fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq20,rcutoff2))
+             {
+             r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r20,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq20,_fjsp_sub_v2r8(rinv20,velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,rinv20),_fjsp_sub_v2r8(rinvsq20,felec));
+             d                = _fjsp_sub_v2r8(r20,rswitch);
+             d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+             d2               = _fjsp_mul_v2r8(d,d);
+             sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+             dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+             /* Evaluate switch function */
+             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+             felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv20,_fjsp_mul_v2r8(velec,dsw)) );
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq20,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+             
+             fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq21,rcutoff2))
+             {
+             r21              = _fjsp_mul_v2r8(rsq21,rinv21);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r21,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq21,_fjsp_sub_v2r8(rinv21,velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq21,rinv21),_fjsp_sub_v2r8(rinvsq21,felec));
+             d                = _fjsp_sub_v2r8(r21,rswitch);
+             d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+             d2               = _fjsp_mul_v2r8(d,d);
+             sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+             dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+             /* Evaluate switch function */
+             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+             felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv21,_fjsp_mul_v2r8(velec,dsw)) );
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq21,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+             
+             fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq22,rcutoff2))
+             {
+             r22              = _fjsp_mul_v2r8(rsq22,rinv22);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r22,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq22,_fjsp_sub_v2r8(rinv22,velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq22,rinv22),_fjsp_sub_v2r8(rinvsq22,felec));
+             d                = _fjsp_sub_v2r8(r22,rswitch);
+             d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+             d2               = _fjsp_mul_v2r8(d,d);
+             sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+             dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+             /* Evaluate switch function */
+             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+             felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv22,_fjsp_mul_v2r8(velec,dsw)) );
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq22,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+             
+             fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+             }
+             gmx_fjsp_decrement_3rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+             /* Inner loop uses 600 flops */
+         }
+         if(jidx<j_index_end)
+         {
+             jnrA             = jjnr[jidx];
+             j_coord_offsetA  = DIM*jnrA;
+             /* load j atom coordinates */
+             gmx_fjsp_load_3rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+             /* Calculate displacement vector */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             dx01             = _fjsp_sub_v2r8(ix0,jx1);
+             dy01             = _fjsp_sub_v2r8(iy0,jy1);
+             dz01             = _fjsp_sub_v2r8(iz0,jz1);
+             dx02             = _fjsp_sub_v2r8(ix0,jx2);
+             dy02             = _fjsp_sub_v2r8(iy0,jy2);
+             dz02             = _fjsp_sub_v2r8(iz0,jz2);
+             dx10             = _fjsp_sub_v2r8(ix1,jx0);
+             dy10             = _fjsp_sub_v2r8(iy1,jy0);
+             dz10             = _fjsp_sub_v2r8(iz1,jz0);
+             dx11             = _fjsp_sub_v2r8(ix1,jx1);
+             dy11             = _fjsp_sub_v2r8(iy1,jy1);
+             dz11             = _fjsp_sub_v2r8(iz1,jz1);
+             dx12             = _fjsp_sub_v2r8(ix1,jx2);
+             dy12             = _fjsp_sub_v2r8(iy1,jy2);
+             dz12             = _fjsp_sub_v2r8(iz1,jz2);
+             dx20             = _fjsp_sub_v2r8(ix2,jx0);
+             dy20             = _fjsp_sub_v2r8(iy2,jy0);
+             dz20             = _fjsp_sub_v2r8(iz2,jz0);
+             dx21             = _fjsp_sub_v2r8(ix2,jx1);
+             dy21             = _fjsp_sub_v2r8(iy2,jy1);
+             dz21             = _fjsp_sub_v2r8(iz2,jz1);
+             dx22             = _fjsp_sub_v2r8(ix2,jx2);
+             dy22             = _fjsp_sub_v2r8(iy2,jy2);
+             dz22             = _fjsp_sub_v2r8(iz2,jz2);
+             /* Calculate squared distance and things based on it */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rsq01            = gmx_fjsp_calc_rsq_v2r8(dx01,dy01,dz01);
+             rsq02            = gmx_fjsp_calc_rsq_v2r8(dx02,dy02,dz02);
+             rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+             rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+             rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+             rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+             rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+             rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+             rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+             rinv01           = gmx_fjsp_invsqrt_v2r8(rsq01);
+             rinv02           = gmx_fjsp_invsqrt_v2r8(rsq02);
+             rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+             rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+             rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+             rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+             rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+             rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+             rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+             rinvsq01         = _fjsp_mul_v2r8(rinv01,rinv01);
+             rinvsq02         = _fjsp_mul_v2r8(rinv02,rinv02);
+             rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+             rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+             rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+             rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+             rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+             rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+             fjx0             = _fjsp_setzero_v2r8();
+             fjy0             = _fjsp_setzero_v2r8();
+             fjz0             = _fjsp_setzero_v2r8();
+             fjx1             = _fjsp_setzero_v2r8();
+             fjy1             = _fjsp_setzero_v2r8();
+             fjz1             = _fjsp_setzero_v2r8();
+             fjx2             = _fjsp_setzero_v2r8();
+             fjy2             = _fjsp_setzero_v2r8();
+             fjz2             = _fjsp_setzero_v2r8();
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+             {
+             r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r00,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq00,_fjsp_sub_v2r8(rinv00,velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,rinv00),_fjsp_sub_v2r8(rinvsq00,felec));
+             /* LENNARD-JONES DISPERSION/REPULSION */
+             rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+             vvdw6            = _fjsp_mul_v2r8(c6_00,rinvsix);
+             vvdw12           = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
+             vvdw             = _fjsp_msub_v2r8( vvdw12,one_twelfth, _fjsp_mul_v2r8(vvdw6,one_sixth) );
+             fvdw             = _fjsp_mul_v2r8(_fjsp_sub_v2r8(vvdw12,vvdw6),rinvsq00);
+             d                = _fjsp_sub_v2r8(r00,rswitch);
+             d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+             d2               = _fjsp_mul_v2r8(d,d);
+             sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+             dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+             /* Evaluate switch function */
+             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+             felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv00,_fjsp_mul_v2r8(velec,dsw)) );
+             fvdw             = _fjsp_msub_v2r8( fvdw,sw , _fjsp_mul_v2r8(rinv00,_fjsp_mul_v2r8(vvdw,dsw)) );
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+             fscal            = _fjsp_add_v2r8(felec,fvdw);
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             
+             fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq01,rcutoff2))
+             {
+             r01              = _fjsp_mul_v2r8(rsq01,rinv01);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r01,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq01,_fjsp_sub_v2r8(rinv01,velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq01,rinv01),_fjsp_sub_v2r8(rinvsq01,felec));
+             d                = _fjsp_sub_v2r8(r01,rswitch);
+             d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+             d2               = _fjsp_mul_v2r8(d,d);
+             sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+             dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+             /* Evaluate switch function */
+             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+             felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv01,_fjsp_mul_v2r8(velec,dsw)) );
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq01,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx01,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy01,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz01,fscal,fiz0);
+             
+             fjx1             = _fjsp_madd_v2r8(dx01,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy01,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz01,fscal,fjz1);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq02,rcutoff2))
+             {
+             r02              = _fjsp_mul_v2r8(rsq02,rinv02);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r02,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq02,_fjsp_sub_v2r8(rinv02,velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq02,rinv02),_fjsp_sub_v2r8(rinvsq02,felec));
+             d                = _fjsp_sub_v2r8(r02,rswitch);
+             d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+             d2               = _fjsp_mul_v2r8(d,d);
+             sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+             dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+             /* Evaluate switch function */
+             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+             felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv02,_fjsp_mul_v2r8(velec,dsw)) );
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq02,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx02,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy02,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz02,fscal,fiz0);
+             
+             fjx2             = _fjsp_madd_v2r8(dx02,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy02,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz02,fscal,fjz2);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq10,rcutoff2))
+             {
+             r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r10,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq10,_fjsp_sub_v2r8(rinv10,velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,rinv10),_fjsp_sub_v2r8(rinvsq10,felec));
+             d                = _fjsp_sub_v2r8(r10,rswitch);
+             d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+             d2               = _fjsp_mul_v2r8(d,d);
+             sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+             dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+             /* Evaluate switch function */
+             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+             felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv10,_fjsp_mul_v2r8(velec,dsw)) );
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq10,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+             
+             fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq11,rcutoff2))
+             {
+             r11              = _fjsp_mul_v2r8(rsq11,rinv11);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r11,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq11,_fjsp_sub_v2r8(rinv11,velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq11,rinv11),_fjsp_sub_v2r8(rinvsq11,felec));
+             d                = _fjsp_sub_v2r8(r11,rswitch);
+             d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+             d2               = _fjsp_mul_v2r8(d,d);
+             sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+             dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+             /* Evaluate switch function */
+             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+             felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv11,_fjsp_mul_v2r8(velec,dsw)) );
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq11,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+             
+             fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq12,rcutoff2))
+             {
+             r12              = _fjsp_mul_v2r8(rsq12,rinv12);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r12,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq12,_fjsp_sub_v2r8(rinv12,velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq12,rinv12),_fjsp_sub_v2r8(rinvsq12,felec));
+             d                = _fjsp_sub_v2r8(r12,rswitch);
+             d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+             d2               = _fjsp_mul_v2r8(d,d);
+             sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+             dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+             /* Evaluate switch function */
+             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+             felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv12,_fjsp_mul_v2r8(velec,dsw)) );
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq12,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+             
+             fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq20,rcutoff2))
+             {
+             r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r20,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq20,_fjsp_sub_v2r8(rinv20,velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,rinv20),_fjsp_sub_v2r8(rinvsq20,felec));
+             d                = _fjsp_sub_v2r8(r20,rswitch);
+             d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+             d2               = _fjsp_mul_v2r8(d,d);
+             sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+             dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+             /* Evaluate switch function */
+             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+             felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv20,_fjsp_mul_v2r8(velec,dsw)) );
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq20,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+             
+             fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq21,rcutoff2))
+             {
+             r21              = _fjsp_mul_v2r8(rsq21,rinv21);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r21,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq21,_fjsp_sub_v2r8(rinv21,velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq21,rinv21),_fjsp_sub_v2r8(rinvsq21,felec));
+             d                = _fjsp_sub_v2r8(r21,rswitch);
+             d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+             d2               = _fjsp_mul_v2r8(d,d);
+             sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+             dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+             /* Evaluate switch function */
+             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+             felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv21,_fjsp_mul_v2r8(velec,dsw)) );
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq21,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+             
+             fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq22,rcutoff2))
+             {
+             r22              = _fjsp_mul_v2r8(rsq22,rinv22);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r22,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq22,_fjsp_sub_v2r8(rinv22,velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq22,rinv22),_fjsp_sub_v2r8(rinvsq22,felec));
+             d                = _fjsp_sub_v2r8(r22,rswitch);
+             d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+             d2               = _fjsp_mul_v2r8(d,d);
+             sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+             dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+             /* Evaluate switch function */
+             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+             felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv22,_fjsp_mul_v2r8(velec,dsw)) );
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq22,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+             
+             fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+             }
+             gmx_fjsp_decrement_3rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+             /* Inner loop uses 600 flops */
+         }
+         /* End of innermost loop */
+         gmx_fjsp_update_iforce_3atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                               f+i_coord_offset,fshift+i_shift_offset);
+         /* Increment number of inner iterations */
+         inneriter                  += j_index_end - j_index_start;
+         /* Outer loop uses 18 flops */
+     }
+     /* Increment number of outer iterations */
+     outeriter        += nri;
+     /* Update outer/inner flops */
+     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_F,outeriter*18 + inneriter*600);
+ }
index 0000000000000000000000000000000000000000,1a8925d222afbd6af0f2c64994f9b626a5c1ee14..1a8925d222afbd6af0f2c64994f9b626a5c1ee14
mode 000000,100644..100644
--- /dev/null
@@@ -1,0 -1,1557 +1,1557 @@@
+ /*
+  * This file is part of the GROMACS molecular simulation package.
+  *
+  * Copyright (c) 2012, by the GROMACS development team, led by
+  * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+  * others, as listed in the AUTHORS file in the top-level source
+  * directory and at http://www.gromacs.org.
+  *
+  * GROMACS is free software; you can redistribute it and/or
+  * modify it under the terms of the GNU Lesser General Public License
+  * as published by the Free Software Foundation; either version 2.1
+  * of the License, or (at your option) any later version.
+  *
+  * GROMACS is distributed in the hope that it will be useful,
+  * but WITHOUT ANY WARRANTY; without even the implied warranty of
+  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  * Lesser General Public License for more details.
+  *
+  * You should have received a copy of the GNU Lesser General Public
+  * License along with GROMACS; if not, see
+  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+  *
+  * If you want to redistribute modifications to GROMACS, please
+  * consider that scientific software is very special. Version
+  * control is crucial - bugs must be traceable. We will be happy to
+  * consider code for inclusion in the official distribution, but
+  * derived work must not be called official GROMACS. Details are found
+  * in the README & COPYING files - if they are missing, get the
+  * official version at http://www.gromacs.org.
+  *
+  * To help us fund GROMACS development, we humbly ask that you cite
+  * the research papers on the package. Check out http://www.gromacs.org.
+  */
+ /*
+  * Note: this file was generated by the GROMACS sparc64_hpc_ace_double kernel generator.
+  */
+ #ifdef HAVE_CONFIG_H
+ #include <config.h>
+ #endif
+ #include <math.h>
+ #include "../nb_kernel.h"
+ #include "types/simple.h"
+ #include "vec.h"
+ #include "nrnb.h"
+ #include "kernelutil_sparc64_hpc_ace_double.h"
+ /*
+  * Gromacs nonbonded kernel:   nb_kernel_ElecEwSw_VdwLJSw_GeomW4P1_VF_sparc64_hpc_ace_double
+  * Electrostatics interaction: Ewald
+  * VdW interaction:            LennardJones
+  * Geometry:                   Water4-Particle
+  * Calculate force/pot:        PotentialAndForce
+  */
+ void
+ nb_kernel_ElecEwSw_VdwLJSw_GeomW4P1_VF_sparc64_hpc_ace_double
+                     (t_nblist * gmx_restrict                nlist,
+                      rvec * gmx_restrict                    xx,
+                      rvec * gmx_restrict                    ff,
+                      t_forcerec * gmx_restrict              fr,
+                      t_mdatoms * gmx_restrict               mdatoms,
+                      nb_kernel_data_t * gmx_restrict        kernel_data,
+                      t_nrnb * gmx_restrict                  nrnb)
+ {
+     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+      * just 0 for non-waters.
+      * Suffixes A,B refer to j loop unrolling done with double precision SIMD, i.e. for the two different
+      * jnr indices corresponding to data put in the two positions in the SIMD register.
+      */
+     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+     int              jnrA,jnrB;
+     int              j_coord_offsetA,j_coord_offsetB;
+     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+     real             rcutoff_scalar;
+     real             *shiftvec,*fshift,*x,*f;
+     _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+     int              vdwioffset0;
+     _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+     int              vdwioffset1;
+     _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+     int              vdwioffset2;
+     _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+     int              vdwioffset3;
+     _fjsp_v2r8       ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+     int              vdwjidx0A,vdwjidx0B;
+     _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+     _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+     _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+     _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+     _fjsp_v2r8       dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30;
+     _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+     real             *charge;
+     int              nvdwtype;
+     _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+     int              *vdwtype;
+     real             *vdwparam;
+     _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+     _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+     _fjsp_v2r8       ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+     real             *ewtab;
+     _fjsp_v2r8       rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
+     real             rswitch_scalar,d_scalar;
+     _fjsp_v2r8       itab_tmp;
+     _fjsp_v2r8       dummy_mask,cutoff_mask;
+     _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+     _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+     union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+     x                = xx[0];
+     f                = ff[0];
+     nri              = nlist->nri;
+     iinr             = nlist->iinr;
+     jindex           = nlist->jindex;
+     jjnr             = nlist->jjnr;
+     shiftidx         = nlist->shift;
+     gid              = nlist->gid;
+     shiftvec         = fr->shift_vec[0];
+     fshift           = fr->fshift[0];
+     facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+     charge           = mdatoms->chargeA;
+     nvdwtype         = fr->ntype;
+     vdwparam         = fr->nbfp;
+     vdwtype          = mdatoms->typeA;
+     sh_ewald         = gmx_fjsp_set1_v2r8(fr->ic->sh_ewald);
+     ewtab            = fr->ic->tabq_coul_FDV0;
+     ewtabscale       = gmx_fjsp_set1_v2r8(fr->ic->tabq_scale);
+     ewtabhalfspace   = gmx_fjsp_set1_v2r8(0.5/fr->ic->tabq_scale);
+     /* Setup water-specific parameters */
+     inr              = nlist->iinr[0];
+     iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+     iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+     iq3              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+3]));
+     vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+     /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+     rcutoff_scalar   = fr->rcoulomb;
+     rcutoff          = gmx_fjsp_set1_v2r8(rcutoff_scalar);
+     rcutoff2         = _fjsp_mul_v2r8(rcutoff,rcutoff);
+     rswitch_scalar   = fr->rcoulomb_switch;
+     rswitch          = gmx_fjsp_set1_v2r8(rswitch_scalar);
+     /* Setup switch parameters */
+     d_scalar         = rcutoff_scalar-rswitch_scalar;
+     d                = gmx_fjsp_set1_v2r8(d_scalar);
+     swV3             = gmx_fjsp_set1_v2r8(-10.0/(d_scalar*d_scalar*d_scalar));
+     swV4             = gmx_fjsp_set1_v2r8( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
+     swV5             = gmx_fjsp_set1_v2r8( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
+     swF2             = gmx_fjsp_set1_v2r8(-30.0/(d_scalar*d_scalar*d_scalar));
+     swF3             = gmx_fjsp_set1_v2r8( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
+     swF4             = gmx_fjsp_set1_v2r8(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
+     /* Avoid stupid compiler warnings */
+     jnrA = jnrB = 0;
+     j_coord_offsetA = 0;
+     j_coord_offsetB = 0;
+     outeriter        = 0;
+     inneriter        = 0;
+     /* Start outer loop over neighborlists */
+     for(iidx=0; iidx<nri; iidx++)
+     {
+         /* Load shift vector for this list */
+         i_shift_offset   = DIM*shiftidx[iidx];
+         /* Load limits for loop over neighbors */
+         j_index_start    = jindex[iidx];
+         j_index_end      = jindex[iidx+1];
+         /* Get outer coordinate index */
+         inr              = iinr[iidx];
+         i_coord_offset   = DIM*inr;
+         /* Load i particle coords and add shift vector */
+         gmx_fjsp_load_shift_and_4rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                  &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+         fix0             = _fjsp_setzero_v2r8();
+         fiy0             = _fjsp_setzero_v2r8();
+         fiz0             = _fjsp_setzero_v2r8();
+         fix1             = _fjsp_setzero_v2r8();
+         fiy1             = _fjsp_setzero_v2r8();
+         fiz1             = _fjsp_setzero_v2r8();
+         fix2             = _fjsp_setzero_v2r8();
+         fiy2             = _fjsp_setzero_v2r8();
+         fiz2             = _fjsp_setzero_v2r8();
+         fix3             = _fjsp_setzero_v2r8();
+         fiy3             = _fjsp_setzero_v2r8();
+         fiz3             = _fjsp_setzero_v2r8();
+         /* Reset potential sums */
+         velecsum         = _fjsp_setzero_v2r8();
+         vvdwsum          = _fjsp_setzero_v2r8();
+         /* Start inner kernel loop */
+         for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+         {
+             /* Get j neighbor index, and coordinate index */
+             jnrA             = jjnr[jidx];
+             jnrB             = jjnr[jidx+1];
+             j_coord_offsetA  = DIM*jnrA;
+             j_coord_offsetB  = DIM*jnrB;
+             /* load j atom coordinates */
+             gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                               &jx0,&jy0,&jz0);
+             /* Calculate displacement vector */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             dx10             = _fjsp_sub_v2r8(ix1,jx0);
+             dy10             = _fjsp_sub_v2r8(iy1,jy0);
+             dz10             = _fjsp_sub_v2r8(iz1,jz0);
+             dx20             = _fjsp_sub_v2r8(ix2,jx0);
+             dy20             = _fjsp_sub_v2r8(iy2,jy0);
+             dz20             = _fjsp_sub_v2r8(iz2,jz0);
+             dx30             = _fjsp_sub_v2r8(ix3,jx0);
+             dy30             = _fjsp_sub_v2r8(iy3,jy0);
+             dz30             = _fjsp_sub_v2r8(iz3,jz0);
+             /* Calculate squared distance and things based on it */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+             rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+             rsq30            = gmx_fjsp_calc_rsq_v2r8(dx30,dy30,dz30);
+             rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+             rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+             rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+             rinv30           = gmx_fjsp_invsqrt_v2r8(rsq30);
+             rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+             rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+             rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+             rinvsq30         = _fjsp_mul_v2r8(rinv30,rinv30);
+             /* Load parameters for j particles */
+             jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+             vdwjidx0A        = 2*vdwtype[jnrA+0];
+             vdwjidx0B        = 2*vdwtype[jnrB+0];
+             fjx0             = _fjsp_setzero_v2r8();
+             fjy0             = _fjsp_setzero_v2r8();
+             fjz0             = _fjsp_setzero_v2r8();
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+             {
+             r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+             /* Compute parameters for interactions between i and j atoms */
+             gmx_fjsp_load_2pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,
+                                          vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
+             /* LENNARD-JONES DISPERSION/REPULSION */
+             rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+             vvdw6            = _fjsp_mul_v2r8(c6_00,rinvsix);
+             vvdw12           = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
+             vvdw             = _fjsp_msub_v2r8( vvdw12,one_twelfth, _fjsp_mul_v2r8(vvdw6,one_sixth) );
+             fvdw             = _fjsp_mul_v2r8(_fjsp_sub_v2r8(vvdw12,vvdw6),rinvsq00);
+             d                = _fjsp_sub_v2r8(r00,rswitch);
+             d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+             d2               = _fjsp_mul_v2r8(d,d);
+             sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+             dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+             /* Evaluate switch function */
+             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+             fvdw             = _fjsp_msub_v2r8( fvdw,sw , _fjsp_mul_v2r8(rinv00,_fjsp_mul_v2r8(vvdw,dsw)) );
+             vvdw             = _fjsp_mul_v2r8(vvdw,sw);
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             vvdw             = _fjsp_and_v2r8(vvdw,cutoff_mask);
+             vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+             fscal            = fvdw;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             
+             fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq10,rcutoff2))
+             {
+             r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+             /* Compute parameters for interactions between i and j atoms */
+             qq10             = _fjsp_mul_v2r8(iq1,jq0);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r10,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq10,_fjsp_sub_v2r8(rinv10,velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,rinv10),_fjsp_sub_v2r8(rinvsq10,felec));
+             d                = _fjsp_sub_v2r8(r10,rswitch);
+             d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+             d2               = _fjsp_mul_v2r8(d,d);
+             sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+             dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+             /* Evaluate switch function */
+             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+             felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv10,_fjsp_mul_v2r8(velec,dsw)) );
+             velec            = _fjsp_mul_v2r8(velec,sw);
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq10,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+             
+             fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq20,rcutoff2))
+             {
+             r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+             /* Compute parameters for interactions between i and j atoms */
+             qq20             = _fjsp_mul_v2r8(iq2,jq0);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r20,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq20,_fjsp_sub_v2r8(rinv20,velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,rinv20),_fjsp_sub_v2r8(rinvsq20,felec));
+             d                = _fjsp_sub_v2r8(r20,rswitch);
+             d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+             d2               = _fjsp_mul_v2r8(d,d);
+             sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+             dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+             /* Evaluate switch function */
+             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+             felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv20,_fjsp_mul_v2r8(velec,dsw)) );
+             velec            = _fjsp_mul_v2r8(velec,sw);
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq20,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+             
+             fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq30,rcutoff2))
+             {
+             r30              = _fjsp_mul_v2r8(rsq30,rinv30);
+             /* Compute parameters for interactions between i and j atoms */
+             qq30             = _fjsp_mul_v2r8(iq3,jq0);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r30,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq30,_fjsp_sub_v2r8(rinv30,velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq30,rinv30),_fjsp_sub_v2r8(rinvsq30,felec));
+             d                = _fjsp_sub_v2r8(r30,rswitch);
+             d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+             d2               = _fjsp_mul_v2r8(d,d);
+             sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+             dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+             /* Evaluate switch function */
+             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+             felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv30,_fjsp_mul_v2r8(velec,dsw)) );
+             velec            = _fjsp_mul_v2r8(velec,sw);
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq30,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix3             = _fjsp_madd_v2r8(dx30,fscal,fix3);
+             fiy3             = _fjsp_madd_v2r8(dy30,fscal,fiy3);
+             fiz3             = _fjsp_madd_v2r8(dz30,fscal,fiz3);
+             
+             fjx0             = _fjsp_madd_v2r8(dx30,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy30,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz30,fscal,fjz0);
+             }
+             gmx_fjsp_decrement_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0);
+             /* Inner loop uses 269 flops */
+         }
+         if(jidx<j_index_end)
+         {
+             jnrA             = jjnr[jidx];
+             j_coord_offsetA  = DIM*jnrA;
+             /* load j atom coordinates */
+             gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                               &jx0,&jy0,&jz0);
+             /* Calculate displacement vector */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             dx10             = _fjsp_sub_v2r8(ix1,jx0);
+             dy10             = _fjsp_sub_v2r8(iy1,jy0);
+             dz10             = _fjsp_sub_v2r8(iz1,jz0);
+             dx20             = _fjsp_sub_v2r8(ix2,jx0);
+             dy20             = _fjsp_sub_v2r8(iy2,jy0);
+             dz20             = _fjsp_sub_v2r8(iz2,jz0);
+             dx30             = _fjsp_sub_v2r8(ix3,jx0);
+             dy30             = _fjsp_sub_v2r8(iy3,jy0);
+             dz30             = _fjsp_sub_v2r8(iz3,jz0);
+             /* Calculate squared distance and things based on it */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+             rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+             rsq30            = gmx_fjsp_calc_rsq_v2r8(dx30,dy30,dz30);
+             rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+             rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+             rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+             rinv30           = gmx_fjsp_invsqrt_v2r8(rsq30);
+             rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+             rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+             rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+             rinvsq30         = _fjsp_mul_v2r8(rinv30,rinv30);
+             /* Load parameters for j particles */
+             jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
+             vdwjidx0A        = 2*vdwtype[jnrA+0];
+             fjx0             = _fjsp_setzero_v2r8();
+             fjy0             = _fjsp_setzero_v2r8();
+             fjz0             = _fjsp_setzero_v2r8();
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+             {
+             r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+             /* Compute parameters for interactions between i and j atoms */
+             gmx_fjsp_load_1pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
+             /* LENNARD-JONES DISPERSION/REPULSION */
+             rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+             vvdw6            = _fjsp_mul_v2r8(c6_00,rinvsix);
+             vvdw12           = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
+             vvdw             = _fjsp_msub_v2r8( vvdw12,one_twelfth, _fjsp_mul_v2r8(vvdw6,one_sixth) );
+             fvdw             = _fjsp_mul_v2r8(_fjsp_sub_v2r8(vvdw12,vvdw6),rinvsq00);
+             d                = _fjsp_sub_v2r8(r00,rswitch);
+             d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+             d2               = _fjsp_mul_v2r8(d,d);
+             sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+             dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+             /* Evaluate switch function */
+             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+             fvdw             = _fjsp_msub_v2r8( fvdw,sw , _fjsp_mul_v2r8(rinv00,_fjsp_mul_v2r8(vvdw,dsw)) );
+             vvdw             = _fjsp_mul_v2r8(vvdw,sw);
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             vvdw             = _fjsp_and_v2r8(vvdw,cutoff_mask);
+             vvdw             = _fjsp_unpacklo_v2r8(vvdw,_fjsp_setzero_v2r8());
+             vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+             fscal            = fvdw;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             
+             fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq10,rcutoff2))
+             {
+             r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+             /* Compute parameters for interactions between i and j atoms */
+             qq10             = _fjsp_mul_v2r8(iq1,jq0);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r10,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq10,_fjsp_sub_v2r8(rinv10,velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,rinv10),_fjsp_sub_v2r8(rinvsq10,felec));
+             d                = _fjsp_sub_v2r8(r10,rswitch);
+             d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+             d2               = _fjsp_mul_v2r8(d,d);
+             sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+             dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+             /* Evaluate switch function */
+             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+             felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv10,_fjsp_mul_v2r8(velec,dsw)) );
+             velec            = _fjsp_mul_v2r8(velec,sw);
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq10,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+             
+             fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq20,rcutoff2))
+             {
+             r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+             /* Compute parameters for interactions between i and j atoms */
+             qq20             = _fjsp_mul_v2r8(iq2,jq0);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r20,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq20,_fjsp_sub_v2r8(rinv20,velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,rinv20),_fjsp_sub_v2r8(rinvsq20,felec));
+             d                = _fjsp_sub_v2r8(r20,rswitch);
+             d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+             d2               = _fjsp_mul_v2r8(d,d);
+             sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+             dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+             /* Evaluate switch function */
+             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+             felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv20,_fjsp_mul_v2r8(velec,dsw)) );
+             velec            = _fjsp_mul_v2r8(velec,sw);
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq20,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+             
+             fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq30,rcutoff2))
+             {
+             r30              = _fjsp_mul_v2r8(rsq30,rinv30);
+             /* Compute parameters for interactions between i and j atoms */
+             qq30             = _fjsp_mul_v2r8(iq3,jq0);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r30,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq30,_fjsp_sub_v2r8(rinv30,velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq30,rinv30),_fjsp_sub_v2r8(rinvsq30,felec));
+             d                = _fjsp_sub_v2r8(r30,rswitch);
+             d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+             d2               = _fjsp_mul_v2r8(d,d);
+             sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+             dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+             /* Evaluate switch function */
+             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+             felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv30,_fjsp_mul_v2r8(velec,dsw)) );
+             velec            = _fjsp_mul_v2r8(velec,sw);
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq30,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix3             = _fjsp_madd_v2r8(dx30,fscal,fix3);
+             fiy3             = _fjsp_madd_v2r8(dy30,fscal,fiy3);
+             fiz3             = _fjsp_madd_v2r8(dz30,fscal,fiz3);
+             
+             fjx0             = _fjsp_madd_v2r8(dx30,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy30,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz30,fscal,fjz0);
+             }
+             gmx_fjsp_decrement_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0);
+             /* Inner loop uses 269 flops */
+         }
+         /* End of innermost loop */
+         gmx_fjsp_update_iforce_4atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                               f+i_coord_offset,fshift+i_shift_offset);
+         ggid                        = gid[iidx];
+         /* Update potential energies */
+         gmx_fjsp_update_1pot_v2r8(velecsum,kernel_data->energygrp_elec+ggid);
+         gmx_fjsp_update_1pot_v2r8(vvdwsum,kernel_data->energygrp_vdw+ggid);
+         /* Increment number of inner iterations */
+         inneriter                  += j_index_end - j_index_start;
+         /* Outer loop uses 26 flops */
+     }
+     /* Increment number of outer iterations */
+     outeriter        += nri;
+     /* Update outer/inner flops */
+     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_VF,outeriter*26 + inneriter*269);
+ }
+ /*
+  * Gromacs nonbonded kernel:   nb_kernel_ElecEwSw_VdwLJSw_GeomW4P1_F_sparc64_hpc_ace_double
+  * Electrostatics interaction: Ewald (table lookup in fr->ic->tabq_coul_FDV0, switch function applied)
+  * VdW interaction:            LennardJones (switch function applied)
+  * Geometry:                   Water4-Particle
+  * Calculate force/pot:        Force
+  *
+  * For every neighborlist entry the four i atoms starting at iinr[iidx] interact
+  * with the listed j atoms: i atom 0 through Lennard-Jones only, i atoms 1-3
+  * through electrostatics only. Forces are accumulated in SIMD registers and
+  * handed to the gmx_fjsp_* helpers together with f and fshift; this variant does
+  * not write any energies (kernel_data is not referenced in the body).
+  *
+  * Arguments as used below:
+  *   nlist       - neighborlist; nri, iinr, jindex, jjnr, shift and gid are read
+  *   xx          - coordinates, accessed as a flat real array via xx[0]
+  *   ff          - forces, accessed as a flat real array via ff[0]
+  *   fr          - shift_vec, fshift, epsfac, ntype, nbfp, rcoulomb, rcoulomb_switch
+  *                 and the Ewald table data in fr->ic are read
+  *   mdatoms     - chargeA and typeA are read
+  *   kernel_data - not used by this force-only kernel
+  *   nrnb        - receives the flop count through inc_nrnb()
+  */
+ void
+ nb_kernel_ElecEwSw_VdwLJSw_GeomW4P1_F_sparc64_hpc_ace_double
+                     (t_nblist * gmx_restrict                nlist,
+                      rvec * gmx_restrict                    xx,
+                      rvec * gmx_restrict                    ff,
+                      t_forcerec * gmx_restrict              fr,
+                      t_mdatoms * gmx_restrict               mdatoms,
+                      nb_kernel_data_t * gmx_restrict        kernel_data,
+                      t_nrnb * gmx_restrict                  nrnb)
+ {
+     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+      * just 0 for non-waters.
+      * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+      * jnr indices corresponding to data put in the two positions in the SIMD register.
+      */
+     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+     int              jnrA,jnrB;
+     int              j_coord_offsetA,j_coord_offsetB;
+     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+     real             rcutoff_scalar;
+     real             *shiftvec,*fshift,*x,*f;
+     _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+     int              vdwioffset0;
+     _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+     int              vdwioffset1;
+     _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+     int              vdwioffset2;
+     _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+     int              vdwioffset3;
+     _fjsp_v2r8       ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+     int              vdwjidx0A,vdwjidx0B;
+     _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+     _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+     _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+     _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+     _fjsp_v2r8       dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30;
+     _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+     real             *charge;
+     int              nvdwtype;
+     _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+     int              *vdwtype;
+     real             *vdwparam;
+     _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+     _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+     _fjsp_v2r8       ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+     real             *ewtab;
+     _fjsp_v2r8       rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
+     real             rswitch_scalar,d_scalar;
+     _fjsp_v2r8       itab_tmp;
+     _fjsp_v2r8       dummy_mask,cutoff_mask;
+     _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+     _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+     union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv; /* ewconv is used below to read the two table indices back as integers */
+     x                = xx[0];
+     f                = ff[0];
+     nri              = nlist->nri;
+     iinr             = nlist->iinr;
+     jindex           = nlist->jindex;
+     jjnr             = nlist->jjnr;
+     shiftidx         = nlist->shift;
+     gid              = nlist->gid;
+     shiftvec         = fr->shift_vec[0];
+     fshift           = fr->fshift[0];
+     facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+     charge           = mdatoms->chargeA;
+     nvdwtype         = fr->ntype;
+     vdwparam         = fr->nbfp;
+     vdwtype          = mdatoms->typeA;
+     sh_ewald         = gmx_fjsp_set1_v2r8(fr->ic->sh_ewald);
+     ewtab            = fr->ic->tabq_coul_FDV0;
+     ewtabscale       = gmx_fjsp_set1_v2r8(fr->ic->tabq_scale);
+     ewtabhalfspace   = gmx_fjsp_set1_v2r8(0.5/fr->ic->tabq_scale);
+     /* Setup water-specific parameters.
+      * The charges of i atoms 1-3 (pre-multiplied by facel) and the VdW parameter
+      * offset of i atom 0 are read once from the first list entry, nlist->iinr[0],
+      * and reused for all entries.
+      * NOTE(review): this presumably relies on all i waters in the list sharing the
+      * same charges and atom types - confirm against the list construction.
+      */
+     inr              = nlist->iinr[0];
+     iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+     iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+     iq3              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+3]));
+     vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+     /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+     rcutoff_scalar   = fr->rcoulomb;
+     rcutoff          = gmx_fjsp_set1_v2r8(rcutoff_scalar);
+     rcutoff2         = _fjsp_mul_v2r8(rcutoff,rcutoff);
+     rswitch_scalar   = fr->rcoulomb_switch;
+     rswitch          = gmx_fjsp_set1_v2r8(rswitch_scalar);
+     /* Setup switch parameters.
+      * d_scalar is the width of the switching region (rcutoff - rswitch); swV3..swV5
+      * and swF2..swF4 are the coefficients of the polynomials evaluated below as
+      * sw and dsw, with d = max(r - rswitch, 0).
+      */
+     d_scalar         = rcutoff_scalar-rswitch_scalar;
+     d                = gmx_fjsp_set1_v2r8(d_scalar); /* d is reassigned for every interaction inside the loops */
+     swV3             = gmx_fjsp_set1_v2r8(-10.0/(d_scalar*d_scalar*d_scalar));
+     swV4             = gmx_fjsp_set1_v2r8( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
+     swV5             = gmx_fjsp_set1_v2r8( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
+     swF2             = gmx_fjsp_set1_v2r8(-30.0/(d_scalar*d_scalar*d_scalar));
+     swF3             = gmx_fjsp_set1_v2r8( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
+     swF4             = gmx_fjsp_set1_v2r8(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
+     /* Avoid stupid compiler warnings */
+     jnrA = jnrB = 0;
+     j_coord_offsetA = 0;
+     j_coord_offsetB = 0;
+     outeriter        = 0;
+     inneriter        = 0;
+     /* Start outer loop over neighborlists */
+     for(iidx=0; iidx<nri; iidx++)
+     {
+         /* Load shift vector for this list */
+         i_shift_offset   = DIM*shiftidx[iidx];
+         /* Load limits for loop over neighbors */
+         j_index_start    = jindex[iidx];
+         j_index_end      = jindex[iidx+1];
+         /* Get outer coordinate index */
+         inr              = iinr[iidx];
+         i_coord_offset   = DIM*inr;
+         /* Load i particle coords and add shift vector */
+         gmx_fjsp_load_shift_and_4rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                  &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+         fix0             = _fjsp_setzero_v2r8(); /* clear the i force accumulators for this list entry */
+         fiy0             = _fjsp_setzero_v2r8();
+         fiz0             = _fjsp_setzero_v2r8();
+         fix1             = _fjsp_setzero_v2r8();
+         fiy1             = _fjsp_setzero_v2r8();
+         fiz1             = _fjsp_setzero_v2r8();
+         fix2             = _fjsp_setzero_v2r8();
+         fiy2             = _fjsp_setzero_v2r8();
+         fiz2             = _fjsp_setzero_v2r8();
+         fix3             = _fjsp_setzero_v2r8();
+         fiy3             = _fjsp_setzero_v2r8();
+         fiz3             = _fjsp_setzero_v2r8();
+         /* Start inner kernel loop: two j entries (jidx and jidx+1) are handled per
+          * iteration; a leftover single entry is handled after the loop.
+          */
+         for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+         {
+             /* Get j neighbor index, and coordinate index */
+             jnrA             = jjnr[jidx];
+             jnrB             = jjnr[jidx+1];
+             j_coord_offsetA  = DIM*jnrA;
+             j_coord_offsetB  = DIM*jnrB;
+             /* load j atom coordinates */
+             gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                               &jx0,&jy0,&jz0);
+             /* Calculate displacement vector */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             dx10             = _fjsp_sub_v2r8(ix1,jx0);
+             dy10             = _fjsp_sub_v2r8(iy1,jy0);
+             dz10             = _fjsp_sub_v2r8(iz1,jz0);
+             dx20             = _fjsp_sub_v2r8(ix2,jx0);
+             dy20             = _fjsp_sub_v2r8(iy2,jy0);
+             dz20             = _fjsp_sub_v2r8(iz2,jz0);
+             dx30             = _fjsp_sub_v2r8(ix3,jx0);
+             dy30             = _fjsp_sub_v2r8(iy3,jy0);
+             dz30             = _fjsp_sub_v2r8(iz3,jz0);
+             /* Calculate squared distance and things based on it */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+             rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+             rsq30            = gmx_fjsp_calc_rsq_v2r8(dx30,dy30,dz30);
+             rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+             rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+             rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+             rinv30           = gmx_fjsp_invsqrt_v2r8(rsq30);
+             rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+             rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+             rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+             rinvsq30         = _fjsp_mul_v2r8(rinv30,rinv30);
+             /* Load parameters for j particles */
+             jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+             vdwjidx0A        = 2*vdwtype[jnrA+0];
+             vdwjidx0B        = 2*vdwtype[jnrB+0];
+             fjx0             = _fjsp_setzero_v2r8();
+             fjy0             = _fjsp_setzero_v2r8();
+             fjz0             = _fjsp_setzero_v2r8();
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2)) /* i atom 0 - j atom 0: Lennard-Jones only */
+             {
+             r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+             /* Compute parameters for interactions between i and j atoms */
+             gmx_fjsp_load_2pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,
+                                          vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
+             /* LENNARD-JONES DISPERSION/REPULSION */
+             rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+             vvdw6            = _fjsp_mul_v2r8(c6_00,rinvsix);
+             vvdw12           = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
+             vvdw             = _fjsp_msub_v2r8( vvdw12,one_twelfth, _fjsp_mul_v2r8(vvdw6,one_sixth) );
+             fvdw             = _fjsp_mul_v2r8(_fjsp_sub_v2r8(vvdw12,vvdw6),rinvsq00);
+             d                = _fjsp_sub_v2r8(r00,rswitch);
+             d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8()); /* clamp at zero: no negative d below rswitch */
+             d2               = _fjsp_mul_v2r8(d,d);
+             sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+             dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+             /* Evaluate switch function */
+             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+             fvdw             = _fjsp_msub_v2r8( fvdw,sw , _fjsp_mul_v2r8(rinv00,_fjsp_mul_v2r8(vvdw,dsw)) );
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+             fscal            = fvdw;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask); /* keep only the lanes selected by the rsq < rcutoff2 compare */
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             
+             fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq10,rcutoff2)) /* i atom 1 - j atom 0: Ewald electrostatics only */
+             {
+             r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+             /* Compute parameters for interactions between i and j atoms */
+             qq10             = _fjsp_mul_v2r8(iq1,jq0);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r10,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] ); /* table stride is 4 reals per index */
+             ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq10,_fjsp_sub_v2r8(rinv10,velec)); /* the potential is needed here for the v*dsw term of the switched force */
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,rinv10),_fjsp_sub_v2r8(rinvsq10,felec));
+             d                = _fjsp_sub_v2r8(r10,rswitch);
+             d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+             d2               = _fjsp_mul_v2r8(d,d);
+             sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+             dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+             /* Evaluate switch function */
+             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+             felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv10,_fjsp_mul_v2r8(velec,dsw)) );
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq10,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+             
+             fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq20,rcutoff2)) /* i atom 2 - j atom 0: Ewald electrostatics only */
+             {
+             r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+             /* Compute parameters for interactions between i and j atoms */
+             qq20             = _fjsp_mul_v2r8(iq2,jq0);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r20,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq20,_fjsp_sub_v2r8(rinv20,velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,rinv20),_fjsp_sub_v2r8(rinvsq20,felec));
+             d                = _fjsp_sub_v2r8(r20,rswitch);
+             d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+             d2               = _fjsp_mul_v2r8(d,d);
+             sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+             dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+             /* Evaluate switch function */
+             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+             felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv20,_fjsp_mul_v2r8(velec,dsw)) );
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq20,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+             
+             fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq30,rcutoff2)) /* i atom 3 - j atom 0: Ewald electrostatics only */
+             {
+             r30              = _fjsp_mul_v2r8(rsq30,rinv30);
+             /* Compute parameters for interactions between i and j atoms */
+             qq30             = _fjsp_mul_v2r8(iq3,jq0);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r30,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq30,_fjsp_sub_v2r8(rinv30,velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq30,rinv30),_fjsp_sub_v2r8(rinvsq30,felec));
+             d                = _fjsp_sub_v2r8(r30,rswitch);
+             d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+             d2               = _fjsp_mul_v2r8(d,d);
+             sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+             dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+             /* Evaluate switch function */
+             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+             felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv30,_fjsp_mul_v2r8(velec,dsw)) );
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq30,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix3             = _fjsp_madd_v2r8(dx30,fscal,fix3);
+             fiy3             = _fjsp_madd_v2r8(dy30,fscal,fiy3);
+             fiz3             = _fjsp_madd_v2r8(dz30,fscal,fiz3);
+             
+             fjx0             = _fjsp_madd_v2r8(dx30,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy30,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz30,fscal,fjz0);
+             }
+             gmx_fjsp_decrement_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0);
+             /* Inner loop uses 257 flops */
+         }
+         if(jidx<j_index_end) /* one leftover j entry: only jnrA is used in this block */
+         {
+             jnrA             = jjnr[jidx];
+             j_coord_offsetA  = DIM*jnrA;
+             /* load j atom coordinates */
+             gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                               &jx0,&jy0,&jz0);
+             /* Calculate displacement vector */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             dx10             = _fjsp_sub_v2r8(ix1,jx0);
+             dy10             = _fjsp_sub_v2r8(iy1,jy0);
+             dz10             = _fjsp_sub_v2r8(iz1,jz0);
+             dx20             = _fjsp_sub_v2r8(ix2,jx0);
+             dy20             = _fjsp_sub_v2r8(iy2,jy0);
+             dz20             = _fjsp_sub_v2r8(iz2,jz0);
+             dx30             = _fjsp_sub_v2r8(ix3,jx0);
+             dy30             = _fjsp_sub_v2r8(iy3,jy0);
+             dz30             = _fjsp_sub_v2r8(iz3,jz0);
+             /* Calculate squared distance and things based on it */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+             rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+             rsq30            = gmx_fjsp_calc_rsq_v2r8(dx30,dy30,dz30);
+             rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+             rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+             rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+             rinv30           = gmx_fjsp_invsqrt_v2r8(rsq30);
+             rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+             rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+             rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+             rinvsq30         = _fjsp_mul_v2r8(rinv30,rinv30);
+             /* Load parameters for j particles */
+             jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
+             vdwjidx0A        = 2*vdwtype[jnrA+0];
+             fjx0             = _fjsp_setzero_v2r8();
+             fjy0             = _fjsp_setzero_v2r8();
+             fjz0             = _fjsp_setzero_v2r8();
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2)) /* i atom 0 - j atom 0: Lennard-Jones only */
+             {
+             r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+             /* Compute parameters for interactions between i and j atoms */
+             gmx_fjsp_load_1pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
+             /* LENNARD-JONES DISPERSION/REPULSION */
+             rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+             vvdw6            = _fjsp_mul_v2r8(c6_00,rinvsix);
+             vvdw12           = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
+             vvdw             = _fjsp_msub_v2r8( vvdw12,one_twelfth, _fjsp_mul_v2r8(vvdw6,one_sixth) );
+             fvdw             = _fjsp_mul_v2r8(_fjsp_sub_v2r8(vvdw12,vvdw6),rinvsq00);
+             d                = _fjsp_sub_v2r8(r00,rswitch);
+             d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+             d2               = _fjsp_mul_v2r8(d,d);
+             sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+             dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+             /* Evaluate switch function */
+             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+             fvdw             = _fjsp_msub_v2r8( fvdw,sw , _fjsp_mul_v2r8(rinv00,_fjsp_mul_v2r8(vvdw,dsw)) );
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+             fscal            = fvdw;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8()); /* NOTE(review): presumably zeroes the second SIMD lane, which carries no j particle here - confirm */
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             
+             fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq10,rcutoff2)) /* i atom 1 - j atom 0: Ewald electrostatics only */
+             {
+             r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+             /* Compute parameters for interactions between i and j atoms */
+             qq10             = _fjsp_mul_v2r8(iq1,jq0);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r10,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] ); /* only table index i[0] is looked up in the single-j case */
+             ewtabD           = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq10,_fjsp_sub_v2r8(rinv10,velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,rinv10),_fjsp_sub_v2r8(rinvsq10,felec));
+             d                = _fjsp_sub_v2r8(r10,rswitch);
+             d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+             d2               = _fjsp_mul_v2r8(d,d);
+             sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+             dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+             /* Evaluate switch function */
+             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+             felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv10,_fjsp_mul_v2r8(velec,dsw)) );
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq10,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+             
+             fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq20,rcutoff2)) /* i atom 2 - j atom 0: Ewald electrostatics only */
+             {
+             r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+             /* Compute parameters for interactions between i and j atoms */
+             qq20             = _fjsp_mul_v2r8(iq2,jq0);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r20,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq20,_fjsp_sub_v2r8(rinv20,velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,rinv20),_fjsp_sub_v2r8(rinvsq20,felec));
+             d                = _fjsp_sub_v2r8(r20,rswitch);
+             d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+             d2               = _fjsp_mul_v2r8(d,d);
+             sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+             dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+             /* Evaluate switch function */
+             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+             felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv20,_fjsp_mul_v2r8(velec,dsw)) );
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq20,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+             
+             fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq30,rcutoff2)) /* i atom 3 - j atom 0: Ewald electrostatics only */
+             {
+             r30              = _fjsp_mul_v2r8(rsq30,rinv30);
+             /* Compute parameters for interactions between i and j atoms */
+             qq30             = _fjsp_mul_v2r8(iq3,jq0);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r30,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq30,_fjsp_sub_v2r8(rinv30,velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq30,rinv30),_fjsp_sub_v2r8(rinvsq30,felec));
+             d                = _fjsp_sub_v2r8(r30,rswitch);
+             d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+             d2               = _fjsp_mul_v2r8(d,d);
+             sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+             dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+             /* Evaluate switch function */
+             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+             felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv30,_fjsp_mul_v2r8(velec,dsw)) );
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq30,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix3             = _fjsp_madd_v2r8(dx30,fscal,fix3);
+             fiy3             = _fjsp_madd_v2r8(dy30,fscal,fiy3);
+             fiz3             = _fjsp_madd_v2r8(dz30,fscal,fiz3);
+             
+             fjx0             = _fjsp_madd_v2r8(dx30,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy30,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz30,fscal,fjz0);
+             }
+             gmx_fjsp_decrement_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0);
+             /* Inner loop uses 257 flops */
+         }
+         /* End of innermost loop */
+         gmx_fjsp_update_iforce_4atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                               f+i_coord_offset,fshift+i_shift_offset);
+         /* Increment number of inner iterations */
+         inneriter                  += j_index_end - j_index_start;
+         /* Outer loop uses 24 flops */
+     }
+     /* Increment number of outer iterations */
+     outeriter        += nri;
+     /* Update outer/inner flops (24 per outer and 257 per inner iteration, matching the per-loop notes above) */
+     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_F,outeriter*24 + inneriter*257);
+ }
index 0000000000000000000000000000000000000000,d76393f208796bbf27588e5be1a8db4438441d75..d76393f208796bbf27588e5be1a8db4438441d75
mode 000000,100644..100644
--- /dev/null
@@@ -1,0 -1,3139 +1,3139 @@@
+ /*
+  * This file is part of the GROMACS molecular simulation package.
+  *
+  * Copyright (c) 2012, by the GROMACS development team, led by
+  * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+  * others, as listed in the AUTHORS file in the top-level source
+  * directory and at http://www.gromacs.org.
+  *
+  * GROMACS is free software; you can redistribute it and/or
+  * modify it under the terms of the GNU Lesser General Public License
+  * as published by the Free Software Foundation; either version 2.1
+  * of the License, or (at your option) any later version.
+  *
+  * GROMACS is distributed in the hope that it will be useful,
+  * but WITHOUT ANY WARRANTY; without even the implied warranty of
+  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  * Lesser General Public License for more details.
+  *
+  * You should have received a copy of the GNU Lesser General Public
+  * License along with GROMACS; if not, see
+  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+  *
+  * If you want to redistribute modifications to GROMACS, please
+  * consider that scientific software is very special. Version
+  * control is crucial - bugs must be traceable. We will be happy to
+  * consider code for inclusion in the official distribution, but
+  * derived work must not be called official GROMACS. Details are found
+  * in the README & COPYING files - if they are missing, get the
+  * official version at http://www.gromacs.org.
+  *
+  * To help us fund GROMACS development, we humbly ask that you cite
+  * the research papers on the package. Check out http://www.gromacs.org.
+  */
+ /*
+  * Note: this file was generated by the GROMACS sparc64_hpc_ace_double kernel generator.
+  */
+ #ifdef HAVE_CONFIG_H
+ #include <config.h>
+ #endif
+ #include <math.h>
+ #include "../nb_kernel.h"
+ #include "types/simple.h"
+ #include "vec.h"
+ #include "nrnb.h"
+ #include "kernelutil_sparc64_hpc_ace_double.h"
+ /*
+  * Gromacs nonbonded kernel:   nb_kernel_ElecEwSw_VdwLJSw_GeomW4W4_VF_sparc64_hpc_ace_double
+  * Electrostatics interaction: Ewald
+  * VdW interaction:            LennardJones
+  * Geometry:                   Water4-Water4
+  * Calculate force/pot:        PotentialAndForce
+  */
+ void
+ nb_kernel_ElecEwSw_VdwLJSw_GeomW4W4_VF_sparc64_hpc_ace_double
+                     (t_nblist * gmx_restrict                nlist,
+                      rvec * gmx_restrict                    xx,
+                      rvec * gmx_restrict                    ff,
+                      t_forcerec * gmx_restrict              fr,
+                      t_mdatoms * gmx_restrict               mdatoms,
+                      nb_kernel_data_t * gmx_restrict        kernel_data,
+                      t_nrnb * gmx_restrict                  nrnb)
+ {
+     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+      * just 0 for non-waters.
+      * Suffixes A,B refer to j loop unrolling done with double precision SIMD, i.e. the two different
+      * jnr indices corresponding to data put in the two positions in the SIMD register.
+      */
+     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+     int              jnrA,jnrB;
+     int              j_coord_offsetA,j_coord_offsetB;
+     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+     real             rcutoff_scalar;
+     real             *shiftvec,*fshift,*x,*f;
+     _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+     int              vdwioffset0;
+     _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+     int              vdwioffset1;
+     _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+     int              vdwioffset2;
+     _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+     int              vdwioffset3;
+     _fjsp_v2r8       ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+     int              vdwjidx0A,vdwjidx0B;
+     _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+     int              vdwjidx1A,vdwjidx1B;
+     _fjsp_v2r8       jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+     int              vdwjidx2A,vdwjidx2B;
+     _fjsp_v2r8       jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+     int              vdwjidx3A,vdwjidx3B;
+     _fjsp_v2r8       jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
+     _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+     _fjsp_v2r8       dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+     _fjsp_v2r8       dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+     _fjsp_v2r8       dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
+     _fjsp_v2r8       dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+     _fjsp_v2r8       dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+     _fjsp_v2r8       dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
+     _fjsp_v2r8       dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
+     _fjsp_v2r8       dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
+     _fjsp_v2r8       dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
+     _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+     real             *charge;
+     int              nvdwtype;
+     _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+     int              *vdwtype;
+     real             *vdwparam;
+     _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+     _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+     _fjsp_v2r8       ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+     real             *ewtab;
+     _fjsp_v2r8       rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
+     real             rswitch_scalar,d_scalar;
+     _fjsp_v2r8       itab_tmp;
+     _fjsp_v2r8       dummy_mask,cutoff_mask;
+     _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+     _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+     union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+     x                = xx[0];
+     f                = ff[0];
+     nri              = nlist->nri;
+     iinr             = nlist->iinr;
+     jindex           = nlist->jindex;
+     jjnr             = nlist->jjnr;
+     shiftidx         = nlist->shift;
+     gid              = nlist->gid;
+     shiftvec         = fr->shift_vec[0];
+     fshift           = fr->fshift[0];
+     facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+     charge           = mdatoms->chargeA;
+     nvdwtype         = fr->ntype;
+     vdwparam         = fr->nbfp;
+     vdwtype          = mdatoms->typeA;
+     sh_ewald         = gmx_fjsp_set1_v2r8(fr->ic->sh_ewald);
+     ewtab            = fr->ic->tabq_coul_FDV0;
+     ewtabscale       = gmx_fjsp_set1_v2r8(fr->ic->tabq_scale);
+     ewtabhalfspace   = gmx_fjsp_set1_v2r8(0.5/fr->ic->tabq_scale);
+     /* Setup water-specific parameters */
+     inr              = nlist->iinr[0];
+     iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+     iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+     iq3              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+3]));
+     vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+     jq1              = gmx_fjsp_set1_v2r8(charge[inr+1]);
+     jq2              = gmx_fjsp_set1_v2r8(charge[inr+2]);
+     jq3              = gmx_fjsp_set1_v2r8(charge[inr+3]);
+     vdwjidx0A        = 2*vdwtype[inr+0];
+     c6_00            = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset0+vdwjidx0A]);
+     c12_00           = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset0+vdwjidx0A+1]);
+     qq11             = _fjsp_mul_v2r8(iq1,jq1);
+     qq12             = _fjsp_mul_v2r8(iq1,jq2);
+     qq13             = _fjsp_mul_v2r8(iq1,jq3);
+     qq21             = _fjsp_mul_v2r8(iq2,jq1);
+     qq22             = _fjsp_mul_v2r8(iq2,jq2);
+     qq23             = _fjsp_mul_v2r8(iq2,jq3);
+     qq31             = _fjsp_mul_v2r8(iq3,jq1);
+     qq32             = _fjsp_mul_v2r8(iq3,jq2);
+     qq33             = _fjsp_mul_v2r8(iq3,jq3);
+     /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+     rcutoff_scalar   = fr->rcoulomb;
+     rcutoff          = gmx_fjsp_set1_v2r8(rcutoff_scalar);
+     rcutoff2         = _fjsp_mul_v2r8(rcutoff,rcutoff);
+     rswitch_scalar   = fr->rcoulomb_switch;
+     rswitch          = gmx_fjsp_set1_v2r8(rswitch_scalar);
+     /* Setup switch parameters */
+     d_scalar         = rcutoff_scalar-rswitch_scalar;
+     d                = gmx_fjsp_set1_v2r8(d_scalar);
+     swV3             = gmx_fjsp_set1_v2r8(-10.0/(d_scalar*d_scalar*d_scalar));
+     swV4             = gmx_fjsp_set1_v2r8( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
+     swV5             = gmx_fjsp_set1_v2r8( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
+     swF2             = gmx_fjsp_set1_v2r8(-30.0/(d_scalar*d_scalar*d_scalar));
+     swF3             = gmx_fjsp_set1_v2r8( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
+     swF4             = gmx_fjsp_set1_v2r8(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
+     /* Avoid stupid compiler warnings */
+     jnrA = jnrB = 0;
+     j_coord_offsetA = 0;
+     j_coord_offsetB = 0;
+     outeriter        = 0;
+     inneriter        = 0;
+     /* Start outer loop over neighborlists */
+     for(iidx=0; iidx<nri; iidx++)
+     {
+         /* Load shift vector for this list */
+         i_shift_offset   = DIM*shiftidx[iidx];
+         /* Load limits for loop over neighbors */
+         j_index_start    = jindex[iidx];
+         j_index_end      = jindex[iidx+1];
+         /* Get outer coordinate index */
+         inr              = iinr[iidx];
+         i_coord_offset   = DIM*inr;
+         /* Load i particle coords and add shift vector */
+         gmx_fjsp_load_shift_and_4rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                  &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+         fix0             = _fjsp_setzero_v2r8();
+         fiy0             = _fjsp_setzero_v2r8();
+         fiz0             = _fjsp_setzero_v2r8();
+         fix1             = _fjsp_setzero_v2r8();
+         fiy1             = _fjsp_setzero_v2r8();
+         fiz1             = _fjsp_setzero_v2r8();
+         fix2             = _fjsp_setzero_v2r8();
+         fiy2             = _fjsp_setzero_v2r8();
+         fiz2             = _fjsp_setzero_v2r8();
+         fix3             = _fjsp_setzero_v2r8();
+         fiy3             = _fjsp_setzero_v2r8();
+         fiz3             = _fjsp_setzero_v2r8();
+         /* Reset potential sums */
+         velecsum         = _fjsp_setzero_v2r8();
+         vvdwsum          = _fjsp_setzero_v2r8();
+         /* Start inner kernel loop */
+         for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+         {
+             /* Get j neighbor index, and coordinate index */
+             jnrA             = jjnr[jidx];
+             jnrB             = jjnr[jidx+1];
+             j_coord_offsetA  = DIM*jnrA;
+             j_coord_offsetB  = DIM*jnrB;
+             /* load j atom coordinates */
+             gmx_fjsp_load_4rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
+                                               &jy2,&jz2,&jx3,&jy3,&jz3);
+             /* Calculate displacement vector */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             dx11             = _fjsp_sub_v2r8(ix1,jx1);
+             dy11             = _fjsp_sub_v2r8(iy1,jy1);
+             dz11             = _fjsp_sub_v2r8(iz1,jz1);
+             dx12             = _fjsp_sub_v2r8(ix1,jx2);
+             dy12             = _fjsp_sub_v2r8(iy1,jy2);
+             dz12             = _fjsp_sub_v2r8(iz1,jz2);
+             dx13             = _fjsp_sub_v2r8(ix1,jx3);
+             dy13             = _fjsp_sub_v2r8(iy1,jy3);
+             dz13             = _fjsp_sub_v2r8(iz1,jz3);
+             dx21             = _fjsp_sub_v2r8(ix2,jx1);
+             dy21             = _fjsp_sub_v2r8(iy2,jy1);
+             dz21             = _fjsp_sub_v2r8(iz2,jz1);
+             dx22             = _fjsp_sub_v2r8(ix2,jx2);
+             dy22             = _fjsp_sub_v2r8(iy2,jy2);
+             dz22             = _fjsp_sub_v2r8(iz2,jz2);
+             dx23             = _fjsp_sub_v2r8(ix2,jx3);
+             dy23             = _fjsp_sub_v2r8(iy2,jy3);
+             dz23             = _fjsp_sub_v2r8(iz2,jz3);
+             dx31             = _fjsp_sub_v2r8(ix3,jx1);
+             dy31             = _fjsp_sub_v2r8(iy3,jy1);
+             dz31             = _fjsp_sub_v2r8(iz3,jz1);
+             dx32             = _fjsp_sub_v2r8(ix3,jx2);
+             dy32             = _fjsp_sub_v2r8(iy3,jy2);
+             dz32             = _fjsp_sub_v2r8(iz3,jz2);
+             dx33             = _fjsp_sub_v2r8(ix3,jx3);
+             dy33             = _fjsp_sub_v2r8(iy3,jy3);
+             dz33             = _fjsp_sub_v2r8(iz3,jz3);
+             /* Calculate squared distance and things based on it */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+             rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+             rsq13            = gmx_fjsp_calc_rsq_v2r8(dx13,dy13,dz13);
+             rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+             rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+             rsq23            = gmx_fjsp_calc_rsq_v2r8(dx23,dy23,dz23);
+             rsq31            = gmx_fjsp_calc_rsq_v2r8(dx31,dy31,dz31);
+             rsq32            = gmx_fjsp_calc_rsq_v2r8(dx32,dy32,dz32);
+             rsq33            = gmx_fjsp_calc_rsq_v2r8(dx33,dy33,dz33);
+             rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+             rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+             rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+             rinv13           = gmx_fjsp_invsqrt_v2r8(rsq13);
+             rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+             rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+             rinv23           = gmx_fjsp_invsqrt_v2r8(rsq23);
+             rinv31           = gmx_fjsp_invsqrt_v2r8(rsq31);
+             rinv32           = gmx_fjsp_invsqrt_v2r8(rsq32);
+             rinv33           = gmx_fjsp_invsqrt_v2r8(rsq33);
+             rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+             rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+             rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+             rinvsq13         = _fjsp_mul_v2r8(rinv13,rinv13);
+             rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+             rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+             rinvsq23         = _fjsp_mul_v2r8(rinv23,rinv23);
+             rinvsq31         = _fjsp_mul_v2r8(rinv31,rinv31);
+             rinvsq32         = _fjsp_mul_v2r8(rinv32,rinv32);
+             rinvsq33         = _fjsp_mul_v2r8(rinv33,rinv33);
+             fjx0             = _fjsp_setzero_v2r8();
+             fjy0             = _fjsp_setzero_v2r8();
+             fjz0             = _fjsp_setzero_v2r8();
+             fjx1             = _fjsp_setzero_v2r8();
+             fjy1             = _fjsp_setzero_v2r8();
+             fjz1             = _fjsp_setzero_v2r8();
+             fjx2             = _fjsp_setzero_v2r8();
+             fjy2             = _fjsp_setzero_v2r8();
+             fjz2             = _fjsp_setzero_v2r8();
+             fjx3             = _fjsp_setzero_v2r8();
+             fjy3             = _fjsp_setzero_v2r8();
+             fjz3             = _fjsp_setzero_v2r8();
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+             {
+             r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+             /* LENNARD-JONES DISPERSION/REPULSION */
+             rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+             vvdw6            = _fjsp_mul_v2r8(c6_00,rinvsix);
+             vvdw12           = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
+             vvdw             = _fjsp_msub_v2r8( vvdw12,one_twelfth, _fjsp_mul_v2r8(vvdw6,one_sixth) );
+             fvdw             = _fjsp_mul_v2r8(_fjsp_sub_v2r8(vvdw12,vvdw6),rinvsq00);
+             d                = _fjsp_sub_v2r8(r00,rswitch);
+             d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+             d2               = _fjsp_mul_v2r8(d,d);
+             sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+             dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+             /* Evaluate switch function */
+             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+             fvdw             = _fjsp_msub_v2r8( fvdw,sw , _fjsp_mul_v2r8(rinv00,_fjsp_mul_v2r8(vvdw,dsw)) );
+             vvdw             = _fjsp_mul_v2r8(vvdw,sw);
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             vvdw             = _fjsp_and_v2r8(vvdw,cutoff_mask);
+             vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+             fscal            = fvdw;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             
+             fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq11,rcutoff2))
+             {
+             r11              = _fjsp_mul_v2r8(rsq11,rinv11);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r11,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq11,_fjsp_sub_v2r8(rinv11,velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq11,rinv11),_fjsp_sub_v2r8(rinvsq11,felec));
+             d                = _fjsp_sub_v2r8(r11,rswitch);
+             d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+             d2               = _fjsp_mul_v2r8(d,d);
+             sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+             dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+             /* Evaluate switch function */
+             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+             felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv11,_fjsp_mul_v2r8(velec,dsw)) );
+             velec            = _fjsp_mul_v2r8(velec,sw);
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq11,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+             
+             fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq12,rcutoff2))
+             {
+             r12              = _fjsp_mul_v2r8(rsq12,rinv12);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r12,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq12,_fjsp_sub_v2r8(rinv12,velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq12,rinv12),_fjsp_sub_v2r8(rinvsq12,felec));
+             d                = _fjsp_sub_v2r8(r12,rswitch);
+             d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+             d2               = _fjsp_mul_v2r8(d,d);
+             sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+             dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+             /* Evaluate switch function */
+             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+             felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv12,_fjsp_mul_v2r8(velec,dsw)) );
+             velec            = _fjsp_mul_v2r8(velec,sw);
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq12,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+             
+             fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq13,rcutoff2))
+             {
+             r13              = _fjsp_mul_v2r8(rsq13,rinv13);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r13,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq13,_fjsp_sub_v2r8(rinv13,velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq13,rinv13),_fjsp_sub_v2r8(rinvsq13,felec));
+             d                = _fjsp_sub_v2r8(r13,rswitch);
+             d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+             d2               = _fjsp_mul_v2r8(d,d);
+             sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+             dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+             /* Evaluate switch function */
+             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+             felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv13,_fjsp_mul_v2r8(velec,dsw)) );
+             velec            = _fjsp_mul_v2r8(velec,sw);
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq13,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx13,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy13,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz13,fscal,fiz1);
+             
+             fjx3             = _fjsp_madd_v2r8(dx13,fscal,fjx3);
+             fjy3             = _fjsp_madd_v2r8(dy13,fscal,fjy3);
+             fjz3             = _fjsp_madd_v2r8(dz13,fscal,fjz3);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq21,rcutoff2))
+             {
+             r21              = _fjsp_mul_v2r8(rsq21,rinv21);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r21,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq21,_fjsp_sub_v2r8(rinv21,velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq21,rinv21),_fjsp_sub_v2r8(rinvsq21,felec));
+             d                = _fjsp_sub_v2r8(r21,rswitch);
+             d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+             d2               = _fjsp_mul_v2r8(d,d);
+             sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+             dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+             /* Evaluate switch function */
+             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+             felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv21,_fjsp_mul_v2r8(velec,dsw)) );
+             velec            = _fjsp_mul_v2r8(velec,sw);
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq21,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+             
+             fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq22,rcutoff2))
+             {
+             r22              = _fjsp_mul_v2r8(rsq22,rinv22);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r22,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq22,_fjsp_sub_v2r8(rinv22,velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq22,rinv22),_fjsp_sub_v2r8(rinvsq22,felec));
+             d                = _fjsp_sub_v2r8(r22,rswitch);
+             d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+             d2               = _fjsp_mul_v2r8(d,d);
+             sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+             dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+             /* Evaluate switch function */
+             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+             felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv22,_fjsp_mul_v2r8(velec,dsw)) );
+             velec            = _fjsp_mul_v2r8(velec,sw);
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq22,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+             
+             fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq23,rcutoff2))
+             {
+             r23              = _fjsp_mul_v2r8(rsq23,rinv23);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r23,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq23,_fjsp_sub_v2r8(rinv23,velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq23,rinv23),_fjsp_sub_v2r8(rinvsq23,felec));
+             d                = _fjsp_sub_v2r8(r23,rswitch);
+             d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+             d2               = _fjsp_mul_v2r8(d,d);
+             sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+             dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+             /* Evaluate switch function */
+             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+             felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv23,_fjsp_mul_v2r8(velec,dsw)) );
+             velec            = _fjsp_mul_v2r8(velec,sw);
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq23,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx23,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy23,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz23,fscal,fiz2);
+             
+             fjx3             = _fjsp_madd_v2r8(dx23,fscal,fjx3);
+             fjy3             = _fjsp_madd_v2r8(dy23,fscal,fjy3);
+             fjz3             = _fjsp_madd_v2r8(dz23,fscal,fjz3);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq31,rcutoff2))
+             {
+             r31              = _fjsp_mul_v2r8(rsq31,rinv31);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r31,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq31,_fjsp_sub_v2r8(rinv31,velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq31,rinv31),_fjsp_sub_v2r8(rinvsq31,felec));
+             d                = _fjsp_sub_v2r8(r31,rswitch);
+             d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+             d2               = _fjsp_mul_v2r8(d,d);
+             sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+             dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+             /* Evaluate switch function */
+             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+             felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv31,_fjsp_mul_v2r8(velec,dsw)) );
+             velec            = _fjsp_mul_v2r8(velec,sw);
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq31,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix3             = _fjsp_madd_v2r8(dx31,fscal,fix3);
+             fiy3             = _fjsp_madd_v2r8(dy31,fscal,fiy3);
+             fiz3             = _fjsp_madd_v2r8(dz31,fscal,fiz3);
+             
+             fjx1             = _fjsp_madd_v2r8(dx31,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy31,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz31,fscal,fjz1);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq32,rcutoff2))
+             {
+             r32              = _fjsp_mul_v2r8(rsq32,rinv32);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r32,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq32,_fjsp_sub_v2r8(rinv32,velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq32,rinv32),_fjsp_sub_v2r8(rinvsq32,felec));
+             d                = _fjsp_sub_v2r8(r32,rswitch);
+             d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+             d2               = _fjsp_mul_v2r8(d,d);
+             sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+             dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+             /* Evaluate switch function */
+             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+             felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv32,_fjsp_mul_v2r8(velec,dsw)) );
+             velec            = _fjsp_mul_v2r8(velec,sw);
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq32,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix3             = _fjsp_madd_v2r8(dx32,fscal,fix3);
+             fiy3             = _fjsp_madd_v2r8(dy32,fscal,fiy3);
+             fiz3             = _fjsp_madd_v2r8(dz32,fscal,fiz3);
+             
+             fjx2             = _fjsp_madd_v2r8(dx32,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy32,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz32,fscal,fjz2);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq33,rcutoff2))
+             {
+             r33              = _fjsp_mul_v2r8(rsq33,rinv33);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r33,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq33,_fjsp_sub_v2r8(rinv33,velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq33,rinv33),_fjsp_sub_v2r8(rinvsq33,felec));
+             d                = _fjsp_sub_v2r8(r33,rswitch);
+             d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+             d2               = _fjsp_mul_v2r8(d,d);
+             sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+             dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+             /* Evaluate switch function */
+             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+             felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv33,_fjsp_mul_v2r8(velec,dsw)) );
+             velec            = _fjsp_mul_v2r8(velec,sw);
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq33,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix3             = _fjsp_madd_v2r8(dx33,fscal,fix3);
+             fiy3             = _fjsp_madd_v2r8(dy33,fscal,fiy3);
+             fiz3             = _fjsp_madd_v2r8(dz33,fscal,fiz3);
+             
+             fjx3             = _fjsp_madd_v2r8(dx33,fscal,fjx3);
+             fjy3             = _fjsp_madd_v2r8(dy33,fscal,fjy3);
+             fjz3             = _fjsp_madd_v2r8(dz33,fscal,fjz3);
+             }
+             gmx_fjsp_decrement_4rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+             /* Inner loop uses 677 flops */
+         }
+         if(jidx<j_index_end)
+         {
+             jnrA             = jjnr[jidx];
+             j_coord_offsetA  = DIM*jnrA;
+             /* load j atom coordinates */
+             gmx_fjsp_load_4rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
+                                               &jy2,&jz2,&jx3,&jy3,&jz3);
+             /* Calculate displacement vector */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             dx11             = _fjsp_sub_v2r8(ix1,jx1);
+             dy11             = _fjsp_sub_v2r8(iy1,jy1);
+             dz11             = _fjsp_sub_v2r8(iz1,jz1);
+             dx12             = _fjsp_sub_v2r8(ix1,jx2);
+             dy12             = _fjsp_sub_v2r8(iy1,jy2);
+             dz12             = _fjsp_sub_v2r8(iz1,jz2);
+             dx13             = _fjsp_sub_v2r8(ix1,jx3);
+             dy13             = _fjsp_sub_v2r8(iy1,jy3);
+             dz13             = _fjsp_sub_v2r8(iz1,jz3);
+             dx21             = _fjsp_sub_v2r8(ix2,jx1);
+             dy21             = _fjsp_sub_v2r8(iy2,jy1);
+             dz21             = _fjsp_sub_v2r8(iz2,jz1);
+             dx22             = _fjsp_sub_v2r8(ix2,jx2);
+             dy22             = _fjsp_sub_v2r8(iy2,jy2);
+             dz22             = _fjsp_sub_v2r8(iz2,jz2);
+             dx23             = _fjsp_sub_v2r8(ix2,jx3);
+             dy23             = _fjsp_sub_v2r8(iy2,jy3);
+             dz23             = _fjsp_sub_v2r8(iz2,jz3);
+             dx31             = _fjsp_sub_v2r8(ix3,jx1);
+             dy31             = _fjsp_sub_v2r8(iy3,jy1);
+             dz31             = _fjsp_sub_v2r8(iz3,jz1);
+             dx32             = _fjsp_sub_v2r8(ix3,jx2);
+             dy32             = _fjsp_sub_v2r8(iy3,jy2);
+             dz32             = _fjsp_sub_v2r8(iz3,jz2);
+             dx33             = _fjsp_sub_v2r8(ix3,jx3);
+             dy33             = _fjsp_sub_v2r8(iy3,jy3);
+             dz33             = _fjsp_sub_v2r8(iz3,jz3);
+             /* Calculate squared distance and things based on it */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+             rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+             rsq13            = gmx_fjsp_calc_rsq_v2r8(dx13,dy13,dz13);
+             rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+             rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+             rsq23            = gmx_fjsp_calc_rsq_v2r8(dx23,dy23,dz23);
+             rsq31            = gmx_fjsp_calc_rsq_v2r8(dx31,dy31,dz31);
+             rsq32            = gmx_fjsp_calc_rsq_v2r8(dx32,dy32,dz32);
+             rsq33            = gmx_fjsp_calc_rsq_v2r8(dx33,dy33,dz33);
+             rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+             rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+             rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+             rinv13           = gmx_fjsp_invsqrt_v2r8(rsq13);
+             rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+             rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+             rinv23           = gmx_fjsp_invsqrt_v2r8(rsq23);
+             rinv31           = gmx_fjsp_invsqrt_v2r8(rsq31);
+             rinv32           = gmx_fjsp_invsqrt_v2r8(rsq32);
+             rinv33           = gmx_fjsp_invsqrt_v2r8(rsq33);
+             rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+             rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+             rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+             rinvsq13         = _fjsp_mul_v2r8(rinv13,rinv13);
+             rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+             rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+             rinvsq23         = _fjsp_mul_v2r8(rinv23,rinv23);
+             rinvsq31         = _fjsp_mul_v2r8(rinv31,rinv31);
+             rinvsq32         = _fjsp_mul_v2r8(rinv32,rinv32);
+             rinvsq33         = _fjsp_mul_v2r8(rinv33,rinv33);
+             fjx0             = _fjsp_setzero_v2r8();
+             fjy0             = _fjsp_setzero_v2r8();
+             fjz0             = _fjsp_setzero_v2r8();
+             fjx1             = _fjsp_setzero_v2r8();
+             fjy1             = _fjsp_setzero_v2r8();
+             fjz1             = _fjsp_setzero_v2r8();
+             fjx2             = _fjsp_setzero_v2r8();
+             fjy2             = _fjsp_setzero_v2r8();
+             fjz2             = _fjsp_setzero_v2r8();
+             fjx3             = _fjsp_setzero_v2r8();
+             fjy3             = _fjsp_setzero_v2r8();
+             fjz3             = _fjsp_setzero_v2r8();
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+             {
+             r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+             /* LENNARD-JONES DISPERSION/REPULSION */
+             rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+             vvdw6            = _fjsp_mul_v2r8(c6_00,rinvsix);
+             vvdw12           = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
+             vvdw             = _fjsp_msub_v2r8( vvdw12,one_twelfth, _fjsp_mul_v2r8(vvdw6,one_sixth) );
+             fvdw             = _fjsp_mul_v2r8(_fjsp_sub_v2r8(vvdw12,vvdw6),rinvsq00);
+             d                = _fjsp_sub_v2r8(r00,rswitch);
+             d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+             d2               = _fjsp_mul_v2r8(d,d);
+             sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+             dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+             /* Evaluate switch function */
+             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+             fvdw             = _fjsp_msub_v2r8( fvdw,sw , _fjsp_mul_v2r8(rinv00,_fjsp_mul_v2r8(vvdw,dsw)) );
+             vvdw             = _fjsp_mul_v2r8(vvdw,sw);
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             vvdw             = _fjsp_and_v2r8(vvdw,cutoff_mask);
+             vvdw             = _fjsp_unpacklo_v2r8(vvdw,_fjsp_setzero_v2r8());
+             vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+             fscal            = fvdw;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             
+             fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq11,rcutoff2))
+             {
+             r11              = _fjsp_mul_v2r8(rsq11,rinv11);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r11,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq11,_fjsp_sub_v2r8(rinv11,velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq11,rinv11),_fjsp_sub_v2r8(rinvsq11,felec));
+             d                = _fjsp_sub_v2r8(r11,rswitch);
+             d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+             d2               = _fjsp_mul_v2r8(d,d);
+             sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+             dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+             /* Evaluate switch function */
+             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+             felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv11,_fjsp_mul_v2r8(velec,dsw)) );
+             velec            = _fjsp_mul_v2r8(velec,sw);
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq11,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+             
+             fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq12,rcutoff2))
+             {
+             r12              = _fjsp_mul_v2r8(rsq12,rinv12);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r12,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq12,_fjsp_sub_v2r8(rinv12,velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq12,rinv12),_fjsp_sub_v2r8(rinvsq12,felec));
+             d                = _fjsp_sub_v2r8(r12,rswitch);
+             d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+             d2               = _fjsp_mul_v2r8(d,d);
+             sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+             dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+             /* Evaluate switch function */
+             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+             felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv12,_fjsp_mul_v2r8(velec,dsw)) );
+             velec            = _fjsp_mul_v2r8(velec,sw);
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq12,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+             
+             fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq13,rcutoff2))
+             {
+             r13              = _fjsp_mul_v2r8(rsq13,rinv13);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r13,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq13,_fjsp_sub_v2r8(rinv13,velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq13,rinv13),_fjsp_sub_v2r8(rinvsq13,felec));
+             d                = _fjsp_sub_v2r8(r13,rswitch);
+             d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+             d2               = _fjsp_mul_v2r8(d,d);
+             sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+             dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+             /* Evaluate switch function */
+             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+             felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv13,_fjsp_mul_v2r8(velec,dsw)) );
+             velec            = _fjsp_mul_v2r8(velec,sw);
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq13,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx13,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy13,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz13,fscal,fiz1);
+             
+             fjx3             = _fjsp_madd_v2r8(dx13,fscal,fjx3);
+             fjy3             = _fjsp_madd_v2r8(dy13,fscal,fjy3);
+             fjz3             = _fjsp_madd_v2r8(dz13,fscal,fjz3);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq21,rcutoff2))
+             {
+             r21              = _fjsp_mul_v2r8(rsq21,rinv21);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r21,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq21,_fjsp_sub_v2r8(rinv21,velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq21,rinv21),_fjsp_sub_v2r8(rinvsq21,felec));
+             d                = _fjsp_sub_v2r8(r21,rswitch);
+             d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+             d2               = _fjsp_mul_v2r8(d,d);
+             sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+             dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+             /* Evaluate switch function */
+             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+             felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv21,_fjsp_mul_v2r8(velec,dsw)) );
+             velec            = _fjsp_mul_v2r8(velec,sw);
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq21,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+             
+             fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq22,rcutoff2))
+             {
+             r22              = _fjsp_mul_v2r8(rsq22,rinv22);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r22,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq22,_fjsp_sub_v2r8(rinv22,velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq22,rinv22),_fjsp_sub_v2r8(rinvsq22,felec));
+             d                = _fjsp_sub_v2r8(r22,rswitch);
+             d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+             d2               = _fjsp_mul_v2r8(d,d);
+             sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+             dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+             /* Evaluate switch function */
+             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+             felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv22,_fjsp_mul_v2r8(velec,dsw)) );
+             velec            = _fjsp_mul_v2r8(velec,sw);
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq22,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+             
+             fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq23,rcutoff2))
+             {
+             r23              = _fjsp_mul_v2r8(rsq23,rinv23);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r23,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq23,_fjsp_sub_v2r8(rinv23,velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq23,rinv23),_fjsp_sub_v2r8(rinvsq23,felec));
+             d                = _fjsp_sub_v2r8(r23,rswitch);
+             d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+             d2               = _fjsp_mul_v2r8(d,d);
+             sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+             dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+             /* Evaluate switch function */
+             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+             felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv23,_fjsp_mul_v2r8(velec,dsw)) );
+             velec            = _fjsp_mul_v2r8(velec,sw);
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq23,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx23,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy23,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz23,fscal,fiz2);
+             
+             fjx3             = _fjsp_madd_v2r8(dx23,fscal,fjx3);
+             fjy3             = _fjsp_madd_v2r8(dy23,fscal,fjy3);
+             fjz3             = _fjsp_madd_v2r8(dz23,fscal,fjz3);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq31,rcutoff2))
+             {
+             r31              = _fjsp_mul_v2r8(rsq31,rinv31);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r31,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq31,_fjsp_sub_v2r8(rinv31,velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq31,rinv31),_fjsp_sub_v2r8(rinvsq31,felec));
+             d                = _fjsp_sub_v2r8(r31,rswitch);
+             d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+             d2               = _fjsp_mul_v2r8(d,d);
+             sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+             dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+             /* Evaluate switch function */
+             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+             felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv31,_fjsp_mul_v2r8(velec,dsw)) );
+             velec            = _fjsp_mul_v2r8(velec,sw);
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq31,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix3             = _fjsp_madd_v2r8(dx31,fscal,fix3);
+             fiy3             = _fjsp_madd_v2r8(dy31,fscal,fiy3);
+             fiz3             = _fjsp_madd_v2r8(dz31,fscal,fiz3);
+             
+             fjx1             = _fjsp_madd_v2r8(dx31,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy31,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz31,fscal,fjz1);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq32,rcutoff2))
+             {
+             r32              = _fjsp_mul_v2r8(rsq32,rinv32);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r32,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq32,_fjsp_sub_v2r8(rinv32,velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq32,rinv32),_fjsp_sub_v2r8(rinvsq32,felec));
+             d                = _fjsp_sub_v2r8(r32,rswitch);
+             d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+             d2               = _fjsp_mul_v2r8(d,d);
+             sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+             dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+             /* Evaluate switch function */
+             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+             felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv32,_fjsp_mul_v2r8(velec,dsw)) );
+             velec            = _fjsp_mul_v2r8(velec,sw);
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq32,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix3             = _fjsp_madd_v2r8(dx32,fscal,fix3);
+             fiy3             = _fjsp_madd_v2r8(dy32,fscal,fiy3);
+             fiz3             = _fjsp_madd_v2r8(dz32,fscal,fiz3);
+             
+             fjx2             = _fjsp_madd_v2r8(dx32,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy32,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz32,fscal,fjz2);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq33,rcutoff2))
+             {
+             r33              = _fjsp_mul_v2r8(rsq33,rinv33);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r33,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq33,_fjsp_sub_v2r8(rinv33,velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq33,rinv33),_fjsp_sub_v2r8(rinvsq33,felec));
+             d                = _fjsp_sub_v2r8(r33,rswitch);
+             d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+             d2               = _fjsp_mul_v2r8(d,d);
+             sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+             dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+             /* Evaluate switch function */
+             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+             felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv33,_fjsp_mul_v2r8(velec,dsw)) );
+             velec            = _fjsp_mul_v2r8(velec,sw);
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq33,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix3             = _fjsp_madd_v2r8(dx33,fscal,fix3);
+             fiy3             = _fjsp_madd_v2r8(dy33,fscal,fiy3);
+             fiz3             = _fjsp_madd_v2r8(dz33,fscal,fiz3);
+             
+             fjx3             = _fjsp_madd_v2r8(dx33,fscal,fjx3);
+             fjy3             = _fjsp_madd_v2r8(dy33,fscal,fjy3);
+             fjz3             = _fjsp_madd_v2r8(dz33,fscal,fjz3);
+             }
+             gmx_fjsp_decrement_4rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+             /* Inner loop uses 677 flops */
+         }
+         /* End of innermost loop */
+         gmx_fjsp_update_iforce_4atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                               f+i_coord_offset,fshift+i_shift_offset);
+         ggid                        = gid[iidx];
+         /* Update potential energies */
+         gmx_fjsp_update_1pot_v2r8(velecsum,kernel_data->energygrp_elec+ggid);
+         gmx_fjsp_update_1pot_v2r8(vvdwsum,kernel_data->energygrp_vdw+ggid);
+         /* Increment number of inner iterations */
+         inneriter                  += j_index_end - j_index_start;
+         /* Outer loop uses 26 flops */
+     }
+     /* Increment number of outer iterations */
+     outeriter        += nri;
+     /* Update outer/inner flops */
+     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_VF,outeriter*26 + inneriter*677);
+ }
+ /*
+  * Gromacs nonbonded kernel:   nb_kernel_ElecEwSw_VdwLJSw_GeomW4W4_F_sparc64_hpc_ace_double
+  * Electrostatics interaction: Ewald
+  * VdW interaction:            LennardJones
+  * Geometry:                   Water4-Water4
+  * Calculate force/pot:        Force
+  */
+ void
+ nb_kernel_ElecEwSw_VdwLJSw_GeomW4W4_F_sparc64_hpc_ace_double
+                     (t_nblist * gmx_restrict                nlist,
+                      rvec * gmx_restrict                    xx,
+                      rvec * gmx_restrict                    ff,
+                      t_forcerec * gmx_restrict              fr,
+                      t_mdatoms * gmx_restrict               mdatoms,
+                      nb_kernel_data_t * gmx_restrict        kernel_data,
+                      t_nrnb * gmx_restrict                  nrnb)
+ {
+     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+      * just 0 for non-waters.
+      * Suffixes A,B refer to j loop unrolling done with double precision SIMD, i.e. for the two different
+      * jnr indices corresponding to data put in the two positions in the SIMD register.
+      */
+     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+     int              jnrA,jnrB;
+     int              j_coord_offsetA,j_coord_offsetB;
+     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+     real             rcutoff_scalar;
+     real             *shiftvec,*fshift,*x,*f;
+     _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+     int              vdwioffset0;
+     _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+     int              vdwioffset1;
+     _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+     int              vdwioffset2;
+     _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+     int              vdwioffset3;
+     _fjsp_v2r8       ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+     int              vdwjidx0A,vdwjidx0B;
+     _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+     int              vdwjidx1A,vdwjidx1B;
+     _fjsp_v2r8       jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+     int              vdwjidx2A,vdwjidx2B;
+     _fjsp_v2r8       jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+     int              vdwjidx3A,vdwjidx3B;
+     _fjsp_v2r8       jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
+     _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+     _fjsp_v2r8       dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+     _fjsp_v2r8       dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+     _fjsp_v2r8       dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
+     _fjsp_v2r8       dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+     _fjsp_v2r8       dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+     _fjsp_v2r8       dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
+     _fjsp_v2r8       dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
+     _fjsp_v2r8       dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
+     _fjsp_v2r8       dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
+     _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+     real             *charge;
+     int              nvdwtype;
+     _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+     int              *vdwtype;
+     real             *vdwparam;
+     _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+     _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+     _fjsp_v2r8       ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+     real             *ewtab;
+     _fjsp_v2r8       rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
+     real             rswitch_scalar,d_scalar;
+     _fjsp_v2r8       itab_tmp;
+     _fjsp_v2r8       dummy_mask,cutoff_mask;
+     _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+     _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+     union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+     x                = xx[0];
+     f                = ff[0];
+     nri              = nlist->nri;
+     iinr             = nlist->iinr;
+     jindex           = nlist->jindex;
+     jjnr             = nlist->jjnr;
+     shiftidx         = nlist->shift;
+     gid              = nlist->gid;
+     shiftvec         = fr->shift_vec[0];
+     fshift           = fr->fshift[0];
+     facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+     charge           = mdatoms->chargeA;
+     nvdwtype         = fr->ntype;
+     vdwparam         = fr->nbfp;
+     vdwtype          = mdatoms->typeA;
+     sh_ewald         = gmx_fjsp_set1_v2r8(fr->ic->sh_ewald);
+     ewtab            = fr->ic->tabq_coul_FDV0;
+     ewtabscale       = gmx_fjsp_set1_v2r8(fr->ic->tabq_scale);
+     ewtabhalfspace   = gmx_fjsp_set1_v2r8(0.5/fr->ic->tabq_scale);
+     /* Setup water-specific parameters */
+     inr              = nlist->iinr[0];
+     iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+     iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+     iq3              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+3]));
+     vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+     jq1              = gmx_fjsp_set1_v2r8(charge[inr+1]);
+     jq2              = gmx_fjsp_set1_v2r8(charge[inr+2]);
+     jq3              = gmx_fjsp_set1_v2r8(charge[inr+3]);
+     vdwjidx0A        = 2*vdwtype[inr+0];
+     c6_00            = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset0+vdwjidx0A]);
+     c12_00           = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset0+vdwjidx0A+1]);
+     qq11             = _fjsp_mul_v2r8(iq1,jq1);
+     qq12             = _fjsp_mul_v2r8(iq1,jq2);
+     qq13             = _fjsp_mul_v2r8(iq1,jq3);
+     qq21             = _fjsp_mul_v2r8(iq2,jq1);
+     qq22             = _fjsp_mul_v2r8(iq2,jq2);
+     qq23             = _fjsp_mul_v2r8(iq2,jq3);
+     qq31             = _fjsp_mul_v2r8(iq3,jq1);
+     qq32             = _fjsp_mul_v2r8(iq3,jq2);
+     qq33             = _fjsp_mul_v2r8(iq3,jq3);
+     /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+     rcutoff_scalar   = fr->rcoulomb;
+     rcutoff          = gmx_fjsp_set1_v2r8(rcutoff_scalar);
+     rcutoff2         = _fjsp_mul_v2r8(rcutoff,rcutoff);
+     rswitch_scalar   = fr->rcoulomb_switch;
+     rswitch          = gmx_fjsp_set1_v2r8(rswitch_scalar);
+     /* Setup switch parameters */
+     d_scalar         = rcutoff_scalar-rswitch_scalar;
+     d                = gmx_fjsp_set1_v2r8(d_scalar);
+     swV3             = gmx_fjsp_set1_v2r8(-10.0/(d_scalar*d_scalar*d_scalar));
+     swV4             = gmx_fjsp_set1_v2r8( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
+     swV5             = gmx_fjsp_set1_v2r8( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
+     swF2             = gmx_fjsp_set1_v2r8(-30.0/(d_scalar*d_scalar*d_scalar));
+     swF3             = gmx_fjsp_set1_v2r8( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
+     swF4             = gmx_fjsp_set1_v2r8(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
+     /* Avoid stupid compiler warnings */
+     jnrA = jnrB = 0;
+     j_coord_offsetA = 0;
+     j_coord_offsetB = 0;
+     outeriter        = 0;
+     inneriter        = 0;
+     /* Start outer loop over neighborlists */
+     for(iidx=0; iidx<nri; iidx++)
+     {
+         /* Load shift vector for this list */
+         i_shift_offset   = DIM*shiftidx[iidx];
+         /* Load limits for loop over neighbors */
+         j_index_start    = jindex[iidx];
+         j_index_end      = jindex[iidx+1];
+         /* Get outer coordinate index */
+         inr              = iinr[iidx];
+         i_coord_offset   = DIM*inr;
+         /* Load i particle coords and add shift vector */
+         gmx_fjsp_load_shift_and_4rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                  &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+         fix0             = _fjsp_setzero_v2r8();
+         fiy0             = _fjsp_setzero_v2r8();
+         fiz0             = _fjsp_setzero_v2r8();
+         fix1             = _fjsp_setzero_v2r8();
+         fiy1             = _fjsp_setzero_v2r8();
+         fiz1             = _fjsp_setzero_v2r8();
+         fix2             = _fjsp_setzero_v2r8();
+         fiy2             = _fjsp_setzero_v2r8();
+         fiz2             = _fjsp_setzero_v2r8();
+         fix3             = _fjsp_setzero_v2r8();
+         fiy3             = _fjsp_setzero_v2r8();
+         fiz3             = _fjsp_setzero_v2r8();
+         /* Start inner kernel loop */
+         for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+         {
+             /* Get j neighbor index, and coordinate index */
+             jnrA             = jjnr[jidx];
+             jnrB             = jjnr[jidx+1];
+             j_coord_offsetA  = DIM*jnrA;
+             j_coord_offsetB  = DIM*jnrB;
+             /* load j atom coordinates */
+             gmx_fjsp_load_4rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
+                                               &jy2,&jz2,&jx3,&jy3,&jz3);
+             /* Calculate displacement vector */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             dx11             = _fjsp_sub_v2r8(ix1,jx1);
+             dy11             = _fjsp_sub_v2r8(iy1,jy1);
+             dz11             = _fjsp_sub_v2r8(iz1,jz1);
+             dx12             = _fjsp_sub_v2r8(ix1,jx2);
+             dy12             = _fjsp_sub_v2r8(iy1,jy2);
+             dz12             = _fjsp_sub_v2r8(iz1,jz2);
+             dx13             = _fjsp_sub_v2r8(ix1,jx3);
+             dy13             = _fjsp_sub_v2r8(iy1,jy3);
+             dz13             = _fjsp_sub_v2r8(iz1,jz3);
+             dx21             = _fjsp_sub_v2r8(ix2,jx1);
+             dy21             = _fjsp_sub_v2r8(iy2,jy1);
+             dz21             = _fjsp_sub_v2r8(iz2,jz1);
+             dx22             = _fjsp_sub_v2r8(ix2,jx2);
+             dy22             = _fjsp_sub_v2r8(iy2,jy2);
+             dz22             = _fjsp_sub_v2r8(iz2,jz2);
+             dx23             = _fjsp_sub_v2r8(ix2,jx3);
+             dy23             = _fjsp_sub_v2r8(iy2,jy3);
+             dz23             = _fjsp_sub_v2r8(iz2,jz3);
+             dx31             = _fjsp_sub_v2r8(ix3,jx1);
+             dy31             = _fjsp_sub_v2r8(iy3,jy1);
+             dz31             = _fjsp_sub_v2r8(iz3,jz1);
+             dx32             = _fjsp_sub_v2r8(ix3,jx2);
+             dy32             = _fjsp_sub_v2r8(iy3,jy2);
+             dz32             = _fjsp_sub_v2r8(iz3,jz2);
+             dx33             = _fjsp_sub_v2r8(ix3,jx3);
+             dy33             = _fjsp_sub_v2r8(iy3,jy3);
+             dz33             = _fjsp_sub_v2r8(iz3,jz3);
+             /* Calculate squared distance and things based on it */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+             rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+             rsq13            = gmx_fjsp_calc_rsq_v2r8(dx13,dy13,dz13);
+             rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+             rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+             rsq23            = gmx_fjsp_calc_rsq_v2r8(dx23,dy23,dz23);
+             rsq31            = gmx_fjsp_calc_rsq_v2r8(dx31,dy31,dz31);
+             rsq32            = gmx_fjsp_calc_rsq_v2r8(dx32,dy32,dz32);
+             rsq33            = gmx_fjsp_calc_rsq_v2r8(dx33,dy33,dz33);
+             rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+             rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+             rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+             rinv13           = gmx_fjsp_invsqrt_v2r8(rsq13);
+             rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+             rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+             rinv23           = gmx_fjsp_invsqrt_v2r8(rsq23);
+             rinv31           = gmx_fjsp_invsqrt_v2r8(rsq31);
+             rinv32           = gmx_fjsp_invsqrt_v2r8(rsq32);
+             rinv33           = gmx_fjsp_invsqrt_v2r8(rsq33);
+             rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+             rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+             rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+             rinvsq13         = _fjsp_mul_v2r8(rinv13,rinv13);
+             rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+             rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+             rinvsq23         = _fjsp_mul_v2r8(rinv23,rinv23);
+             rinvsq31         = _fjsp_mul_v2r8(rinv31,rinv31);
+             rinvsq32         = _fjsp_mul_v2r8(rinv32,rinv32);
+             rinvsq33         = _fjsp_mul_v2r8(rinv33,rinv33);
+             fjx0             = _fjsp_setzero_v2r8();
+             fjy0             = _fjsp_setzero_v2r8();
+             fjz0             = _fjsp_setzero_v2r8();
+             fjx1             = _fjsp_setzero_v2r8();
+             fjy1             = _fjsp_setzero_v2r8();
+             fjz1             = _fjsp_setzero_v2r8();
+             fjx2             = _fjsp_setzero_v2r8();
+             fjy2             = _fjsp_setzero_v2r8();
+             fjz2             = _fjsp_setzero_v2r8();
+             fjx3             = _fjsp_setzero_v2r8();
+             fjy3             = _fjsp_setzero_v2r8();
+             fjz3             = _fjsp_setzero_v2r8();
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+             {
+             r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+             /* LENNARD-JONES DISPERSION/REPULSION */
+             rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+             vvdw6            = _fjsp_mul_v2r8(c6_00,rinvsix);
+             vvdw12           = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
+             vvdw             = _fjsp_msub_v2r8( vvdw12,one_twelfth, _fjsp_mul_v2r8(vvdw6,one_sixth) );
+             fvdw             = _fjsp_mul_v2r8(_fjsp_sub_v2r8(vvdw12,vvdw6),rinvsq00);
+             d                = _fjsp_sub_v2r8(r00,rswitch);
+             d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+             d2               = _fjsp_mul_v2r8(d,d);
+             sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+             dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+             /* Evaluate switch function */
+             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+             fvdw             = _fjsp_msub_v2r8( fvdw,sw , _fjsp_mul_v2r8(rinv00,_fjsp_mul_v2r8(vvdw,dsw)) );
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+             fscal            = fvdw;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             
+             fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq11,rcutoff2))
+             {
+             r11              = _fjsp_mul_v2r8(rsq11,rinv11);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r11,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq11,_fjsp_sub_v2r8(rinv11,velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq11,rinv11),_fjsp_sub_v2r8(rinvsq11,felec));
+             d                = _fjsp_sub_v2r8(r11,rswitch);
+             d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+             d2               = _fjsp_mul_v2r8(d,d);
+             sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+             dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+             /* Evaluate switch function */
+             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+             felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv11,_fjsp_mul_v2r8(velec,dsw)) );
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq11,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+             
+             fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq12,rcutoff2))
+             {
+             r12              = _fjsp_mul_v2r8(rsq12,rinv12);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r12,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq12,_fjsp_sub_v2r8(rinv12,velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq12,rinv12),_fjsp_sub_v2r8(rinvsq12,felec));
+             d                = _fjsp_sub_v2r8(r12,rswitch);
+             d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+             d2               = _fjsp_mul_v2r8(d,d);
+             sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+             dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+             /* Evaluate switch function */
+             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+             felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv12,_fjsp_mul_v2r8(velec,dsw)) );
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq12,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+             
+             fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq13,rcutoff2))
+             {
+             r13              = _fjsp_mul_v2r8(rsq13,rinv13);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r13,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq13,_fjsp_sub_v2r8(rinv13,velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq13,rinv13),_fjsp_sub_v2r8(rinvsq13,felec));
+             d                = _fjsp_sub_v2r8(r13,rswitch);
+             d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+             d2               = _fjsp_mul_v2r8(d,d);
+             sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+             dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+             /* Evaluate switch function */
+             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+             felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv13,_fjsp_mul_v2r8(velec,dsw)) );
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq13,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx13,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy13,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz13,fscal,fiz1);
+             
+             fjx3             = _fjsp_madd_v2r8(dx13,fscal,fjx3);
+             fjy3             = _fjsp_madd_v2r8(dy13,fscal,fjy3);
+             fjz3             = _fjsp_madd_v2r8(dz13,fscal,fjz3);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq21,rcutoff2))
+             {
+             r21              = _fjsp_mul_v2r8(rsq21,rinv21);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r21,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq21,_fjsp_sub_v2r8(rinv21,velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq21,rinv21),_fjsp_sub_v2r8(rinvsq21,felec));
+             d                = _fjsp_sub_v2r8(r21,rswitch);
+             d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+             d2               = _fjsp_mul_v2r8(d,d);
+             sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+             dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+             /* Evaluate switch function */
+             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+             felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv21,_fjsp_mul_v2r8(velec,dsw)) );
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq21,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+             
+             fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq22,rcutoff2))
+             {
+             r22              = _fjsp_mul_v2r8(rsq22,rinv22);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r22,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq22,_fjsp_sub_v2r8(rinv22,velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq22,rinv22),_fjsp_sub_v2r8(rinvsq22,felec));
+             d                = _fjsp_sub_v2r8(r22,rswitch);
+             d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+             d2               = _fjsp_mul_v2r8(d,d);
+             sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+             dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+             /* Evaluate switch function */
+             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+             felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv22,_fjsp_mul_v2r8(velec,dsw)) );
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq22,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+             
+             fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq23,rcutoff2))
+             {
+             r23              = _fjsp_mul_v2r8(rsq23,rinv23);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r23,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq23,_fjsp_sub_v2r8(rinv23,velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq23,rinv23),_fjsp_sub_v2r8(rinvsq23,felec));
+             d                = _fjsp_sub_v2r8(r23,rswitch);
+             d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+             d2               = _fjsp_mul_v2r8(d,d);
+             sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+             dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+             /* Evaluate switch function */
+             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+             felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv23,_fjsp_mul_v2r8(velec,dsw)) );
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq23,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx23,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy23,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz23,fscal,fiz2);
+             
+             fjx3             = _fjsp_madd_v2r8(dx23,fscal,fjx3);
+             fjy3             = _fjsp_madd_v2r8(dy23,fscal,fjy3);
+             fjz3             = _fjsp_madd_v2r8(dz23,fscal,fjz3);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq31,rcutoff2))
+             {
+             r31              = _fjsp_mul_v2r8(rsq31,rinv31);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r31,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq31,_fjsp_sub_v2r8(rinv31,velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq31,rinv31),_fjsp_sub_v2r8(rinvsq31,felec));
+             d                = _fjsp_sub_v2r8(r31,rswitch);
+             d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+             d2               = _fjsp_mul_v2r8(d,d);
+             sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+             dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+             /* Evaluate switch function */
+             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+             felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv31,_fjsp_mul_v2r8(velec,dsw)) );
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq31,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix3             = _fjsp_madd_v2r8(dx31,fscal,fix3);
+             fiy3             = _fjsp_madd_v2r8(dy31,fscal,fiy3);
+             fiz3             = _fjsp_madd_v2r8(dz31,fscal,fiz3);
+             
+             fjx1             = _fjsp_madd_v2r8(dx31,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy31,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz31,fscal,fjz1);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq32,rcutoff2))
+             {
+             r32              = _fjsp_mul_v2r8(rsq32,rinv32);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r32,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq32,_fjsp_sub_v2r8(rinv32,velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq32,rinv32),_fjsp_sub_v2r8(rinvsq32,felec));
+             d                = _fjsp_sub_v2r8(r32,rswitch);
+             d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+             d2               = _fjsp_mul_v2r8(d,d);
+             sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+             dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+             /* Evaluate switch function */
+             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+             felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv32,_fjsp_mul_v2r8(velec,dsw)) );
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq32,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix3             = _fjsp_madd_v2r8(dx32,fscal,fix3);
+             fiy3             = _fjsp_madd_v2r8(dy32,fscal,fiy3);
+             fiz3             = _fjsp_madd_v2r8(dz32,fscal,fiz3);
+             
+             fjx2             = _fjsp_madd_v2r8(dx32,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy32,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz32,fscal,fjz2);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq33,rcutoff2))
+             {
+             r33              = _fjsp_mul_v2r8(rsq33,rinv33);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r33,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq33,_fjsp_sub_v2r8(rinv33,velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq33,rinv33),_fjsp_sub_v2r8(rinvsq33,felec));
+             d                = _fjsp_sub_v2r8(r33,rswitch);
+             d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+             d2               = _fjsp_mul_v2r8(d,d);
+             sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+             dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+             /* Evaluate switch function */
+             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+             felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv33,_fjsp_mul_v2r8(velec,dsw)) );
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq33,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix3             = _fjsp_madd_v2r8(dx33,fscal,fix3);
+             fiy3             = _fjsp_madd_v2r8(dy33,fscal,fiy3);
+             fiz3             = _fjsp_madd_v2r8(dz33,fscal,fiz3);
+             
+             fjx3             = _fjsp_madd_v2r8(dx33,fscal,fjx3);
+             fjy3             = _fjsp_madd_v2r8(dy33,fscal,fjy3);
+             fjz3             = _fjsp_madd_v2r8(dz33,fscal,fjz3);
+             }
+             gmx_fjsp_decrement_4rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+             /* Inner loop uses 647 flops */
+         }
+         if(jidx<j_index_end)
+         {
+             jnrA             = jjnr[jidx];
+             j_coord_offsetA  = DIM*jnrA;
+             /* load j atom coordinates */
+             gmx_fjsp_load_4rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
+                                               &jy2,&jz2,&jx3,&jy3,&jz3);
+             /* Calculate displacement vector */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             dx11             = _fjsp_sub_v2r8(ix1,jx1);
+             dy11             = _fjsp_sub_v2r8(iy1,jy1);
+             dz11             = _fjsp_sub_v2r8(iz1,jz1);
+             dx12             = _fjsp_sub_v2r8(ix1,jx2);
+             dy12             = _fjsp_sub_v2r8(iy1,jy2);
+             dz12             = _fjsp_sub_v2r8(iz1,jz2);
+             dx13             = _fjsp_sub_v2r8(ix1,jx3);
+             dy13             = _fjsp_sub_v2r8(iy1,jy3);
+             dz13             = _fjsp_sub_v2r8(iz1,jz3);
+             dx21             = _fjsp_sub_v2r8(ix2,jx1);
+             dy21             = _fjsp_sub_v2r8(iy2,jy1);
+             dz21             = _fjsp_sub_v2r8(iz2,jz1);
+             dx22             = _fjsp_sub_v2r8(ix2,jx2);
+             dy22             = _fjsp_sub_v2r8(iy2,jy2);
+             dz22             = _fjsp_sub_v2r8(iz2,jz2);
+             dx23             = _fjsp_sub_v2r8(ix2,jx3);
+             dy23             = _fjsp_sub_v2r8(iy2,jy3);
+             dz23             = _fjsp_sub_v2r8(iz2,jz3);
+             dx31             = _fjsp_sub_v2r8(ix3,jx1);
+             dy31             = _fjsp_sub_v2r8(iy3,jy1);
+             dz31             = _fjsp_sub_v2r8(iz3,jz1);
+             dx32             = _fjsp_sub_v2r8(ix3,jx2);
+             dy32             = _fjsp_sub_v2r8(iy3,jy2);
+             dz32             = _fjsp_sub_v2r8(iz3,jz2);
+             dx33             = _fjsp_sub_v2r8(ix3,jx3);
+             dy33             = _fjsp_sub_v2r8(iy3,jy3);
+             dz33             = _fjsp_sub_v2r8(iz3,jz3);
+             /* Calculate squared distance and things based on it */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+             rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+             rsq13            = gmx_fjsp_calc_rsq_v2r8(dx13,dy13,dz13);
+             rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+             rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+             rsq23            = gmx_fjsp_calc_rsq_v2r8(dx23,dy23,dz23);
+             rsq31            = gmx_fjsp_calc_rsq_v2r8(dx31,dy31,dz31);
+             rsq32            = gmx_fjsp_calc_rsq_v2r8(dx32,dy32,dz32);
+             rsq33            = gmx_fjsp_calc_rsq_v2r8(dx33,dy33,dz33);
+             rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+             rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+             rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+             rinv13           = gmx_fjsp_invsqrt_v2r8(rsq13);
+             rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+             rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+             rinv23           = gmx_fjsp_invsqrt_v2r8(rsq23);
+             rinv31           = gmx_fjsp_invsqrt_v2r8(rsq31);
+             rinv32           = gmx_fjsp_invsqrt_v2r8(rsq32);
+             rinv33           = gmx_fjsp_invsqrt_v2r8(rsq33);
+             rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+             rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+             rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+             rinvsq13         = _fjsp_mul_v2r8(rinv13,rinv13);
+             rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+             rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+             rinvsq23         = _fjsp_mul_v2r8(rinv23,rinv23);
+             rinvsq31         = _fjsp_mul_v2r8(rinv31,rinv31);
+             rinvsq32         = _fjsp_mul_v2r8(rinv32,rinv32);
+             rinvsq33         = _fjsp_mul_v2r8(rinv33,rinv33);
+             fjx0             = _fjsp_setzero_v2r8();
+             fjy0             = _fjsp_setzero_v2r8();
+             fjz0             = _fjsp_setzero_v2r8();
+             fjx1             = _fjsp_setzero_v2r8();
+             fjy1             = _fjsp_setzero_v2r8();
+             fjz1             = _fjsp_setzero_v2r8();
+             fjx2             = _fjsp_setzero_v2r8();
+             fjy2             = _fjsp_setzero_v2r8();
+             fjz2             = _fjsp_setzero_v2r8();
+             fjx3             = _fjsp_setzero_v2r8();
+             fjy3             = _fjsp_setzero_v2r8();
+             fjz3             = _fjsp_setzero_v2r8();
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+             {
+             r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+             /* LENNARD-JONES DISPERSION/REPULSION */
+             rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+             vvdw6            = _fjsp_mul_v2r8(c6_00,rinvsix);
+             vvdw12           = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
+             vvdw             = _fjsp_msub_v2r8( vvdw12,one_twelfth, _fjsp_mul_v2r8(vvdw6,one_sixth) );
+             fvdw             = _fjsp_mul_v2r8(_fjsp_sub_v2r8(vvdw12,vvdw6),rinvsq00);
+             d                = _fjsp_sub_v2r8(r00,rswitch);
+             d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+             d2               = _fjsp_mul_v2r8(d,d);
+             sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+             dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+             /* Evaluate switch function */
+             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+             fvdw             = _fjsp_msub_v2r8( fvdw,sw , _fjsp_mul_v2r8(rinv00,_fjsp_mul_v2r8(vvdw,dsw)) );
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+             fscal            = fvdw;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             
+             fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq11,rcutoff2))
+             {
+             r11              = _fjsp_mul_v2r8(rsq11,rinv11);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r11,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq11,_fjsp_sub_v2r8(rinv11,velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq11,rinv11),_fjsp_sub_v2r8(rinvsq11,felec));
+             d                = _fjsp_sub_v2r8(r11,rswitch);
+             d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+             d2               = _fjsp_mul_v2r8(d,d);
+             sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+             dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+             /* Evaluate switch function */
+             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+             felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv11,_fjsp_mul_v2r8(velec,dsw)) );
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq11,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+             
+             fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq12,rcutoff2))
+             {
+             r12              = _fjsp_mul_v2r8(rsq12,rinv12);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r12,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq12,_fjsp_sub_v2r8(rinv12,velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq12,rinv12),_fjsp_sub_v2r8(rinvsq12,felec));
+             d                = _fjsp_sub_v2r8(r12,rswitch);
+             d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+             d2               = _fjsp_mul_v2r8(d,d);
+             sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+             dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+             /* Evaluate switch function */
+             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+             felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv12,_fjsp_mul_v2r8(velec,dsw)) );
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq12,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+             
+             fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq13,rcutoff2))
+             {
+             r13              = _fjsp_mul_v2r8(rsq13,rinv13);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r13,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq13,_fjsp_sub_v2r8(rinv13,velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq13,rinv13),_fjsp_sub_v2r8(rinvsq13,felec));
+             d                = _fjsp_sub_v2r8(r13,rswitch);
+             d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+             d2               = _fjsp_mul_v2r8(d,d);
+             sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+             dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+             /* Evaluate switch function */
+             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+             felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv13,_fjsp_mul_v2r8(velec,dsw)) );
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq13,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx13,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy13,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz13,fscal,fiz1);
+             
+             fjx3             = _fjsp_madd_v2r8(dx13,fscal,fjx3);
+             fjy3             = _fjsp_madd_v2r8(dy13,fscal,fjy3);
+             fjz3             = _fjsp_madd_v2r8(dz13,fscal,fjz3);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq21,rcutoff2))
+             {
+             r21              = _fjsp_mul_v2r8(rsq21,rinv21);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r21,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq21,_fjsp_sub_v2r8(rinv21,velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq21,rinv21),_fjsp_sub_v2r8(rinvsq21,felec));
+             d                = _fjsp_sub_v2r8(r21,rswitch);
+             d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+             d2               = _fjsp_mul_v2r8(d,d);
+             sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+             dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+             /* Evaluate switch function */
+             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+             felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv21,_fjsp_mul_v2r8(velec,dsw)) );
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq21,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+             
+             fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq22,rcutoff2))
+             {
+             r22              = _fjsp_mul_v2r8(rsq22,rinv22);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r22,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq22,_fjsp_sub_v2r8(rinv22,velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq22,rinv22),_fjsp_sub_v2r8(rinvsq22,felec));
+             d                = _fjsp_sub_v2r8(r22,rswitch);
+             d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+             d2               = _fjsp_mul_v2r8(d,d);
+             sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+             dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+             /* Evaluate switch function */
+             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+             felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv22,_fjsp_mul_v2r8(velec,dsw)) );
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq22,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+             
+             fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq23,rcutoff2))
+             {
+             r23              = _fjsp_mul_v2r8(rsq23,rinv23);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r23,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq23,_fjsp_sub_v2r8(rinv23,velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq23,rinv23),_fjsp_sub_v2r8(rinvsq23,felec));
+             d                = _fjsp_sub_v2r8(r23,rswitch);
+             d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+             d2               = _fjsp_mul_v2r8(d,d);
+             sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+             dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+             /* Evaluate switch function */
+             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+             felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv23,_fjsp_mul_v2r8(velec,dsw)) );
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq23,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx23,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy23,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz23,fscal,fiz2);
+             
+             fjx3             = _fjsp_madd_v2r8(dx23,fscal,fjx3);
+             fjy3             = _fjsp_madd_v2r8(dy23,fscal,fjy3);
+             fjz3             = _fjsp_madd_v2r8(dz23,fscal,fjz3);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq31,rcutoff2))
+             {
+             r31              = _fjsp_mul_v2r8(rsq31,rinv31);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r31,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq31,_fjsp_sub_v2r8(rinv31,velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq31,rinv31),_fjsp_sub_v2r8(rinvsq31,felec));
+             d                = _fjsp_sub_v2r8(r31,rswitch);
+             d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+             d2               = _fjsp_mul_v2r8(d,d);
+             sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+             dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+             /* Evaluate switch function */
+             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+             felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv31,_fjsp_mul_v2r8(velec,dsw)) );
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq31,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix3             = _fjsp_madd_v2r8(dx31,fscal,fix3);
+             fiy3             = _fjsp_madd_v2r8(dy31,fscal,fiy3);
+             fiz3             = _fjsp_madd_v2r8(dz31,fscal,fiz3);
+             
+             fjx1             = _fjsp_madd_v2r8(dx31,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy31,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz31,fscal,fjz1);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq32,rcutoff2))
+             {
+             r32              = _fjsp_mul_v2r8(rsq32,rinv32);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r32,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq32,_fjsp_sub_v2r8(rinv32,velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq32,rinv32),_fjsp_sub_v2r8(rinvsq32,felec));
+             d                = _fjsp_sub_v2r8(r32,rswitch);
+             d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+             d2               = _fjsp_mul_v2r8(d,d);
+             sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+             dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+             /* Evaluate switch function */
+             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+             felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv32,_fjsp_mul_v2r8(velec,dsw)) );
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq32,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix3             = _fjsp_madd_v2r8(dx32,fscal,fix3);
+             fiy3             = _fjsp_madd_v2r8(dy32,fscal,fiy3);
+             fiz3             = _fjsp_madd_v2r8(dz32,fscal,fiz3);
+             
+             fjx2             = _fjsp_madd_v2r8(dx32,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy32,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz32,fscal,fjz2);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq33,rcutoff2))
+             {
+             r33              = _fjsp_mul_v2r8(rsq33,rinv33);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r33,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq33,_fjsp_sub_v2r8(rinv33,velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq33,rinv33),_fjsp_sub_v2r8(rinvsq33,felec));
+             d                = _fjsp_sub_v2r8(r33,rswitch);
+             d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+             d2               = _fjsp_mul_v2r8(d,d);
+             sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+             dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+             /* Evaluate switch function */
+             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+             felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv33,_fjsp_mul_v2r8(velec,dsw)) );
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq33,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix3             = _fjsp_madd_v2r8(dx33,fscal,fix3);
+             fiy3             = _fjsp_madd_v2r8(dy33,fscal,fiy3);
+             fiz3             = _fjsp_madd_v2r8(dz33,fscal,fiz3);
+             
+             fjx3             = _fjsp_madd_v2r8(dx33,fscal,fjx3);
+             fjy3             = _fjsp_madd_v2r8(dy33,fscal,fjy3);
+             fjz3             = _fjsp_madd_v2r8(dz33,fscal,fjz3);
+             }
+             gmx_fjsp_decrement_4rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+             /* Inner loop uses 647 flops */
+         }
+         /* End of innermost loop */
+         gmx_fjsp_update_iforce_4atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                               f+i_coord_offset,fshift+i_shift_offset);
+         /* Increment number of inner iterations */
+         inneriter                  += j_index_end - j_index_start;
+         /* Outer loop uses 24 flops */
+     }
+     /* Increment number of outer iterations */
+     outeriter        += nri;
+     /* Update outer/inner flops */
+     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_F,outeriter*24 + inneriter*647);
+ }
index 0000000000000000000000000000000000000000,898dec9c97b5e324c963cc860d880c6045e07e0c..898dec9c97b5e324c963cc860d880c6045e07e0c
mode 000000,100644..100644
--- /dev/null
@@@ -1,0 -1,682 +1,682 @@@
+ /*
+  * This file is part of the GROMACS molecular simulation package.
+  *
+  * Copyright (c) 2012, by the GROMACS development team, led by
+  * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+  * others, as listed in the AUTHORS file in the top-level source
+  * directory and at http://www.gromacs.org.
+  *
+  * GROMACS is free software; you can redistribute it and/or
+  * modify it under the terms of the GNU Lesser General Public License
+  * as published by the Free Software Foundation; either version 2.1
+  * of the License, or (at your option) any later version.
+  *
+  * GROMACS is distributed in the hope that it will be useful,
+  * but WITHOUT ANY WARRANTY; without even the implied warranty of
+  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  * Lesser General Public License for more details.
+  *
+  * You should have received a copy of the GNU Lesser General Public
+  * License along with GROMACS; if not, see
+  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+  *
+  * If you want to redistribute modifications to GROMACS, please
+  * consider that scientific software is very special. Version
+  * control is crucial - bugs must be traceable. We will be happy to
+  * consider code for inclusion in the official distribution, but
+  * derived work must not be called official GROMACS. Details are found
+  * in the README & COPYING files - if they are missing, get the
+  * official version at http://www.gromacs.org.
+  *
+  * To help us fund GROMACS development, we humbly ask that you cite
+  * the research papers on the package. Check out http://www.gromacs.org.
+  */
+ /*
+  * Note: this file was generated by the GROMACS sparc64_hpc_ace_double kernel generator.
+  */
+ #ifdef HAVE_CONFIG_H
+ #include <config.h>
+ #endif
+ #include <math.h>
+ #include "../nb_kernel.h"
+ #include "types/simple.h"
+ #include "vec.h"
+ #include "nrnb.h"
+ #include "kernelutil_sparc64_hpc_ace_double.h"
+ /*
+  * Gromacs nonbonded kernel:   nb_kernel_ElecEwSw_VdwNone_GeomP1P1_VF_sparc64_hpc_ace_double
+  * Electrostatics interaction: Ewald, with potential switch (the "ElecEwSw" in the kernel name)
+  * VdW interaction:            None
+  * Geometry:                   Particle-Particle
+  * Calculate force/pot:        PotentialAndForce
+  */
+ void
+ nb_kernel_ElecEwSw_VdwNone_GeomP1P1_VF_sparc64_hpc_ace_double
+                     (t_nblist * gmx_restrict                nlist,        /* neighbor list; nri, iinr, jindex, jjnr, shift and gid are read */
+                      rvec * gmx_restrict                    xx,           /* coordinates; accessed as a flat real array through xx[0] */
+                      rvec * gmx_restrict                    ff,           /* forces; accessed as a flat real array through ff[0] */
+                      t_forcerec * gmx_restrict              fr,           /* supplies shift_vec, fshift, epsfac, rcoulomb, rcoulomb_switch and the Ewald table in fr->ic */
+                      t_mdatoms * gmx_restrict               mdatoms,      /* chargeA is read */
+                      nb_kernel_data_t * gmx_restrict        kernel_data,  /* energygrp_elec+gid[iidx] is handed to the potential update helper */
+                      t_nrnb * gmx_restrict                  nrnb)         /* flop accounting, passed to inc_nrnb() */
+ {
+     /* Tabulated Ewald electrostatics multiplied by a switch function, no VdW; potential and force.
+      * Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop; this
+      * particle-particle kernel only uses suffix 0.
+      * Suffixes A,B refer to j loop unrolling done with double precision SIMD, i.e. the two different
+      * jnr indices whose data share one _fjsp_v2r8 register (two positions, not four). */
+     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+     int              jnrA,jnrB;
+     int              j_coord_offsetA,j_coord_offsetB;
+     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+     real             rcutoff_scalar;
+     real             *shiftvec,*fshift,*x,*f;
+     _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+     int              vdwioffset0;
+     _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+     int              vdwjidx0A,vdwjidx0B;
+     _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+     _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+     _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+     real             *charge;
+     _fjsp_v2r8       ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+     real             *ewtab;
+     _fjsp_v2r8       rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
+     real             rswitch_scalar,d_scalar;
+     _fjsp_v2r8       itab_tmp;
+     _fjsp_v2r8       dummy_mask,cutoff_mask; /* NOTE(review): dummy_mask and several locals above (tx, ty, tz, jidxall, vdw*, isai0, isaj0, fjx0..fjz0, c6_00, c12_00, crf, krf, krf2) are never referenced in this kernel */
+     _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+     _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0); /* NOTE(review): not referenced in this kernel */
+     union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv; /* only ewconv is used: it exposes the two converted table indices as integers */
+     x                = xx[0];
+     f                = ff[0];
+     nri              = nlist->nri;
+     iinr             = nlist->iinr;
+     jindex           = nlist->jindex;
+     jjnr             = nlist->jjnr;
+     shiftidx         = nlist->shift;
+     gid              = nlist->gid;
+     shiftvec         = fr->shift_vec[0];
+     fshift           = fr->fshift[0];
+     facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+     charge           = mdatoms->chargeA;
+     sh_ewald         = gmx_fjsp_set1_v2r8(fr->ic->sh_ewald); /* NOTE(review): assigned but never read in this kernel */
+     ewtab            = fr->ic->tabq_coul_FDV0;
+     ewtabscale       = gmx_fjsp_set1_v2r8(fr->ic->tabq_scale);
+     ewtabhalfspace   = gmx_fjsp_set1_v2r8(0.5/fr->ic->tabq_scale);
+     /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+     rcutoff_scalar   = fr->rcoulomb;
+     rcutoff          = gmx_fjsp_set1_v2r8(rcutoff_scalar);
+     rcutoff2         = _fjsp_mul_v2r8(rcutoff,rcutoff);
+     rswitch_scalar   = fr->rcoulomb_switch;
+     rswitch          = gmx_fjsp_set1_v2r8(rswitch_scalar);
+     /* Setup switch parameters: polynomial coefficients for sw (swV3..swV5) and dsw (swF2..swF4), scaled by powers of the switching range d_scalar = rcutoff - rswitch */
+     d_scalar         = rcutoff_scalar-rswitch_scalar;
+     d                = gmx_fjsp_set1_v2r8(d_scalar); /* this value is overwritten inside the j loops before it is read */
+     swV3             = gmx_fjsp_set1_v2r8(-10.0/(d_scalar*d_scalar*d_scalar));
+     swV4             = gmx_fjsp_set1_v2r8( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
+     swV5             = gmx_fjsp_set1_v2r8( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
+     swF2             = gmx_fjsp_set1_v2r8(-30.0/(d_scalar*d_scalar*d_scalar));
+     swF3             = gmx_fjsp_set1_v2r8( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
+     swF4             = gmx_fjsp_set1_v2r8(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
+     /* Give the j indices/offsets a defined value (avoids compiler warnings) and reset the iteration counters */
+     jnrA = jnrB = 0;
+     j_coord_offsetA = 0;
+     j_coord_offsetB = 0;
+     outeriter        = 0;
+     inneriter        = 0;
+     /* Start outer loop over neighborlists: one i particle per list entry */
+     for(iidx=0; iidx<nri; iidx++)
+     {
+         /* Load shift vector for this list */
+         i_shift_offset   = DIM*shiftidx[iidx];
+         /* Load limits for loop over neighbors: j entries [j_index_start, j_index_end) of jjnr */
+         j_index_start    = jindex[iidx];
+         j_index_end      = jindex[iidx+1];
+         /* Get outer coordinate index */
+         inr              = iinr[iidx];
+         i_coord_offset   = DIM*inr;
+         /* Load i particle coords and add shift vector */
+         gmx_fjsp_load_shift_and_1rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
+         fix0             = _fjsp_setzero_v2r8();
+         fiy0             = _fjsp_setzero_v2r8();
+         fiz0             = _fjsp_setzero_v2r8();
+         /* Load parameters for i particles: i charge pre-multiplied by facel (fr->epsfac) */
+         iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_load1_v2r8(charge+inr+0));
+         /* Reset potential sums */
+         velecsum         = _fjsp_setzero_v2r8();
+         /* Start inner kernel loop: j neighbors are processed two at a time (A and B); an odd leftover is handled after the loop */
+         for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+         {
+             /* Get j neighbor index, and coordinate index */
+             jnrA             = jjnr[jidx];
+             jnrB             = jjnr[jidx+1];
+             j_coord_offsetA  = DIM*jnrA;
+             j_coord_offsetB  = DIM*jnrB;
+             /* load j atom coordinates */
+             gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                               &jx0,&jy0,&jz0);
+             /* Calculate displacement vector */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             /* Calculate squared distance and things based on it */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+             rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+             /* Load parameters for j particles */
+             jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2)) /* judging by the helper name: skip the pair unless at least one of the two is inside the cutoff */
+             {
+             r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+             /* Compute parameters for interactions between i and j atoms */
+             qq00             = _fjsp_mul_v2r8(iq0,jq0);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r00,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp); /* make both table indices readable as ewconv.i[0] and ewconv.i[1] */
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] ); /* table stride is 4 reals per point; presumably F,D,V,0 per the tabq_coul_FDV0 name */
+             ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq00,_fjsp_sub_v2r8(rinv00,velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,rinv00),_fjsp_sub_v2r8(rinvsq00,felec));
+             d                = _fjsp_sub_v2r8(r00,rswitch);
+             d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8()); /* d = max(r - rswitch, 0) */
+             d2               = _fjsp_mul_v2r8(d,d);
+             sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3)))); /* sw = 1 + d^3*(swV3 + d*(swV4 + d*swV5)), assuming madd(a,b,c) = a*b+c */
+             dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2)); /* dsw = d^2*(swF2 + d*(swF3 + d*swF4)), same assumption */
+             /* Evaluate switch function; the force update below must use the still unswitched velec */
+             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+             felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv00,_fjsp_mul_v2r8(velec,dsw)) );
+             velec            = _fjsp_mul_v2r8(velec,sw);
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom; velec is ANDed with the rsq<rcutoff2 mask first. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             
+             gmx_fjsp_decrement_fma_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fscal,dx00,dy00,dz00);
+             }
+             /* Inner loop uses 68 flops */
+         }
+         if(jidx<j_index_end) /* one j particle left over (odd neighbor count): same computation using single-pointer loads for particle A only */
+         {
+             jnrA             = jjnr[jidx];
+             j_coord_offsetA  = DIM*jnrA;
+             /* load j atom coordinates */
+             gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                               &jx0,&jy0,&jz0);
+             /* Calculate displacement vector */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             /* Calculate squared distance and things based on it */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+             rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+             /* Load parameters for j particles */
+             jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+             {
+             r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+             /* Compute parameters for interactions between i and j atoms */
+             qq00             = _fjsp_mul_v2r8(iq0,jq0);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r00,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp); /* only ewconv.i[0] is read on this path */
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_setzero_v2r8(); /* no second particle: zeros stand in for the B table entries */
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq00,_fjsp_sub_v2r8(rinv00,velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,rinv00),_fjsp_sub_v2r8(rinvsq00,felec));
+             d                = _fjsp_sub_v2r8(r00,rswitch);
+             d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+             d2               = _fjsp_mul_v2r8(d,d);
+             sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+             dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+             /* Evaluate switch function; the force update below must use the still unswitched velec */
+             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+             felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv00,_fjsp_mul_v2r8(velec,dsw)) );
+             velec            = _fjsp_mul_v2r8(velec,sw);
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8()); /* presumably keeps the A element and zeroes the unused one - TODO confirm intrinsic semantics */
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             
+             gmx_fjsp_decrement_fma_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fscal,dx00,dy00,dz00);
+             }
+             /* Inner loop uses 68 flops */
+         }
+         /* End of innermost loop: hand the accumulated i force to the helper together with the force and shift-force locations */
+         gmx_fjsp_update_iforce_1atom_swizzle_v2r8(fix0,fiy0,fiz0,
+                                               f+i_coord_offset,fshift+i_shift_offset);
+         ggid                        = gid[iidx];
+         /* Update potential energies for the energy group index of this list entry */
+         gmx_fjsp_update_1pot_v2r8(velecsum,kernel_data->energygrp_elec+ggid);
+         /* Increment number of inner iterations */
+         inneriter                  += j_index_end - j_index_start;
+         /* Outer loop uses 8 flops */
+     }
+     /* Increment number of outer iterations */
+     outeriter        += nri;
+     /* Update outer/inner flops, using the per-iteration counts quoted in the loop comments above */
+     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VF,outeriter*8 + inneriter*68);
+ }
+ /*
+  * Gromacs nonbonded kernel:   nb_kernel_ElecEwSw_VdwNone_GeomP1P1_F_sparc64_hpc_ace_double
+  * Electrostatics interaction: Ewald, with potential switch (the "ElecEwSw" in the kernel name)
+  * VdW interaction:            None
+  * Geometry:                   Particle-Particle
+  * Calculate force/pot:        Force
+  */
+ void
+ nb_kernel_ElecEwSw_VdwNone_GeomP1P1_F_sparc64_hpc_ace_double
+                     (t_nblist * gmx_restrict                nlist,        /* neighbor list; nri, iinr, jindex, jjnr, shift and gid are read */
+                      rvec * gmx_restrict                    xx,           /* coordinates; accessed as a flat real array through xx[0] */
+                      rvec * gmx_restrict                    ff,           /* forces; accessed as a flat real array through ff[0] */
+                      t_forcerec * gmx_restrict              fr,           /* supplies shift_vec, fshift, epsfac, rcoulomb, rcoulomb_switch and the Ewald table in fr->ic */
+                      t_mdatoms * gmx_restrict               mdatoms,      /* chargeA is read */
+                      nb_kernel_data_t * gmx_restrict        kernel_data,  /* not referenced in this force-only kernel */
+                      t_nrnb * gmx_restrict                  nrnb)         /* flop accounting, passed to inc_nrnb() */
+ {
+     /* Tabulated Ewald electrostatics multiplied by a switch function, no VdW; force only.
+      * Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop; this
+      * particle-particle kernel only uses suffix 0.
+      * Suffixes A,B refer to j loop unrolling done with double precision SIMD, i.e. the two different
+      * jnr indices whose data share one _fjsp_v2r8 register (two positions, not four). */
+     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+     int              jnrA,jnrB;
+     int              j_coord_offsetA,j_coord_offsetB;
+     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+     real             rcutoff_scalar;
+     real             *shiftvec,*fshift,*x,*f;
+     _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+     int              vdwioffset0;
+     _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+     int              vdwjidx0A,vdwjidx0B;
+     _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+     _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+     _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+     real             *charge;
+     _fjsp_v2r8       ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+     real             *ewtab;
+     _fjsp_v2r8       rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
+     real             rswitch_scalar,d_scalar;
+     _fjsp_v2r8       itab_tmp;
+     _fjsp_v2r8       dummy_mask,cutoff_mask; /* NOTE(review): dummy_mask and several locals above (tx, ty, tz, jidxall, ggid, velecsum, vdw*, isai0, isaj0, fjx0..fjz0, c6_00, c12_00, crf, krf, krf2) are never referenced in this kernel */
+     _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+     _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0); /* NOTE(review): not referenced in this kernel */
+     union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv; /* only ewconv is used: it exposes the two converted table indices as integers */
+     x                = xx[0];
+     f                = ff[0];
+     nri              = nlist->nri;
+     iinr             = nlist->iinr;
+     jindex           = nlist->jindex;
+     jjnr             = nlist->jjnr;
+     shiftidx         = nlist->shift;
+     gid              = nlist->gid; /* NOTE(review): stored but not read in this force-only kernel */
+     shiftvec         = fr->shift_vec[0];
+     fshift           = fr->fshift[0];
+     facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+     charge           = mdatoms->chargeA;
+     sh_ewald         = gmx_fjsp_set1_v2r8(fr->ic->sh_ewald); /* NOTE(review): assigned but never read in this kernel */
+     ewtab            = fr->ic->tabq_coul_FDV0;
+     ewtabscale       = gmx_fjsp_set1_v2r8(fr->ic->tabq_scale);
+     ewtabhalfspace   = gmx_fjsp_set1_v2r8(0.5/fr->ic->tabq_scale);
+     /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+     rcutoff_scalar   = fr->rcoulomb;
+     rcutoff          = gmx_fjsp_set1_v2r8(rcutoff_scalar);
+     rcutoff2         = _fjsp_mul_v2r8(rcutoff,rcutoff);
+     rswitch_scalar   = fr->rcoulomb_switch;
+     rswitch          = gmx_fjsp_set1_v2r8(rswitch_scalar);
+     /* Setup switch parameters: polynomial coefficients for sw (swV3..swV5) and dsw (swF2..swF4), scaled by powers of the switching range d_scalar = rcutoff - rswitch */
+     d_scalar         = rcutoff_scalar-rswitch_scalar;
+     d                = gmx_fjsp_set1_v2r8(d_scalar); /* this value is overwritten inside the j loops before it is read */
+     swV3             = gmx_fjsp_set1_v2r8(-10.0/(d_scalar*d_scalar*d_scalar));
+     swV4             = gmx_fjsp_set1_v2r8( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
+     swV5             = gmx_fjsp_set1_v2r8( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
+     swF2             = gmx_fjsp_set1_v2r8(-30.0/(d_scalar*d_scalar*d_scalar));
+     swF3             = gmx_fjsp_set1_v2r8( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
+     swF4             = gmx_fjsp_set1_v2r8(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
+     /* Give the j indices/offsets a defined value (avoids compiler warnings) and reset the iteration counters */
+     jnrA = jnrB = 0;
+     j_coord_offsetA = 0;
+     j_coord_offsetB = 0;
+     outeriter        = 0;
+     inneriter        = 0;
+     /* Start outer loop over neighborlists: one i particle per list entry */
+     for(iidx=0; iidx<nri; iidx++)
+     {
+         /* Load shift vector for this list */
+         i_shift_offset   = DIM*shiftidx[iidx];
+         /* Load limits for loop over neighbors: j entries [j_index_start, j_index_end) of jjnr */
+         j_index_start    = jindex[iidx];
+         j_index_end      = jindex[iidx+1];
+         /* Get outer coordinate index */
+         inr              = iinr[iidx];
+         i_coord_offset   = DIM*inr;
+         /* Load i particle coords and add shift vector */
+         gmx_fjsp_load_shift_and_1rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
+         fix0             = _fjsp_setzero_v2r8();
+         fiy0             = _fjsp_setzero_v2r8();
+         fiz0             = _fjsp_setzero_v2r8();
+         /* Load parameters for i particles: i charge pre-multiplied by facel (fr->epsfac) */
+         iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_load1_v2r8(charge+inr+0));
+         /* Start inner kernel loop: j neighbors are processed two at a time (A and B); an odd leftover is handled after the loop */
+         for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+         {
+             /* Get j neighbor index, and coordinate index */
+             jnrA             = jjnr[jidx];
+             jnrB             = jjnr[jidx+1];
+             j_coord_offsetA  = DIM*jnrA;
+             j_coord_offsetB  = DIM*jnrB;
+             /* load j atom coordinates */
+             gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                               &jx0,&jy0,&jz0);
+             /* Calculate displacement vector */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             /* Calculate squared distance and things based on it */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+             rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+             /* Load parameters for j particles */
+             jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2)) /* judging by the helper name: skip the pair unless at least one of the two is inside the cutoff */
+             {
+             r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+             /* Compute parameters for interactions between i and j atoms */
+             qq00             = _fjsp_mul_v2r8(iq0,jq0);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r00,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp); /* make both table indices readable as ewconv.i[0] and ewconv.i[1] */
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] ); /* table stride is 4 reals per point; presumably F,D,V,0 per the tabq_coul_FDV0 name */
+             ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV); /* the potential is still computed here because the switched force below depends on it */
+             velec            = _fjsp_mul_v2r8(qq00,_fjsp_sub_v2r8(rinv00,velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,rinv00),_fjsp_sub_v2r8(rinvsq00,felec));
+             d                = _fjsp_sub_v2r8(r00,rswitch);
+             d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8()); /* d = max(r - rswitch, 0) */
+             d2               = _fjsp_mul_v2r8(d,d);
+             sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3)))); /* sw = 1 + d^3*(swV3 + d*(swV4 + d*swV5)), assuming madd(a,b,c) = a*b+c */
+             dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2)); /* dsw = d^2*(swF2 + d*(swF3 + d*swF4)), same assumption */
+             /* Evaluate switch function (force only; velec itself is not accumulated in this kernel) */
+             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+             felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv00,_fjsp_mul_v2r8(velec,dsw)) );
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask); /* fscal is ANDed with the rsq<rcutoff2 mask before use */
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             
+             gmx_fjsp_decrement_fma_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fscal,dx00,dy00,dz00);
+             }
+             /* Inner loop uses 65 flops */
+         }
+         if(jidx<j_index_end) /* one j particle left over (odd neighbor count): same computation using single-pointer loads for particle A only */
+         {
+             jnrA             = jjnr[jidx];
+             j_coord_offsetA  = DIM*jnrA;
+             /* load j atom coordinates */
+             gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                               &jx0,&jy0,&jz0);
+             /* Calculate displacement vector */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             /* Calculate squared distance and things based on it */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+             rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+             /* Load parameters for j particles */
+             jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+             {
+             r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+             /* Compute parameters for interactions between i and j atoms */
+             qq00             = _fjsp_mul_v2r8(iq0,jq0);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r00,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp); /* only ewconv.i[0] is read on this path */
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_setzero_v2r8(); /* no second particle: zeros stand in for the B table entries */
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq00,_fjsp_sub_v2r8(rinv00,velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,rinv00),_fjsp_sub_v2r8(rinvsq00,felec));
+             d                = _fjsp_sub_v2r8(r00,rswitch);
+             d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+             d2               = _fjsp_mul_v2r8(d,d);
+             sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+             dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+             /* Evaluate switch function (force only; velec itself is not accumulated in this kernel) */
+             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+             felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv00,_fjsp_mul_v2r8(velec,dsw)) );
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8()); /* presumably keeps the A element and zeroes the unused one - TODO confirm intrinsic semantics */
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             
+             gmx_fjsp_decrement_fma_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fscal,dx00,dy00,dz00);
+             }
+             /* Inner loop uses 65 flops */
+         }
+         /* End of innermost loop: hand the accumulated i force to the helper together with the force and shift-force locations */
+         gmx_fjsp_update_iforce_1atom_swizzle_v2r8(fix0,fiy0,fiz0,
+                                               f+i_coord_offset,fshift+i_shift_offset);
+         /* Increment number of inner iterations */
+         inneriter                  += j_index_end - j_index_start;
+         /* Outer loop uses 7 flops */
+     }
+     /* Increment number of outer iterations */
+     outeriter        += nri;
+     /* Update outer/inner flops, using the per-iteration counts quoted in the loop comments above */
+     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_F,outeriter*7 + inneriter*65);
+ }
index 0000000000000000000000000000000000000000,e14474f1e2fccd125ff33ea4349effef90736d69..e14474f1e2fccd125ff33ea4349effef90736d69
mode 000000,100644..100644
--- /dev/null
@@@ -1,0 -1,1288 +1,1288 @@@
+ /*
+  * This file is part of the GROMACS molecular simulation package.
+  *
+  * Copyright (c) 2012, by the GROMACS development team, led by
+  * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+  * others, as listed in the AUTHORS file in the top-level source
+  * directory and at http://www.gromacs.org.
+  *
+  * GROMACS is free software; you can redistribute it and/or
+  * modify it under the terms of the GNU Lesser General Public License
+  * as published by the Free Software Foundation; either version 2.1
+  * of the License, or (at your option) any later version.
+  *
+  * GROMACS is distributed in the hope that it will be useful,
+  * but WITHOUT ANY WARRANTY; without even the implied warranty of
+  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  * Lesser General Public License for more details.
+  *
+  * You should have received a copy of the GNU Lesser General Public
+  * License along with GROMACS; if not, see
+  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+  *
+  * If you want to redistribute modifications to GROMACS, please
+  * consider that scientific software is very special. Version
+  * control is crucial - bugs must be traceable. We will be happy to
+  * consider code for inclusion in the official distribution, but
+  * derived work must not be called official GROMACS. Details are found
+  * in the README & COPYING files - if they are missing, get the
+  * official version at http://www.gromacs.org.
+  *
+  * To help us fund GROMACS development, we humbly ask that you cite
+  * the research papers on the package. Check out http://www.gromacs.org.
+  */
+ /*
+  * Note: this file was generated by the GROMACS sparc64_hpc_ace_double kernel generator.
+  */
+ #ifdef HAVE_CONFIG_H
+ #include <config.h>
+ #endif
+ #include <math.h>
+ #include "../nb_kernel.h"
+ #include "types/simple.h"
+ #include "vec.h"
+ #include "nrnb.h"
+ #include "kernelutil_sparc64_hpc_ace_double.h"
+ /*
+  * Gromacs nonbonded kernel:   nb_kernel_ElecEwSw_VdwNone_GeomW3P1_VF_sparc64_hpc_ace_double
+  * Electrostatics interaction: Ewald (table lookup in fr->ic->tabq_coul_FDV0, switched between fr->rcoulomb_switch and fr->rcoulomb)
+  * VdW interaction:            None
+  * Geometry:                   Water3-Particle (three i atoms per list entry against single j particles)
+  * Calculate force/pot:        PotentialAndForce (forces go to ff and fr->fshift, potential to kernel_data->energygrp_elec, via the gmx_fjsp_* helpers)
+  */
+ void
+ nb_kernel_ElecEwSw_VdwNone_GeomW3P1_VF_sparc64_hpc_ace_double
+                     (t_nblist * gmx_restrict                nlist,
+                      rvec * gmx_restrict                    xx,
+                      rvec * gmx_restrict                    ff,
+                      t_forcerec * gmx_restrict              fr,
+                      t_mdatoms * gmx_restrict               mdatoms,
+                      nb_kernel_data_t * gmx_restrict        kernel_data,
+                      t_nrnb * gmx_restrict                  nrnb)
+ {
+     /* Suffixes 0,1,2 refer to the three atoms of the i water handled in the outer loop;
+      * the j particle always uses suffix 0, so pair quantities are named 00, 10 and 20.
+      * Suffixes A,B refer to the two j entries (jnrA, jnrB) processed together in one inner loop
+      * iteration, one per element of the two-element double precision SIMD register.
+      */
+     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+     int              jnrA,jnrB;
+     int              j_coord_offsetA,j_coord_offsetB;
+     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+     real             rcutoff_scalar;
+     real             *shiftvec,*fshift,*x,*f;
+     _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+     int              vdwioffset0;
+     _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+     int              vdwioffset1;
+     _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+     int              vdwioffset2;
+     _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+     int              vdwjidx0A,vdwjidx0B;
+     _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+     _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+     _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+     _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+     _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+     real             *charge;
+     _fjsp_v2r8       ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+     real             *ewtab;
+     _fjsp_v2r8       rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
+     real             rswitch_scalar,d_scalar;
+     _fjsp_v2r8       itab_tmp;
+     _fjsp_v2r8       dummy_mask,cutoff_mask;
+     _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+     _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+     union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+     x                = xx[0];
+     f                = ff[0];
+     nri              = nlist->nri;
+     iinr             = nlist->iinr;
+     jindex           = nlist->jindex;
+     jjnr             = nlist->jjnr;
+     shiftidx         = nlist->shift;
+     gid              = nlist->gid;
+     shiftvec         = fr->shift_vec[0];
+     fshift           = fr->fshift[0];
+     facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+     charge           = mdatoms->chargeA;
+     sh_ewald         = gmx_fjsp_set1_v2r8(fr->ic->sh_ewald);
+     ewtab            = fr->ic->tabq_coul_FDV0;
+     ewtabscale       = gmx_fjsp_set1_v2r8(fr->ic->tabq_scale);
+     ewtabhalfspace   = gmx_fjsp_set1_v2r8(0.5/fr->ic->tabq_scale);
+     /* Setup water-specific parameters: the three i charges are taken once from the first list entry and scaled by facel. NOTE(review): iinr[0] is read before the nri loop guard below. */
+     inr              = nlist->iinr[0];
+     iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+0]));
+     iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+     iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+     /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+     rcutoff_scalar   = fr->rcoulomb;
+     rcutoff          = gmx_fjsp_set1_v2r8(rcutoff_scalar);
+     rcutoff2         = _fjsp_mul_v2r8(rcutoff,rcutoff);
+     rswitch_scalar   = fr->rcoulomb_switch;
+     rswitch          = gmx_fjsp_set1_v2r8(rswitch_scalar);
+     /* Setup switch parameters: polynomial coefficients for sw (swV3..swV5) and dsw (swF2..swF4), scaled by powers of the switching range width d_scalar */
+     d_scalar         = rcutoff_scalar-rswitch_scalar;
+     d                = gmx_fjsp_set1_v2r8(d_scalar); /* d is recomputed for every pair in the inner loop before it is read */
+     swV3             = gmx_fjsp_set1_v2r8(-10.0/(d_scalar*d_scalar*d_scalar));
+     swV4             = gmx_fjsp_set1_v2r8( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
+     swV5             = gmx_fjsp_set1_v2r8( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
+     swF2             = gmx_fjsp_set1_v2r8(-30.0/(d_scalar*d_scalar*d_scalar));
+     swF3             = gmx_fjsp_set1_v2r8( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
+     swF4             = gmx_fjsp_set1_v2r8(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
+     /* Give these a defined value up front to avoid compiler warnings */
+     jnrA = jnrB = 0;
+     j_coord_offsetA = 0;
+     j_coord_offsetB = 0;
+     outeriter        = 0;
+     inneriter        = 0;
+     /* Start outer loop over neighborlists */
+     for(iidx=0; iidx<nri; iidx++)
+     {
+         /* Load shift vector for this list */
+         i_shift_offset   = DIM*shiftidx[iidx];
+         /* Load limits for loop over neighbors */
+         j_index_start    = jindex[iidx];
+         j_index_end      = jindex[iidx+1];
+         /* Get outer coordinate index */
+         inr              = iinr[iidx];
+         i_coord_offset   = DIM*inr;
+         /* Load i particle coords and add shift vector */
+         gmx_fjsp_load_shift_and_3rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                  &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+         fix0             = _fjsp_setzero_v2r8();
+         fiy0             = _fjsp_setzero_v2r8();
+         fiz0             = _fjsp_setzero_v2r8();
+         fix1             = _fjsp_setzero_v2r8();
+         fiy1             = _fjsp_setzero_v2r8();
+         fiz1             = _fjsp_setzero_v2r8();
+         fix2             = _fjsp_setzero_v2r8();
+         fiy2             = _fjsp_setzero_v2r8();
+         fiz2             = _fjsp_setzero_v2r8();
+         /* Reset potential sums */
+         velecsum         = _fjsp_setzero_v2r8();
+         /* Start inner kernel loop: j neighbors are processed two at a time; an odd leftover is handled after the loop */
+         for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+         {
+             /* Get j neighbor index, and coordinate index */
+             jnrA             = jjnr[jidx];
+             jnrB             = jjnr[jidx+1];
+             j_coord_offsetA  = DIM*jnrA;
+             j_coord_offsetB  = DIM*jnrB;
+             /* load j atom coordinates */
+             gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                               &jx0,&jy0,&jz0);
+             /* Calculate displacement vector */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             dx10             = _fjsp_sub_v2r8(ix1,jx0);
+             dy10             = _fjsp_sub_v2r8(iy1,jy0);
+             dz10             = _fjsp_sub_v2r8(iz1,jz0);
+             dx20             = _fjsp_sub_v2r8(ix2,jx0);
+             dy20             = _fjsp_sub_v2r8(iy2,jy0);
+             dz20             = _fjsp_sub_v2r8(iz2,jz0);
+             /* Calculate squared distance and things based on it */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+             rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+             rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+             rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+             rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+             rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+             rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+             rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+             /* Load parameters for j particles */
+             jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+             fjx0             = _fjsp_setzero_v2r8();
+             fjy0             = _fjsp_setzero_v2r8();
+             fjz0             = _fjsp_setzero_v2r8();
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+             {
+             r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+             /* Compute parameters for interactions between i and j atoms */
+             qq00             = _fjsp_mul_v2r8(iq0,jq0);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r00,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp); /* the two table indices are then read back as ewconv.i[0] and ewconv.i[1] */
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq00,_fjsp_sub_v2r8(rinv00,velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,rinv00),_fjsp_sub_v2r8(rinvsq00,felec));
+             d                = _fjsp_sub_v2r8(r00,rswitch);
+             d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+             d2               = _fjsp_mul_v2r8(d,d);
+             sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+             dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+             /* Evaluate switch function */
+             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+             felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv00,_fjsp_mul_v2r8(velec,dsw)) );
+             velec            = _fjsp_mul_v2r8(velec,sw);
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             
+             fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq10,rcutoff2))
+             {
+             r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+             /* Compute parameters for interactions between i and j atoms */
+             qq10             = _fjsp_mul_v2r8(iq1,jq0);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r10,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq10,_fjsp_sub_v2r8(rinv10,velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,rinv10),_fjsp_sub_v2r8(rinvsq10,felec));
+             d                = _fjsp_sub_v2r8(r10,rswitch);
+             d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+             d2               = _fjsp_mul_v2r8(d,d);
+             sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+             dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+             /* Evaluate switch function */
+             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+             felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv10,_fjsp_mul_v2r8(velec,dsw)) );
+             velec            = _fjsp_mul_v2r8(velec,sw);
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq10,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+             
+             fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq20,rcutoff2))
+             {
+             r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+             /* Compute parameters for interactions between i and j atoms */
+             qq20             = _fjsp_mul_v2r8(iq2,jq0);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r20,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq20,_fjsp_sub_v2r8(rinv20,velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,rinv20),_fjsp_sub_v2r8(rinvsq20,felec));
+             d                = _fjsp_sub_v2r8(r20,rswitch);
+             d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+             d2               = _fjsp_mul_v2r8(d,d);
+             sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+             dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+             /* Evaluate switch function */
+             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+             felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv20,_fjsp_mul_v2r8(velec,dsw)) );
+             velec            = _fjsp_mul_v2r8(velec,sw);
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq20,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+             
+             fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+             }
+             gmx_fjsp_decrement_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0);
+             /* Inner loop uses 207 flops */
+         }
+         if(jidx<j_index_end)
+         { /* One j neighbor is left over (odd count): same computation with only jnrA; no B table entry or B force store is touched */
+             jnrA             = jjnr[jidx];
+             j_coord_offsetA  = DIM*jnrA;
+             /* load j atom coordinates */
+             gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                               &jx0,&jy0,&jz0);
+             /* Calculate displacement vector */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             dx10             = _fjsp_sub_v2r8(ix1,jx0);
+             dy10             = _fjsp_sub_v2r8(iy1,jy0);
+             dz10             = _fjsp_sub_v2r8(iz1,jz0);
+             dx20             = _fjsp_sub_v2r8(ix2,jx0);
+             dy20             = _fjsp_sub_v2r8(iy2,jy0);
+             dz20             = _fjsp_sub_v2r8(iz2,jz0);
+             /* Calculate squared distance and things based on it */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+             rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+             rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+             rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+             rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+             rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+             rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+             rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+             /* Load parameters for j particles */
+             jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
+             fjx0             = _fjsp_setzero_v2r8();
+             fjy0             = _fjsp_setzero_v2r8();
+             fjz0             = _fjsp_setzero_v2r8();
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+             {
+             r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+             /* Compute parameters for interactions between i and j atoms */
+             qq00             = _fjsp_mul_v2r8(iq0,jq0);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r00,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_setzero_v2r8(); /* no second j particle: zeros stand in for the table entry at ewconv.i[1] */
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq00,_fjsp_sub_v2r8(rinv00,velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,rinv00),_fjsp_sub_v2r8(rinvsq00,felec));
+             d                = _fjsp_sub_v2r8(r00,rswitch);
+             d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+             d2               = _fjsp_mul_v2r8(d,d);
+             sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+             dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+             /* Evaluate switch function */
+             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+             felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv00,_fjsp_mul_v2r8(velec,dsw)) );
+             velec            = _fjsp_mul_v2r8(velec,sw);
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             
+             fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq10,rcutoff2))
+             {
+             r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+             /* Compute parameters for interactions between i and j atoms */
+             qq10             = _fjsp_mul_v2r8(iq1,jq0);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r10,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq10,_fjsp_sub_v2r8(rinv10,velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,rinv10),_fjsp_sub_v2r8(rinvsq10,felec));
+             d                = _fjsp_sub_v2r8(r10,rswitch);
+             d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+             d2               = _fjsp_mul_v2r8(d,d);
+             sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+             dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+             /* Evaluate switch function */
+             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+             felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv10,_fjsp_mul_v2r8(velec,dsw)) );
+             velec            = _fjsp_mul_v2r8(velec,sw);
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq10,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+             
+             fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq20,rcutoff2))
+             {
+             r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+             /* Compute parameters for interactions between i and j atoms */
+             qq20             = _fjsp_mul_v2r8(iq2,jq0);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r20,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq20,_fjsp_sub_v2r8(rinv20,velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,rinv20),_fjsp_sub_v2r8(rinvsq20,felec));
+             d                = _fjsp_sub_v2r8(r20,rswitch);
+             d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+             d2               = _fjsp_mul_v2r8(d,d);
+             sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+             dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+             /* Evaluate switch function */
+             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+             felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv20,_fjsp_mul_v2r8(velec,dsw)) );
+             velec            = _fjsp_mul_v2r8(velec,sw);
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq20,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+             
+             fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+             }
+             gmx_fjsp_decrement_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0);
+             /* Inner loop uses 207 flops */
+         }
+         /* End of innermost loop */
+         gmx_fjsp_update_iforce_3atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                               f+i_coord_offset,fshift+i_shift_offset);
+         ggid                        = gid[iidx];
+         /* Update potential energies */
+         gmx_fjsp_update_1pot_v2r8(velecsum,kernel_data->energygrp_elec+ggid);
+         /* Increment number of inner iterations */
+         inneriter                  += j_index_end - j_index_start;
+         /* Outer loop uses 19 flops */
+     }
+     /* Increment number of outer iterations */
+     outeriter        += nri;
+     /* Update outer/inner flops: 19 per outer iteration plus 207 per inner iteration, booked under eNR_NBKERNEL_ELEC_W3_VF */
+     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3_VF,outeriter*19 + inneriter*207);
+ }
+ /*
+  * Gromacs nonbonded kernel:   nb_kernel_ElecEwSw_VdwNone_GeomW3P1_F_sparc64_hpc_ace_double
+  * Electrostatics interaction: Ewald
+  * VdW interaction:            None
+  * Geometry:                   Water3-Particle
+  * Calculate force/pot:        Force
+  *
+  * Force-only kernel. For every i entry of the neighborlist the three i atoms
+  * (suffixes 0,1,2) interact with each listed j particle through table-based
+  * Ewald electrostatics multiplied by a switch function that acts between
+  * fr->rcoulomb_switch and fr->rcoulomb. No energy is stored: velec is only
+  * evaluated because the switched force contains the term v*dsw/r.
+  *
+  * Parameters:
+  *   nlist        neighborlist; nri, iinr, jindex, jjnr and shift are used
+  *                (the gid pointer is fetched but not dereferenced here)
+  *   xx           coordinates, read as a flat real array starting at xx[0]
+  *   ff           forces, passed as a flat real array to the update helpers
+  *   fr           shift_vec, fshift, epsfac, rcoulomb, rcoulomb_switch and the
+  *                ic->sh_ewald / tabq_coul_FDV0 / tabq_scale fields are read
+  *   mdatoms      only chargeA is read
+  *   kernel_data  not referenced by this force-only kernel
+  *   nrnb         flop counters, updated once at the end through inc_nrnb()
+  */
+ void
+ nb_kernel_ElecEwSw_VdwNone_GeomW3P1_F_sparc64_hpc_ace_double
+                     (t_nblist * gmx_restrict                nlist,
+                      rvec * gmx_restrict                    xx,
+                      rvec * gmx_restrict                    ff,
+                      t_forcerec * gmx_restrict              fr,
+                      t_mdatoms * gmx_restrict               mdatoms,
+                      nb_kernel_data_t * gmx_restrict        kernel_data,
+                      t_nrnb * gmx_restrict                  nrnb)
+ {
+     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+      * just 0 for non-waters.
+      * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+      * jnr indices corresponding to data put in the two positions in the SIMD register.
+      *
+      * This is generated code: several locals declared below (e.g. tx, ty, tz, jidxall,
+      * velecsum, crf, krf, krf2, two, dummy_mask, vfconv, gbconv) are never referenced
+      * in this particular kernel.
+      */
+     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+     int              jnrA,jnrB;
+     int              j_coord_offsetA,j_coord_offsetB;
+     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+     real             rcutoff_scalar;
+     real             *shiftvec,*fshift,*x,*f;
+     _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+     int              vdwioffset0;
+     _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+     int              vdwioffset1;
+     _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+     int              vdwioffset2;
+     _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+     int              vdwjidx0A,vdwjidx0B;
+     _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+     _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+     _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+     _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+     _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+     real             *charge;
+     _fjsp_v2r8       ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+     real             *ewtab;
+     _fjsp_v2r8       rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
+     real             rswitch_scalar,d_scalar;
+     _fjsp_v2r8       itab_tmp;
+     _fjsp_v2r8       dummy_mask,cutoff_mask;
+     _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+     _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+     union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv; /* ewconv: SIMD table indices are stored here and read back as two scalars */
+     x                = xx[0];
+     f                = ff[0];
+     nri              = nlist->nri;
+     iinr             = nlist->iinr;
+     jindex           = nlist->jindex;
+     jjnr             = nlist->jjnr;
+     shiftidx         = nlist->shift;
+     gid              = nlist->gid;
+     shiftvec         = fr->shift_vec[0];
+     fshift           = fr->fshift[0];
+     facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+     charge           = mdatoms->chargeA;
+     sh_ewald         = gmx_fjsp_set1_v2r8(fr->ic->sh_ewald); /* set but not used again in this kernel */
+     ewtab            = fr->ic->tabq_coul_FDV0;
+     ewtabscale       = gmx_fjsp_set1_v2r8(fr->ic->tabq_scale);
+     ewtabhalfspace   = gmx_fjsp_set1_v2r8(0.5/fr->ic->tabq_scale);
+     /* Setup water-specific parameters
+      * The three i-atom charges are taken from the first list entry only (iinr[0])
+      * and pre-multiplied by facel; they are reused for every i entry.
+      * NOTE(review): presumably all i waters in the list carry identical charges - confirm.
+      * NOTE(review): iinr[0] is read before the outer loop, also when nri is 0.
+      */
+     inr              = nlist->iinr[0];
+     iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+0]));
+     iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+     iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+     /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+     rcutoff_scalar   = fr->rcoulomb;
+     rcutoff          = gmx_fjsp_set1_v2r8(rcutoff_scalar);
+     rcutoff2         = _fjsp_mul_v2r8(rcutoff,rcutoff);
+     rswitch_scalar   = fr->rcoulomb_switch;
+     rswitch          = gmx_fjsp_set1_v2r8(rswitch_scalar);
+     /* Setup switch parameters
+      * With D = rcutoff - rswitch the coefficients are -10/D^3, 15/D^4, -6/D^5 (swV3..5)
+      * and -30/D^3, 60/D^4, -30/D^5 (swF2..4). They are combined in the loops as
+      *   sw  = 1 + d^3*(swV3 + d*(swV4 + d*swV5))
+      *   dsw =     d^2*(swF2 + d*(swF3 + d*swF4))
+      * with d = max(r - rswitch, 0), assuming _fjsp_madd_v2r8(a,b,c) evaluates a*b+c.
+      * The vector d assigned here is overwritten in the loops before it is read.
+      */
+     d_scalar         = rcutoff_scalar-rswitch_scalar;
+     d                = gmx_fjsp_set1_v2r8(d_scalar);
+     swV3             = gmx_fjsp_set1_v2r8(-10.0/(d_scalar*d_scalar*d_scalar));
+     swV4             = gmx_fjsp_set1_v2r8( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
+     swV5             = gmx_fjsp_set1_v2r8( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
+     swF2             = gmx_fjsp_set1_v2r8(-30.0/(d_scalar*d_scalar*d_scalar));
+     swF3             = gmx_fjsp_set1_v2r8( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
+     swF4             = gmx_fjsp_set1_v2r8(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
+     /* Avoid stupid compiler warnings */
+     jnrA = jnrB = 0;
+     j_coord_offsetA = 0;
+     j_coord_offsetB = 0;
+     outeriter        = 0;
+     inneriter        = 0;
+     /* Start outer loop over neighborlists */
+     for(iidx=0; iidx<nri; iidx++)
+     {
+         /* Load shift vector for this list */
+         i_shift_offset   = DIM*shiftidx[iidx];
+         /* Load limits for loop over neighbors */
+         j_index_start    = jindex[iidx];
+         j_index_end      = jindex[iidx+1];
+         /* Get outer coordinate index */
+         inr              = iinr[iidx];
+         i_coord_offset   = DIM*inr;
+         /* Load i particle coords and add shift vector */
+         gmx_fjsp_load_shift_and_3rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                  &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+         /* Reset the i-force accumulators for this list entry */
+         fix0             = _fjsp_setzero_v2r8();
+         fiy0             = _fjsp_setzero_v2r8();
+         fiz0             = _fjsp_setzero_v2r8();
+         fix1             = _fjsp_setzero_v2r8();
+         fiy1             = _fjsp_setzero_v2r8();
+         fiz1             = _fjsp_setzero_v2r8();
+         fix2             = _fjsp_setzero_v2r8();
+         fiy2             = _fjsp_setzero_v2r8();
+         fiz2             = _fjsp_setzero_v2r8();
+         /* Start inner kernel loop
+          * Two j particles (A and B) are handled per iteration; a trailing odd j
+          * particle is processed by the separate block after this loop.
+          */
+         for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+         {
+             /* Get j neighbor index, and coordinate index */
+             jnrA             = jjnr[jidx];
+             jnrB             = jjnr[jidx+1];
+             j_coord_offsetA  = DIM*jnrA;
+             j_coord_offsetB  = DIM*jnrB;
+             /* load j atom coordinates */
+             gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                               &jx0,&jy0,&jz0);
+             /* Calculate displacement vector */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             dx10             = _fjsp_sub_v2r8(ix1,jx0);
+             dy10             = _fjsp_sub_v2r8(iy1,jy0);
+             dz10             = _fjsp_sub_v2r8(iz1,jz0);
+             dx20             = _fjsp_sub_v2r8(ix2,jx0);
+             dy20             = _fjsp_sub_v2r8(iy2,jy0);
+             dz20             = _fjsp_sub_v2r8(iz2,jz0);
+             /* Calculate squared distance and things based on it */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+             rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+             rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+             rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+             rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+             rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+             rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+             rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+             /* Load parameters for j particles */
+             jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+             /* The j force is accumulated over all three i atoms and applied once below */
+             fjx0             = _fjsp_setzero_v2r8();
+             fjy0             = _fjsp_setzero_v2r8();
+             fjz0             = _fjsp_setzero_v2r8();
+             /**************************
+              * CALCULATE INTERACTIONS *
+              * pair 00: i atom 0 vs j *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2)) /* presumably true when at least one lane is inside the cutoff - TODO confirm */
+             {
+             r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+             /* Compute parameters for interactions between i and j atoms */
+             qq00             = _fjsp_mul_v2r8(iq0,jq0);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer
+              * eweps is the remainder ewrt - index. The table is addressed with a stride of
+              * four reals per index: two consecutive reals are loaded at offset 0 and one at
+              * offset 2. After the transposes ewtabF/ewtabD/ewtabV presumably hold the
+              * F, D and V entries of both lanes (layout suggested by the name tabq_coul_FDV0).
+              */
+             ewrt             = _fjsp_mul_v2r8(r00,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq00,_fjsp_sub_v2r8(rinv00,velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,rinv00),_fjsp_sub_v2r8(rinvsq00,felec));
+             d                = _fjsp_sub_v2r8(r00,rswitch);
+             d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+             d2               = _fjsp_mul_v2r8(d,d);
+             sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+             dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+             /* Evaluate switch function */
+             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+             felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv00,_fjsp_mul_v2r8(velec,dsw)) );
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask); /* presumably clears lanes with rsq >= rcutoff2 - TODO confirm */
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             
+             fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              * pair 10: i atom 1 vs j *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq10,rcutoff2))
+             {
+             r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+             /* Compute parameters for interactions between i and j atoms */
+             qq10             = _fjsp_mul_v2r8(iq1,jq0);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r10,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq10,_fjsp_sub_v2r8(rinv10,velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,rinv10),_fjsp_sub_v2r8(rinvsq10,felec));
+             d                = _fjsp_sub_v2r8(r10,rswitch);
+             d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+             d2               = _fjsp_mul_v2r8(d,d);
+             sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+             dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+             /* Evaluate switch function */
+             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+             felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv10,_fjsp_mul_v2r8(velec,dsw)) );
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq10,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+             
+             fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              * pair 20: i atom 2 vs j *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq20,rcutoff2))
+             {
+             r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+             /* Compute parameters for interactions between i and j atoms */
+             qq20             = _fjsp_mul_v2r8(iq2,jq0);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r20,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq20,_fjsp_sub_v2r8(rinv20,velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,rinv20),_fjsp_sub_v2r8(rinvsq20,felec));
+             d                = _fjsp_sub_v2r8(r20,rswitch);
+             d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+             d2               = _fjsp_mul_v2r8(d,d);
+             sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+             dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+             /* Evaluate switch function */
+             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+             felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv20,_fjsp_mul_v2r8(velec,dsw)) );
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq20,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+             
+             fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+             }
+             gmx_fjsp_decrement_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0); /* apply the summed j force to both j particles */
+             /* Inner loop uses 198 flops */
+         }
+         if(jidx<j_index_end) /* odd number of j entries: one particle (A) is left over */
+         {
+             jnrA             = jjnr[jidx];
+             j_coord_offsetA  = DIM*jnrA;
+             /* load j atom coordinates */
+             gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                               &jx0,&jy0,&jz0);
+             /* Calculate displacement vector */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             dx10             = _fjsp_sub_v2r8(ix1,jx0);
+             dy10             = _fjsp_sub_v2r8(iy1,jy0);
+             dz10             = _fjsp_sub_v2r8(iz1,jz0);
+             dx20             = _fjsp_sub_v2r8(ix2,jx0);
+             dy20             = _fjsp_sub_v2r8(iy2,jy0);
+             dz20             = _fjsp_sub_v2r8(iz2,jz0);
+             /* Calculate squared distance and things based on it */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+             rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+             rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+             rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+             rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+             rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+             rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+             rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+             /* Load parameters for j particles */
+             jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
+             fjx0             = _fjsp_setzero_v2r8();
+             fjy0             = _fjsp_setzero_v2r8();
+             fjz0             = _fjsp_setzero_v2r8();
+             /**************************
+              * CALCULATE INTERACTIONS *
+              * pair 00: i atom 0 vs j *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+             {
+             r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+             /* Compute parameters for interactions between i and j atoms */
+             qq00             = _fjsp_mul_v2r8(iq0,jq0);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer
+              * Only ewconv.i[0] is used for table loads here; the second operand of each
+              * transpose is zero instead of table data for a second j particle.
+              */
+             ewrt             = _fjsp_mul_v2r8(r00,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq00,_fjsp_sub_v2r8(rinv00,velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,rinv00),_fjsp_sub_v2r8(rinvsq00,felec));
+             d                = _fjsp_sub_v2r8(r00,rswitch);
+             d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+             d2               = _fjsp_mul_v2r8(d,d);
+             sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+             dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+             /* Evaluate switch function */
+             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+             felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv00,_fjsp_mul_v2r8(velec,dsw)) );
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8()); /* presumably keeps lane A and zeroes lane B so the absent j adds no force - TODO confirm */
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             
+             fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              * pair 10: i atom 1 vs j *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq10,rcutoff2))
+             {
+             r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+             /* Compute parameters for interactions between i and j atoms */
+             qq10             = _fjsp_mul_v2r8(iq1,jq0);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r10,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq10,_fjsp_sub_v2r8(rinv10,velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,rinv10),_fjsp_sub_v2r8(rinvsq10,felec));
+             d                = _fjsp_sub_v2r8(r10,rswitch);
+             d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+             d2               = _fjsp_mul_v2r8(d,d);
+             sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+             dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+             /* Evaluate switch function */
+             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+             felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv10,_fjsp_mul_v2r8(velec,dsw)) );
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq10,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+             
+             fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              * pair 20: i atom 2 vs j *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq20,rcutoff2))
+             {
+             r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+             /* Compute parameters for interactions between i and j atoms */
+             qq20             = _fjsp_mul_v2r8(iq2,jq0);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r20,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq20,_fjsp_sub_v2r8(rinv20,velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,rinv20),_fjsp_sub_v2r8(rinvsq20,felec));
+             d                = _fjsp_sub_v2r8(r20,rswitch);
+             d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+             d2               = _fjsp_mul_v2r8(d,d);
+             sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+             dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+             /* Evaluate switch function */
+             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+             felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv20,_fjsp_mul_v2r8(velec,dsw)) );
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq20,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+             
+             fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+             }
+             gmx_fjsp_decrement_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0); /* apply the summed j force to the single j particle */
+             /* Inner loop uses 198 flops */
+         }
+         /* End of innermost loop
+          * The nine i-force accumulators are handed to the helper together with the
+          * force and shift-force destinations of this i entry.
+          */
+         gmx_fjsp_update_iforce_3atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                               f+i_coord_offset,fshift+i_shift_offset);
+         /* Increment number of inner iterations */
+         inneriter                  += j_index_end - j_index_start;
+         /* Outer loop uses 18 flops */
+     }
+     /* Increment number of outer iterations */
+     outeriter        += nri;
+     /* Update outer/inner flops
+      * 18 flops per outer iteration and 198 per j particle, matching the loop comments above.
+      */
+     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3_F,outeriter*18 + inneriter*198);
+ }
index 0000000000000000000000000000000000000000,a176d61a4b227f9fe396d40627f75af24791f10e..a176d61a4b227f9fe396d40627f75af24791f10e
mode 000000,100644..100644
--- /dev/null
@@@ -1,0 -1,2864 +1,2864 @@@
+ /*
+  * This file is part of the GROMACS molecular simulation package.
+  *
+  * Copyright (c) 2012, by the GROMACS development team, led by
+  * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+  * others, as listed in the AUTHORS file in the top-level source
+  * directory and at http://www.gromacs.org.
+  *
+  * GROMACS is free software; you can redistribute it and/or
+  * modify it under the terms of the GNU Lesser General Public License
+  * as published by the Free Software Foundation; either version 2.1
+  * of the License, or (at your option) any later version.
+  *
+  * GROMACS is distributed in the hope that it will be useful,
+  * but WITHOUT ANY WARRANTY; without even the implied warranty of
+  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  * Lesser General Public License for more details.
+  *
+  * You should have received a copy of the GNU Lesser General Public
+  * License along with GROMACS; if not, see
+  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+  *
+  * If you want to redistribute modifications to GROMACS, please
+  * consider that scientific software is very special. Version
+  * control is crucial - bugs must be traceable. We will be happy to
+  * consider code for inclusion in the official distribution, but
+  * derived work must not be called official GROMACS. Details are found
+  * in the README & COPYING files - if they are missing, get the
+  * official version at http://www.gromacs.org.
+  *
+  * To help us fund GROMACS development, we humbly ask that you cite
+  * the research papers on the package. Check out http://www.gromacs.org.
+  */
+ /*
+  * Note: this file was generated by the GROMACS sparc64_hpc_ace_double kernel generator.
+  */
+ #ifdef HAVE_CONFIG_H
+ #include <config.h>
+ #endif
+ #include <math.h>
+ #include "../nb_kernel.h"
+ #include "types/simple.h"
+ #include "vec.h"
+ #include "nrnb.h"
+ #include "kernelutil_sparc64_hpc_ace_double.h"
+ /*
+  * Gromacs nonbonded kernel:   nb_kernel_ElecEwSw_VdwNone_GeomW3W3_VF_sparc64_hpc_ace_double
+  * Electrostatics interaction: Ewald
+  * VdW interaction:            None
+  * Geometry:                   Water3-Water3
+  * Calculate force/pot:        PotentialAndForce
+  */
+ void
+ nb_kernel_ElecEwSw_VdwNone_GeomW3W3_VF_sparc64_hpc_ace_double
+                     (t_nblist * gmx_restrict                nlist,
+                      rvec * gmx_restrict                    xx,
+                      rvec * gmx_restrict                    ff,
+                      t_forcerec * gmx_restrict              fr,
+                      t_mdatoms * gmx_restrict               mdatoms,
+                      nb_kernel_data_t * gmx_restrict        kernel_data,
+                      t_nrnb * gmx_restrict                  nrnb)
+ {
+     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+      * just 0 for non-waters.
+      * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+      * jnr indices corresponding to data put in the two positions in the SIMD register.
+      */
+     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+     int              jnrA,jnrB;
+     int              j_coord_offsetA,j_coord_offsetB;
+     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+     real             rcutoff_scalar;
+     real             *shiftvec,*fshift,*x,*f;
+     _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+     int              vdwioffset0;
+     _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+     int              vdwioffset1;
+     _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+     int              vdwioffset2;
+     _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+     int              vdwjidx0A,vdwjidx0B;
+     _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+     int              vdwjidx1A,vdwjidx1B;
+     _fjsp_v2r8       jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+     int              vdwjidx2A,vdwjidx2B;
+     _fjsp_v2r8       jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+     _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+     _fjsp_v2r8       dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
+     _fjsp_v2r8       dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
+     _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+     _fjsp_v2r8       dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+     _fjsp_v2r8       dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+     _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+     _fjsp_v2r8       dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+     _fjsp_v2r8       dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+     _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+     real             *charge;
+     _fjsp_v2r8       ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+     real             *ewtab;
+     _fjsp_v2r8       rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
+     real             rswitch_scalar,d_scalar;
+     _fjsp_v2r8       itab_tmp;
+     _fjsp_v2r8       dummy_mask,cutoff_mask;
+     _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+     _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+     union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+     x                = xx[0];
+     f                = ff[0];
+     nri              = nlist->nri;
+     iinr             = nlist->iinr;
+     jindex           = nlist->jindex;
+     jjnr             = nlist->jjnr;
+     shiftidx         = nlist->shift;
+     gid              = nlist->gid;
+     shiftvec         = fr->shift_vec[0];
+     fshift           = fr->fshift[0];
+     facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+     charge           = mdatoms->chargeA;
+     sh_ewald         = gmx_fjsp_set1_v2r8(fr->ic->sh_ewald);
+     ewtab            = fr->ic->tabq_coul_FDV0;
+     ewtabscale       = gmx_fjsp_set1_v2r8(fr->ic->tabq_scale);
+     ewtabhalfspace   = gmx_fjsp_set1_v2r8(0.5/fr->ic->tabq_scale);
+     /* Setup water-specific parameters */
+     inr              = nlist->iinr[0];
+     iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+0]));
+     iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+     iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+     jq0              = gmx_fjsp_set1_v2r8(charge[inr+0]);
+     jq1              = gmx_fjsp_set1_v2r8(charge[inr+1]);
+     jq2              = gmx_fjsp_set1_v2r8(charge[inr+2]);
+     qq00             = _fjsp_mul_v2r8(iq0,jq0);
+     qq01             = _fjsp_mul_v2r8(iq0,jq1);
+     qq02             = _fjsp_mul_v2r8(iq0,jq2);
+     qq10             = _fjsp_mul_v2r8(iq1,jq0);
+     qq11             = _fjsp_mul_v2r8(iq1,jq1);
+     qq12             = _fjsp_mul_v2r8(iq1,jq2);
+     qq20             = _fjsp_mul_v2r8(iq2,jq0);
+     qq21             = _fjsp_mul_v2r8(iq2,jq1);
+     qq22             = _fjsp_mul_v2r8(iq2,jq2);
+     /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+     rcutoff_scalar   = fr->rcoulomb;
+     rcutoff          = gmx_fjsp_set1_v2r8(rcutoff_scalar);
+     rcutoff2         = _fjsp_mul_v2r8(rcutoff,rcutoff);
+     rswitch_scalar   = fr->rcoulomb_switch;
+     rswitch          = gmx_fjsp_set1_v2r8(rswitch_scalar);
+     /* Setup switch parameters */
+     d_scalar         = rcutoff_scalar-rswitch_scalar;
+     d                = gmx_fjsp_set1_v2r8(d_scalar);
+     swV3             = gmx_fjsp_set1_v2r8(-10.0/(d_scalar*d_scalar*d_scalar));
+     swV4             = gmx_fjsp_set1_v2r8( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
+     swV5             = gmx_fjsp_set1_v2r8( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
+     swF2             = gmx_fjsp_set1_v2r8(-30.0/(d_scalar*d_scalar*d_scalar));
+     swF3             = gmx_fjsp_set1_v2r8( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
+     swF4             = gmx_fjsp_set1_v2r8(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
+     /* Avoid stupid compiler warnings */
+     jnrA = jnrB = 0;
+     j_coord_offsetA = 0;
+     j_coord_offsetB = 0;
+     outeriter        = 0;
+     inneriter        = 0;
+     /* Start outer loop over neighborlists */
+     for(iidx=0; iidx<nri; iidx++)
+     {
+         /* Load shift vector for this list */
+         i_shift_offset   = DIM*shiftidx[iidx];
+         /* Load limits for loop over neighbors */
+         j_index_start    = jindex[iidx];
+         j_index_end      = jindex[iidx+1];
+         /* Get outer coordinate index */
+         inr              = iinr[iidx];
+         i_coord_offset   = DIM*inr;
+         /* Load i particle coords and add shift vector */
+         gmx_fjsp_load_shift_and_3rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                  &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+         fix0             = _fjsp_setzero_v2r8();
+         fiy0             = _fjsp_setzero_v2r8();
+         fiz0             = _fjsp_setzero_v2r8();
+         fix1             = _fjsp_setzero_v2r8();
+         fiy1             = _fjsp_setzero_v2r8();
+         fiz1             = _fjsp_setzero_v2r8();
+         fix2             = _fjsp_setzero_v2r8();
+         fiy2             = _fjsp_setzero_v2r8();
+         fiz2             = _fjsp_setzero_v2r8();
+         /* Reset potential sums */
+         velecsum         = _fjsp_setzero_v2r8();
+         /* Start inner kernel loop */
+         for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+         {
+             /* Get j neighbor index, and coordinate index */
+             jnrA             = jjnr[jidx];
+             jnrB             = jjnr[jidx+1];
+             j_coord_offsetA  = DIM*jnrA;
+             j_coord_offsetB  = DIM*jnrB;
+             /* load j atom coordinates */
+             gmx_fjsp_load_3rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+             /* Calculate displacement vector */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             dx01             = _fjsp_sub_v2r8(ix0,jx1);
+             dy01             = _fjsp_sub_v2r8(iy0,jy1);
+             dz01             = _fjsp_sub_v2r8(iz0,jz1);
+             dx02             = _fjsp_sub_v2r8(ix0,jx2);
+             dy02             = _fjsp_sub_v2r8(iy0,jy2);
+             dz02             = _fjsp_sub_v2r8(iz0,jz2);
+             dx10             = _fjsp_sub_v2r8(ix1,jx0);
+             dy10             = _fjsp_sub_v2r8(iy1,jy0);
+             dz10             = _fjsp_sub_v2r8(iz1,jz0);
+             dx11             = _fjsp_sub_v2r8(ix1,jx1);
+             dy11             = _fjsp_sub_v2r8(iy1,jy1);
+             dz11             = _fjsp_sub_v2r8(iz1,jz1);
+             dx12             = _fjsp_sub_v2r8(ix1,jx2);
+             dy12             = _fjsp_sub_v2r8(iy1,jy2);
+             dz12             = _fjsp_sub_v2r8(iz1,jz2);
+             dx20             = _fjsp_sub_v2r8(ix2,jx0);
+             dy20             = _fjsp_sub_v2r8(iy2,jy0);
+             dz20             = _fjsp_sub_v2r8(iz2,jz0);
+             dx21             = _fjsp_sub_v2r8(ix2,jx1);
+             dy21             = _fjsp_sub_v2r8(iy2,jy1);
+             dz21             = _fjsp_sub_v2r8(iz2,jz1);
+             dx22             = _fjsp_sub_v2r8(ix2,jx2);
+             dy22             = _fjsp_sub_v2r8(iy2,jy2);
+             dz22             = _fjsp_sub_v2r8(iz2,jz2);
+             /* Calculate squared distance and things based on it */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rsq01            = gmx_fjsp_calc_rsq_v2r8(dx01,dy01,dz01);
+             rsq02            = gmx_fjsp_calc_rsq_v2r8(dx02,dy02,dz02);
+             rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+             rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+             rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+             rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+             rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+             rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+             rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+             rinv01           = gmx_fjsp_invsqrt_v2r8(rsq01);
+             rinv02           = gmx_fjsp_invsqrt_v2r8(rsq02);
+             rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+             rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+             rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+             rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+             rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+             rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+             rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+             rinvsq01         = _fjsp_mul_v2r8(rinv01,rinv01);
+             rinvsq02         = _fjsp_mul_v2r8(rinv02,rinv02);
+             rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+             rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+             rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+             rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+             rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+             rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+             fjx0             = _fjsp_setzero_v2r8();
+             fjy0             = _fjsp_setzero_v2r8();
+             fjz0             = _fjsp_setzero_v2r8();
+             fjx1             = _fjsp_setzero_v2r8();
+             fjy1             = _fjsp_setzero_v2r8();
+             fjz1             = _fjsp_setzero_v2r8();
+             fjx2             = _fjsp_setzero_v2r8();
+             fjy2             = _fjsp_setzero_v2r8();
+             fjz2             = _fjsp_setzero_v2r8();
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+             {
+             r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r00,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq00,_fjsp_sub_v2r8(rinv00,velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,rinv00),_fjsp_sub_v2r8(rinvsq00,felec));
+             d                = _fjsp_sub_v2r8(r00,rswitch);
+             d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+             d2               = _fjsp_mul_v2r8(d,d);
+             sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+             dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+             /* Evaluate switch function */
+             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+             felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv00,_fjsp_mul_v2r8(velec,dsw)) );
+             velec            = _fjsp_mul_v2r8(velec,sw);
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             
+             fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq01,rcutoff2))
+             {
+             r01              = _fjsp_mul_v2r8(rsq01,rinv01);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r01,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq01,_fjsp_sub_v2r8(rinv01,velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq01,rinv01),_fjsp_sub_v2r8(rinvsq01,felec));
+             d                = _fjsp_sub_v2r8(r01,rswitch);
+             d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+             d2               = _fjsp_mul_v2r8(d,d);
+             sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+             dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+             /* Evaluate switch function */
+             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+             felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv01,_fjsp_mul_v2r8(velec,dsw)) );
+             velec            = _fjsp_mul_v2r8(velec,sw);
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq01,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx01,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy01,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz01,fscal,fiz0);
+             
+             fjx1             = _fjsp_madd_v2r8(dx01,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy01,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz01,fscal,fjz1);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq02,rcutoff2))
+             {
+             r02              = _fjsp_mul_v2r8(rsq02,rinv02);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r02,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq02,_fjsp_sub_v2r8(rinv02,velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq02,rinv02),_fjsp_sub_v2r8(rinvsq02,felec));
+             d                = _fjsp_sub_v2r8(r02,rswitch);
+             d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+             d2               = _fjsp_mul_v2r8(d,d);
+             sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+             dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+             /* Evaluate switch function */
+             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+             felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv02,_fjsp_mul_v2r8(velec,dsw)) );
+             velec            = _fjsp_mul_v2r8(velec,sw);
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq02,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx02,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy02,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz02,fscal,fiz0);
+             
+             fjx2             = _fjsp_madd_v2r8(dx02,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy02,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz02,fscal,fjz2);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq10,rcutoff2))
+             {
+             r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r10,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq10,_fjsp_sub_v2r8(rinv10,velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,rinv10),_fjsp_sub_v2r8(rinvsq10,felec));
+             d                = _fjsp_sub_v2r8(r10,rswitch);
+             d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+             d2               = _fjsp_mul_v2r8(d,d);
+             sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+             dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+             /* Evaluate switch function */
+             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+             felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv10,_fjsp_mul_v2r8(velec,dsw)) );
+             velec            = _fjsp_mul_v2r8(velec,sw);
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq10,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+             
+             fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq11,rcutoff2))
+             {
+             r11              = _fjsp_mul_v2r8(rsq11,rinv11);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r11,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq11,_fjsp_sub_v2r8(rinv11,velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq11,rinv11),_fjsp_sub_v2r8(rinvsq11,felec));
+             d                = _fjsp_sub_v2r8(r11,rswitch);
+             d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+             d2               = _fjsp_mul_v2r8(d,d);
+             sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+             dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+             /* Evaluate switch function */
+             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+             felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv11,_fjsp_mul_v2r8(velec,dsw)) );
+             velec            = _fjsp_mul_v2r8(velec,sw);
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq11,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+             
+             fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq12,rcutoff2))
+             {
+             r12              = _fjsp_mul_v2r8(rsq12,rinv12);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r12,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq12,_fjsp_sub_v2r8(rinv12,velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq12,rinv12),_fjsp_sub_v2r8(rinvsq12,felec));
+             d                = _fjsp_sub_v2r8(r12,rswitch);
+             d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+             d2               = _fjsp_mul_v2r8(d,d);
+             sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+             dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+             /* Evaluate switch function */
+             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+             felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv12,_fjsp_mul_v2r8(velec,dsw)) );
+             velec            = _fjsp_mul_v2r8(velec,sw);
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq12,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+             
+             fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq20,rcutoff2))
+             {
+             r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r20,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq20,_fjsp_sub_v2r8(rinv20,velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,rinv20),_fjsp_sub_v2r8(rinvsq20,felec));
+             d                = _fjsp_sub_v2r8(r20,rswitch);
+             d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+             d2               = _fjsp_mul_v2r8(d,d);
+             sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+             dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+             /* Evaluate switch function */
+             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+             felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv20,_fjsp_mul_v2r8(velec,dsw)) );
+             velec            = _fjsp_mul_v2r8(velec,sw);
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq20,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+             
+             fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq21,rcutoff2))
+             {
+             r21              = _fjsp_mul_v2r8(rsq21,rinv21);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r21,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq21,_fjsp_sub_v2r8(rinv21,velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq21,rinv21),_fjsp_sub_v2r8(rinvsq21,felec));
+             d                = _fjsp_sub_v2r8(r21,rswitch);
+             d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+             d2               = _fjsp_mul_v2r8(d,d);
+             sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+             dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+             /* Evaluate switch function */
+             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+             felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv21,_fjsp_mul_v2r8(velec,dsw)) );
+             velec            = _fjsp_mul_v2r8(velec,sw);
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq21,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+             
+             fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq22,rcutoff2))
+             {
+             r22              = _fjsp_mul_v2r8(rsq22,rinv22);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r22,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq22,_fjsp_sub_v2r8(rinv22,velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq22,rinv22),_fjsp_sub_v2r8(rinvsq22,felec));
+             d                = _fjsp_sub_v2r8(r22,rswitch);
+             d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+             d2               = _fjsp_mul_v2r8(d,d);
+             sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+             dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+             /* Evaluate switch function */
+             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+             felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv22,_fjsp_mul_v2r8(velec,dsw)) );
+             velec            = _fjsp_mul_v2r8(velec,sw);
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq22,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+             
+             fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+             }
+             gmx_fjsp_decrement_3rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+             /* Inner loop uses 612 flops */
+         }
+         if(jidx<j_index_end)
+         {
+             jnrA             = jjnr[jidx];
+             j_coord_offsetA  = DIM*jnrA;
+             /* load j atom coordinates */
+             gmx_fjsp_load_3rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+             /* Calculate displacement vector */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             dx01             = _fjsp_sub_v2r8(ix0,jx1);
+             dy01             = _fjsp_sub_v2r8(iy0,jy1);
+             dz01             = _fjsp_sub_v2r8(iz0,jz1);
+             dx02             = _fjsp_sub_v2r8(ix0,jx2);
+             dy02             = _fjsp_sub_v2r8(iy0,jy2);
+             dz02             = _fjsp_sub_v2r8(iz0,jz2);
+             dx10             = _fjsp_sub_v2r8(ix1,jx0);
+             dy10             = _fjsp_sub_v2r8(iy1,jy0);
+             dz10             = _fjsp_sub_v2r8(iz1,jz0);
+             dx11             = _fjsp_sub_v2r8(ix1,jx1);
+             dy11             = _fjsp_sub_v2r8(iy1,jy1);
+             dz11             = _fjsp_sub_v2r8(iz1,jz1);
+             dx12             = _fjsp_sub_v2r8(ix1,jx2);
+             dy12             = _fjsp_sub_v2r8(iy1,jy2);
+             dz12             = _fjsp_sub_v2r8(iz1,jz2);
+             dx20             = _fjsp_sub_v2r8(ix2,jx0);
+             dy20             = _fjsp_sub_v2r8(iy2,jy0);
+             dz20             = _fjsp_sub_v2r8(iz2,jz0);
+             dx21             = _fjsp_sub_v2r8(ix2,jx1);
+             dy21             = _fjsp_sub_v2r8(iy2,jy1);
+             dz21             = _fjsp_sub_v2r8(iz2,jz1);
+             dx22             = _fjsp_sub_v2r8(ix2,jx2);
+             dy22             = _fjsp_sub_v2r8(iy2,jy2);
+             dz22             = _fjsp_sub_v2r8(iz2,jz2);
+             /* Calculate squared distance and things based on it */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rsq01            = gmx_fjsp_calc_rsq_v2r8(dx01,dy01,dz01);
+             rsq02            = gmx_fjsp_calc_rsq_v2r8(dx02,dy02,dz02);
+             rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+             rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+             rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+             rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+             rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+             rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+             rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+             rinv01           = gmx_fjsp_invsqrt_v2r8(rsq01);
+             rinv02           = gmx_fjsp_invsqrt_v2r8(rsq02);
+             rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+             rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+             rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+             rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+             rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+             rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+             rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+             rinvsq01         = _fjsp_mul_v2r8(rinv01,rinv01);
+             rinvsq02         = _fjsp_mul_v2r8(rinv02,rinv02);
+             rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+             rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+             rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+             rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+             rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+             rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+             fjx0             = _fjsp_setzero_v2r8();
+             fjy0             = _fjsp_setzero_v2r8();
+             fjz0             = _fjsp_setzero_v2r8();
+             fjx1             = _fjsp_setzero_v2r8();
+             fjy1             = _fjsp_setzero_v2r8();
+             fjz1             = _fjsp_setzero_v2r8();
+             fjx2             = _fjsp_setzero_v2r8();
+             fjy2             = _fjsp_setzero_v2r8();
+             fjz2             = _fjsp_setzero_v2r8();
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+             {
+             r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r00,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq00,_fjsp_sub_v2r8(rinv00,velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,rinv00),_fjsp_sub_v2r8(rinvsq00,felec));
+             d                = _fjsp_sub_v2r8(r00,rswitch);
+             d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+             d2               = _fjsp_mul_v2r8(d,d);
+             sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+             dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+             /* Evaluate switch function */
+             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+             felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv00,_fjsp_mul_v2r8(velec,dsw)) );
+             velec            = _fjsp_mul_v2r8(velec,sw);
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             
+             fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq01,rcutoff2))
+             {
+             r01              = _fjsp_mul_v2r8(rsq01,rinv01);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r01,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq01,_fjsp_sub_v2r8(rinv01,velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq01,rinv01),_fjsp_sub_v2r8(rinvsq01,felec));
+             d                = _fjsp_sub_v2r8(r01,rswitch);
+             d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+             d2               = _fjsp_mul_v2r8(d,d);
+             sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+             dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+             /* Evaluate switch function */
+             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+             felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv01,_fjsp_mul_v2r8(velec,dsw)) );
+             velec            = _fjsp_mul_v2r8(velec,sw);
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq01,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx01,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy01,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz01,fscal,fiz0);
+             
+             fjx1             = _fjsp_madd_v2r8(dx01,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy01,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz01,fscal,fjz1);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq02,rcutoff2))
+             {
+             r02              = _fjsp_mul_v2r8(rsq02,rinv02);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r02,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq02,_fjsp_sub_v2r8(rinv02,velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq02,rinv02),_fjsp_sub_v2r8(rinvsq02,felec));
+             d                = _fjsp_sub_v2r8(r02,rswitch);
+             d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+             d2               = _fjsp_mul_v2r8(d,d);
+             sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+             dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+             /* Evaluate switch function */
+             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+             felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv02,_fjsp_mul_v2r8(velec,dsw)) );
+             velec            = _fjsp_mul_v2r8(velec,sw);
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq02,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx02,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy02,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz02,fscal,fiz0);
+             
+             fjx2             = _fjsp_madd_v2r8(dx02,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy02,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz02,fscal,fjz2);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq10,rcutoff2))
+             {
+             r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r10,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq10,_fjsp_sub_v2r8(rinv10,velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,rinv10),_fjsp_sub_v2r8(rinvsq10,felec));
+             d                = _fjsp_sub_v2r8(r10,rswitch);
+             d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+             d2               = _fjsp_mul_v2r8(d,d);
+             sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+             dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+             /* Evaluate switch function */
+             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+             felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv10,_fjsp_mul_v2r8(velec,dsw)) );
+             velec            = _fjsp_mul_v2r8(velec,sw);
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq10,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+             
+             fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq11,rcutoff2))
+             {
+             r11              = _fjsp_mul_v2r8(rsq11,rinv11);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r11,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq11,_fjsp_sub_v2r8(rinv11,velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq11,rinv11),_fjsp_sub_v2r8(rinvsq11,felec));
+             d                = _fjsp_sub_v2r8(r11,rswitch);
+             d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+             d2               = _fjsp_mul_v2r8(d,d);
+             sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+             dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+             /* Evaluate switch function */
+             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+             felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv11,_fjsp_mul_v2r8(velec,dsw)) );
+             velec            = _fjsp_mul_v2r8(velec,sw);
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq11,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+             
+             fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq12,rcutoff2))
+             {
+             r12              = _fjsp_mul_v2r8(rsq12,rinv12);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r12,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq12,_fjsp_sub_v2r8(rinv12,velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq12,rinv12),_fjsp_sub_v2r8(rinvsq12,felec));
+             d                = _fjsp_sub_v2r8(r12,rswitch);
+             d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+             d2               = _fjsp_mul_v2r8(d,d);
+             sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+             dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+             /* Evaluate switch function */
+             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+             felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv12,_fjsp_mul_v2r8(velec,dsw)) );
+             velec            = _fjsp_mul_v2r8(velec,sw);
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq12,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+             
+             fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq20,rcutoff2))
+             {
+             r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r20,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq20,_fjsp_sub_v2r8(rinv20,velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,rinv20),_fjsp_sub_v2r8(rinvsq20,felec));
+             d                = _fjsp_sub_v2r8(r20,rswitch);
+             d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+             d2               = _fjsp_mul_v2r8(d,d);
+             sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+             dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+             /* Evaluate switch function */
+             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+             felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv20,_fjsp_mul_v2r8(velec,dsw)) );
+             velec            = _fjsp_mul_v2r8(velec,sw);
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq20,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+             
+             fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq21,rcutoff2))
+             {
+             r21              = _fjsp_mul_v2r8(rsq21,rinv21);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r21,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq21,_fjsp_sub_v2r8(rinv21,velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq21,rinv21),_fjsp_sub_v2r8(rinvsq21,felec));
+             d                = _fjsp_sub_v2r8(r21,rswitch);
+             d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+             d2               = _fjsp_mul_v2r8(d,d);
+             sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+             dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+             /* Evaluate switch function */
+             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+             felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv21,_fjsp_mul_v2r8(velec,dsw)) );
+             velec            = _fjsp_mul_v2r8(velec,sw);
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq21,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+             
+             fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq22,rcutoff2))
+             {
+             r22              = _fjsp_mul_v2r8(rsq22,rinv22);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r22,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq22,_fjsp_sub_v2r8(rinv22,velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq22,rinv22),_fjsp_sub_v2r8(rinvsq22,felec));
+             d                = _fjsp_sub_v2r8(r22,rswitch);
+             d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+             d2               = _fjsp_mul_v2r8(d,d);
+             sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+             dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+             /* Evaluate switch function */
+             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+             felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv22,_fjsp_mul_v2r8(velec,dsw)) );
+             velec            = _fjsp_mul_v2r8(velec,sw);
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq22,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+             
+             fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+             }
+             gmx_fjsp_decrement_3rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+             /* Inner loop uses 612 flops */
+         }
+         /* End of innermost loop */
+         gmx_fjsp_update_iforce_3atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                               f+i_coord_offset,fshift+i_shift_offset);
+         ggid                        = gid[iidx];
+         /* Update potential energies */
+         gmx_fjsp_update_1pot_v2r8(velecsum,kernel_data->energygrp_elec+ggid);
+         /* Increment number of inner iterations */
+         inneriter                  += j_index_end - j_index_start;
+         /* Outer loop uses 19 flops */
+     }
+     /* Increment number of outer iterations */
+     outeriter        += nri;
+     /* Update outer/inner flops */
+     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3W3_VF,outeriter*19 + inneriter*612);
+ }
+ /*
+  * Gromacs nonbonded kernel:   nb_kernel_ElecEwSw_VdwNone_GeomW3W3_F_sparc64_hpc_ace_double
+  * Electrostatics interaction: Ewald
+  * VdW interaction:            None
+  * Geometry:                   Water3-Water3
+  * Calculate force/pot:        Force
+  */
+ void
+ nb_kernel_ElecEwSw_VdwNone_GeomW3W3_F_sparc64_hpc_ace_double
+                     (t_nblist * gmx_restrict                nlist,
+                      rvec * gmx_restrict                    xx,
+                      rvec * gmx_restrict                    ff,
+                      t_forcerec * gmx_restrict              fr,
+                      t_mdatoms * gmx_restrict               mdatoms,
+                      nb_kernel_data_t * gmx_restrict        kernel_data,
+                      t_nrnb * gmx_restrict                  nrnb)
+ {
+     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+      * just 0 for non-waters.
+      * Suffixes A,B refer to j loop unrolling done with double precision SIMD, i.e. the two different
+      * jnr indices corresponding to data put in the two positions in the SIMD register.
+      */
+     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+     int              jnrA,jnrB;
+     int              j_coord_offsetA,j_coord_offsetB;
+     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+     real             rcutoff_scalar;
+     real             *shiftvec,*fshift,*x,*f;
+     _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+     int              vdwioffset0;
+     _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+     int              vdwioffset1;
+     _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+     int              vdwioffset2;
+     _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+     int              vdwjidx0A,vdwjidx0B;
+     _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+     int              vdwjidx1A,vdwjidx1B;
+     _fjsp_v2r8       jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+     int              vdwjidx2A,vdwjidx2B;
+     _fjsp_v2r8       jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+     _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+     _fjsp_v2r8       dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
+     _fjsp_v2r8       dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
+     _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+     _fjsp_v2r8       dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+     _fjsp_v2r8       dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+     _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+     _fjsp_v2r8       dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+     _fjsp_v2r8       dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+     _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+     real             *charge;
+     _fjsp_v2r8       ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+     real             *ewtab;
+     _fjsp_v2r8       rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
+     real             rswitch_scalar,d_scalar;
+     _fjsp_v2r8       itab_tmp;
+     _fjsp_v2r8       dummy_mask,cutoff_mask;
+     _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+     _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+     union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+     x                = xx[0];
+     f                = ff[0];
+     nri              = nlist->nri;
+     iinr             = nlist->iinr;
+     jindex           = nlist->jindex;
+     jjnr             = nlist->jjnr;
+     shiftidx         = nlist->shift;
+     gid              = nlist->gid;
+     shiftvec         = fr->shift_vec[0];
+     fshift           = fr->fshift[0];
+     facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+     charge           = mdatoms->chargeA;
+     sh_ewald         = gmx_fjsp_set1_v2r8(fr->ic->sh_ewald);
+     ewtab            = fr->ic->tabq_coul_FDV0;
+     ewtabscale       = gmx_fjsp_set1_v2r8(fr->ic->tabq_scale);
+     ewtabhalfspace   = gmx_fjsp_set1_v2r8(0.5/fr->ic->tabq_scale);
+     /* Setup water-specific parameters */
+     inr              = nlist->iinr[0];
+     iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+0]));
+     iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+     iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+     jq0              = gmx_fjsp_set1_v2r8(charge[inr+0]);
+     jq1              = gmx_fjsp_set1_v2r8(charge[inr+1]);
+     jq2              = gmx_fjsp_set1_v2r8(charge[inr+2]);
+     qq00             = _fjsp_mul_v2r8(iq0,jq0);
+     qq01             = _fjsp_mul_v2r8(iq0,jq1);
+     qq02             = _fjsp_mul_v2r8(iq0,jq2);
+     qq10             = _fjsp_mul_v2r8(iq1,jq0);
+     qq11             = _fjsp_mul_v2r8(iq1,jq1);
+     qq12             = _fjsp_mul_v2r8(iq1,jq2);
+     qq20             = _fjsp_mul_v2r8(iq2,jq0);
+     qq21             = _fjsp_mul_v2r8(iq2,jq1);
+     qq22             = _fjsp_mul_v2r8(iq2,jq2);
+     /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+     rcutoff_scalar   = fr->rcoulomb;
+     rcutoff          = gmx_fjsp_set1_v2r8(rcutoff_scalar);
+     rcutoff2         = _fjsp_mul_v2r8(rcutoff,rcutoff);
+     rswitch_scalar   = fr->rcoulomb_switch;
+     rswitch          = gmx_fjsp_set1_v2r8(rswitch_scalar);
+     /* Setup switch parameters */
+     d_scalar         = rcutoff_scalar-rswitch_scalar;
+     d                = gmx_fjsp_set1_v2r8(d_scalar);
+     swV3             = gmx_fjsp_set1_v2r8(-10.0/(d_scalar*d_scalar*d_scalar));
+     swV4             = gmx_fjsp_set1_v2r8( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
+     swV5             = gmx_fjsp_set1_v2r8( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
+     swF2             = gmx_fjsp_set1_v2r8(-30.0/(d_scalar*d_scalar*d_scalar));
+     swF3             = gmx_fjsp_set1_v2r8( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
+     swF4             = gmx_fjsp_set1_v2r8(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
+     /* Avoid stupid compiler warnings */
+     jnrA = jnrB = 0;
+     j_coord_offsetA = 0;
+     j_coord_offsetB = 0;
+     outeriter        = 0;
+     inneriter        = 0;
+     /* Start outer loop over neighborlists */
+     for(iidx=0; iidx<nri; iidx++)
+     {
+         /* Load shift vector for this list */
+         i_shift_offset   = DIM*shiftidx[iidx];
+         /* Load limits for loop over neighbors */
+         j_index_start    = jindex[iidx];
+         j_index_end      = jindex[iidx+1];
+         /* Get outer coordinate index */
+         inr              = iinr[iidx];
+         i_coord_offset   = DIM*inr;
+         /* Load i particle coords and add shift vector */
+         gmx_fjsp_load_shift_and_3rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                  &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+         fix0             = _fjsp_setzero_v2r8();
+         fiy0             = _fjsp_setzero_v2r8();
+         fiz0             = _fjsp_setzero_v2r8();
+         fix1             = _fjsp_setzero_v2r8();
+         fiy1             = _fjsp_setzero_v2r8();
+         fiz1             = _fjsp_setzero_v2r8();
+         fix2             = _fjsp_setzero_v2r8();
+         fiy2             = _fjsp_setzero_v2r8();
+         fiz2             = _fjsp_setzero_v2r8();
+         /* Start inner kernel loop */
+         for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+         {
+             /* Get j neighbor index, and coordinate index */
+             jnrA             = jjnr[jidx];
+             jnrB             = jjnr[jidx+1];
+             j_coord_offsetA  = DIM*jnrA;
+             j_coord_offsetB  = DIM*jnrB;
+             /* load j atom coordinates */
+             gmx_fjsp_load_3rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+             /* Calculate displacement vector */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             dx01             = _fjsp_sub_v2r8(ix0,jx1);
+             dy01             = _fjsp_sub_v2r8(iy0,jy1);
+             dz01             = _fjsp_sub_v2r8(iz0,jz1);
+             dx02             = _fjsp_sub_v2r8(ix0,jx2);
+             dy02             = _fjsp_sub_v2r8(iy0,jy2);
+             dz02             = _fjsp_sub_v2r8(iz0,jz2);
+             dx10             = _fjsp_sub_v2r8(ix1,jx0);
+             dy10             = _fjsp_sub_v2r8(iy1,jy0);
+             dz10             = _fjsp_sub_v2r8(iz1,jz0);
+             dx11             = _fjsp_sub_v2r8(ix1,jx1);
+             dy11             = _fjsp_sub_v2r8(iy1,jy1);
+             dz11             = _fjsp_sub_v2r8(iz1,jz1);
+             dx12             = _fjsp_sub_v2r8(ix1,jx2);
+             dy12             = _fjsp_sub_v2r8(iy1,jy2);
+             dz12             = _fjsp_sub_v2r8(iz1,jz2);
+             dx20             = _fjsp_sub_v2r8(ix2,jx0);
+             dy20             = _fjsp_sub_v2r8(iy2,jy0);
+             dz20             = _fjsp_sub_v2r8(iz2,jz0);
+             dx21             = _fjsp_sub_v2r8(ix2,jx1);
+             dy21             = _fjsp_sub_v2r8(iy2,jy1);
+             dz21             = _fjsp_sub_v2r8(iz2,jz1);
+             dx22             = _fjsp_sub_v2r8(ix2,jx2);
+             dy22             = _fjsp_sub_v2r8(iy2,jy2);
+             dz22             = _fjsp_sub_v2r8(iz2,jz2);
+             /* Calculate squared distance and things based on it */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rsq01            = gmx_fjsp_calc_rsq_v2r8(dx01,dy01,dz01);
+             rsq02            = gmx_fjsp_calc_rsq_v2r8(dx02,dy02,dz02);
+             rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+             rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+             rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+             rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+             rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+             rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+             rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+             rinv01           = gmx_fjsp_invsqrt_v2r8(rsq01);
+             rinv02           = gmx_fjsp_invsqrt_v2r8(rsq02);
+             rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+             rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+             rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+             rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+             rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+             rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+             rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+             rinvsq01         = _fjsp_mul_v2r8(rinv01,rinv01);
+             rinvsq02         = _fjsp_mul_v2r8(rinv02,rinv02);
+             rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+             rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+             rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+             rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+             rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+             rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+             fjx0             = _fjsp_setzero_v2r8();
+             fjy0             = _fjsp_setzero_v2r8();
+             fjz0             = _fjsp_setzero_v2r8();
+             fjx1             = _fjsp_setzero_v2r8();
+             fjy1             = _fjsp_setzero_v2r8();
+             fjz1             = _fjsp_setzero_v2r8();
+             fjx2             = _fjsp_setzero_v2r8();
+             fjy2             = _fjsp_setzero_v2r8();
+             fjz2             = _fjsp_setzero_v2r8();
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+             {
+             r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r00,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq00,_fjsp_sub_v2r8(rinv00,velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,rinv00),_fjsp_sub_v2r8(rinvsq00,felec));
+             d                = _fjsp_sub_v2r8(r00,rswitch);
+             d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+             d2               = _fjsp_mul_v2r8(d,d);
+             sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+             dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+             /* Evaluate switch function */
+             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+             felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv00,_fjsp_mul_v2r8(velec,dsw)) );
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             
+             fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq01,rcutoff2))
+             {
+             r01              = _fjsp_mul_v2r8(rsq01,rinv01);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r01,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq01,_fjsp_sub_v2r8(rinv01,velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq01,rinv01),_fjsp_sub_v2r8(rinvsq01,felec));
+             d                = _fjsp_sub_v2r8(r01,rswitch);
+             d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+             d2               = _fjsp_mul_v2r8(d,d);
+             sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+             dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+             /* Evaluate switch function */
+             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+             felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv01,_fjsp_mul_v2r8(velec,dsw)) );
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq01,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx01,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy01,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz01,fscal,fiz0);
+             
+             fjx1             = _fjsp_madd_v2r8(dx01,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy01,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz01,fscal,fjz1);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq02,rcutoff2))
+             {
+             r02              = _fjsp_mul_v2r8(rsq02,rinv02);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r02,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq02,_fjsp_sub_v2r8(rinv02,velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq02,rinv02),_fjsp_sub_v2r8(rinvsq02,felec));
+             d                = _fjsp_sub_v2r8(r02,rswitch);
+             d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+             d2               = _fjsp_mul_v2r8(d,d);
+             sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+             dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+             /* Evaluate switch function */
+             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+             felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv02,_fjsp_mul_v2r8(velec,dsw)) );
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq02,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx02,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy02,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz02,fscal,fiz0);
+             
+             fjx2             = _fjsp_madd_v2r8(dx02,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy02,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz02,fscal,fjz2);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq10,rcutoff2))
+             {
+             r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r10,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq10,_fjsp_sub_v2r8(rinv10,velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,rinv10),_fjsp_sub_v2r8(rinvsq10,felec));
+             d                = _fjsp_sub_v2r8(r10,rswitch);
+             d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+             d2               = _fjsp_mul_v2r8(d,d);
+             sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+             dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+             /* Evaluate switch function */
+             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+             felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv10,_fjsp_mul_v2r8(velec,dsw)) );
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq10,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+             
+             fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq11,rcutoff2))
+             {
+             r11              = _fjsp_mul_v2r8(rsq11,rinv11);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r11,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq11,_fjsp_sub_v2r8(rinv11,velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq11,rinv11),_fjsp_sub_v2r8(rinvsq11,felec));
+             d                = _fjsp_sub_v2r8(r11,rswitch);
+             d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+             d2               = _fjsp_mul_v2r8(d,d);
+             sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+             dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+             /* Evaluate switch function */
+             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+             felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv11,_fjsp_mul_v2r8(velec,dsw)) );
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq11,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+             
+             fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq12,rcutoff2))
+             {
+             r12              = _fjsp_mul_v2r8(rsq12,rinv12);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r12,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq12,_fjsp_sub_v2r8(rinv12,velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq12,rinv12),_fjsp_sub_v2r8(rinvsq12,felec));
+             d                = _fjsp_sub_v2r8(r12,rswitch);
+             d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+             d2               = _fjsp_mul_v2r8(d,d);
+             sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+             dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+             /* Evaluate switch function */
+             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+             felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv12,_fjsp_mul_v2r8(velec,dsw)) );
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq12,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+             
+             fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq20,rcutoff2))
+             {
+             r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r20,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq20,_fjsp_sub_v2r8(rinv20,velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,rinv20),_fjsp_sub_v2r8(rinvsq20,felec));
+             d                = _fjsp_sub_v2r8(r20,rswitch);
+             d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+             d2               = _fjsp_mul_v2r8(d,d);
+             sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+             dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+             /* Evaluate switch function */
+             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+             felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv20,_fjsp_mul_v2r8(velec,dsw)) );
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq20,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+             
+             fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq21,rcutoff2))
+             {
+             r21              = _fjsp_mul_v2r8(rsq21,rinv21);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r21,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq21,_fjsp_sub_v2r8(rinv21,velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq21,rinv21),_fjsp_sub_v2r8(rinvsq21,felec));
+             d                = _fjsp_sub_v2r8(r21,rswitch);
+             d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+             d2               = _fjsp_mul_v2r8(d,d);
+             sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+             dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+             /* Evaluate switch function */
+             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+             felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv21,_fjsp_mul_v2r8(velec,dsw)) );
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq21,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+             
+             fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq22,rcutoff2))
+             {
+             r22              = _fjsp_mul_v2r8(rsq22,rinv22);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r22,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq22,_fjsp_sub_v2r8(rinv22,velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq22,rinv22),_fjsp_sub_v2r8(rinvsq22,felec));
+             d                = _fjsp_sub_v2r8(r22,rswitch);
+             d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+             d2               = _fjsp_mul_v2r8(d,d);
+             sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+             dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+             /* Evaluate switch function */
+             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+             felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv22,_fjsp_mul_v2r8(velec,dsw)) );
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq22,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+             
+             fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+             }
+             gmx_fjsp_decrement_3rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+             /* Inner loop uses 585 flops */
+         }
+         if(jidx<j_index_end)
+         {
+             jnrA             = jjnr[jidx];
+             j_coord_offsetA  = DIM*jnrA;
+             /* load j atom coordinates */
+             gmx_fjsp_load_3rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+             /* Calculate displacement vector */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             dx01             = _fjsp_sub_v2r8(ix0,jx1);
+             dy01             = _fjsp_sub_v2r8(iy0,jy1);
+             dz01             = _fjsp_sub_v2r8(iz0,jz1);
+             dx02             = _fjsp_sub_v2r8(ix0,jx2);
+             dy02             = _fjsp_sub_v2r8(iy0,jy2);
+             dz02             = _fjsp_sub_v2r8(iz0,jz2);
+             dx10             = _fjsp_sub_v2r8(ix1,jx0);
+             dy10             = _fjsp_sub_v2r8(iy1,jy0);
+             dz10             = _fjsp_sub_v2r8(iz1,jz0);
+             dx11             = _fjsp_sub_v2r8(ix1,jx1);
+             dy11             = _fjsp_sub_v2r8(iy1,jy1);
+             dz11             = _fjsp_sub_v2r8(iz1,jz1);
+             dx12             = _fjsp_sub_v2r8(ix1,jx2);
+             dy12             = _fjsp_sub_v2r8(iy1,jy2);
+             dz12             = _fjsp_sub_v2r8(iz1,jz2);
+             dx20             = _fjsp_sub_v2r8(ix2,jx0);
+             dy20             = _fjsp_sub_v2r8(iy2,jy0);
+             dz20             = _fjsp_sub_v2r8(iz2,jz0);
+             dx21             = _fjsp_sub_v2r8(ix2,jx1);
+             dy21             = _fjsp_sub_v2r8(iy2,jy1);
+             dz21             = _fjsp_sub_v2r8(iz2,jz1);
+             dx22             = _fjsp_sub_v2r8(ix2,jx2);
+             dy22             = _fjsp_sub_v2r8(iy2,jy2);
+             dz22             = _fjsp_sub_v2r8(iz2,jz2);
+             /* Calculate squared distance and things based on it */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rsq01            = gmx_fjsp_calc_rsq_v2r8(dx01,dy01,dz01);
+             rsq02            = gmx_fjsp_calc_rsq_v2r8(dx02,dy02,dz02);
+             rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+             rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+             rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+             rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+             rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+             rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+             rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+             rinv01           = gmx_fjsp_invsqrt_v2r8(rsq01);
+             rinv02           = gmx_fjsp_invsqrt_v2r8(rsq02);
+             rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+             rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+             rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+             rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+             rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+             rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+             rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+             rinvsq01         = _fjsp_mul_v2r8(rinv01,rinv01);
+             rinvsq02         = _fjsp_mul_v2r8(rinv02,rinv02);
+             rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+             rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+             rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+             rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+             rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+             rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+             fjx0             = _fjsp_setzero_v2r8();
+             fjy0             = _fjsp_setzero_v2r8();
+             fjz0             = _fjsp_setzero_v2r8();
+             fjx1             = _fjsp_setzero_v2r8();
+             fjy1             = _fjsp_setzero_v2r8();
+             fjz1             = _fjsp_setzero_v2r8();
+             fjx2             = _fjsp_setzero_v2r8();
+             fjy2             = _fjsp_setzero_v2r8();
+             fjz2             = _fjsp_setzero_v2r8();
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+             {
+             r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r00,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq00,_fjsp_sub_v2r8(rinv00,velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,rinv00),_fjsp_sub_v2r8(rinvsq00,felec));
+             d                = _fjsp_sub_v2r8(r00,rswitch);
+             d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+             d2               = _fjsp_mul_v2r8(d,d);
+             sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+             dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+             /* Evaluate switch function */
+             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+             felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv00,_fjsp_mul_v2r8(velec,dsw)) );
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             
+             fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq01,rcutoff2))
+             {
+             r01              = _fjsp_mul_v2r8(rsq01,rinv01);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r01,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq01,_fjsp_sub_v2r8(rinv01,velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq01,rinv01),_fjsp_sub_v2r8(rinvsq01,felec));
+             d                = _fjsp_sub_v2r8(r01,rswitch);
+             d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+             d2               = _fjsp_mul_v2r8(d,d);
+             sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+             dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+             /* Evaluate switch function */
+             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+             felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv01,_fjsp_mul_v2r8(velec,dsw)) );
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq01,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx01,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy01,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz01,fscal,fiz0);
+             
+             fjx1             = _fjsp_madd_v2r8(dx01,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy01,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz01,fscal,fjz1);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq02,rcutoff2))
+             {
+             r02              = _fjsp_mul_v2r8(rsq02,rinv02);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r02,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq02,_fjsp_sub_v2r8(rinv02,velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq02,rinv02),_fjsp_sub_v2r8(rinvsq02,felec));
+             d                = _fjsp_sub_v2r8(r02,rswitch);
+             d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+             d2               = _fjsp_mul_v2r8(d,d);
+             sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+             dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+             /* Evaluate switch function */
+             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+             felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv02,_fjsp_mul_v2r8(velec,dsw)) );
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq02,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx02,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy02,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz02,fscal,fiz0);
+             
+             fjx2             = _fjsp_madd_v2r8(dx02,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy02,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz02,fscal,fjz2);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq10,rcutoff2))
+             {
+             r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r10,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq10,_fjsp_sub_v2r8(rinv10,velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,rinv10),_fjsp_sub_v2r8(rinvsq10,felec));
+             d                = _fjsp_sub_v2r8(r10,rswitch);
+             d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+             d2               = _fjsp_mul_v2r8(d,d);
+             sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+             dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+             /* Evaluate switch function */
+             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+             felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv10,_fjsp_mul_v2r8(velec,dsw)) );
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq10,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+             
+             fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq11,rcutoff2))
+             {
+             r11              = _fjsp_mul_v2r8(rsq11,rinv11);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r11,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq11,_fjsp_sub_v2r8(rinv11,velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq11,rinv11),_fjsp_sub_v2r8(rinvsq11,felec));
+             d                = _fjsp_sub_v2r8(r11,rswitch);
+             d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+             d2               = _fjsp_mul_v2r8(d,d);
+             sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+             dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+             /* Evaluate switch function */
+             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+             felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv11,_fjsp_mul_v2r8(velec,dsw)) );
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq11,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+             
+             fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq12,rcutoff2))
+             {
+             r12              = _fjsp_mul_v2r8(rsq12,rinv12);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r12,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq12,_fjsp_sub_v2r8(rinv12,velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq12,rinv12),_fjsp_sub_v2r8(rinvsq12,felec));
+             d                = _fjsp_sub_v2r8(r12,rswitch);
+             d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+             d2               = _fjsp_mul_v2r8(d,d);
+             sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+             dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+             /* Evaluate switch function */
+             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+             felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv12,_fjsp_mul_v2r8(velec,dsw)) );
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq12,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+             
+             fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq20,rcutoff2))
+             {
+             r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r20,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq20,_fjsp_sub_v2r8(rinv20,velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,rinv20),_fjsp_sub_v2r8(rinvsq20,felec));
+             d                = _fjsp_sub_v2r8(r20,rswitch);
+             d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+             d2               = _fjsp_mul_v2r8(d,d);
+             sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+             dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+             /* Evaluate switch function */
+             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+             felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv20,_fjsp_mul_v2r8(velec,dsw)) );
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq20,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+             
+             fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq21,rcutoff2))
+             {
+             r21              = _fjsp_mul_v2r8(rsq21,rinv21);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r21,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq21,_fjsp_sub_v2r8(rinv21,velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq21,rinv21),_fjsp_sub_v2r8(rinvsq21,felec));
+             d                = _fjsp_sub_v2r8(r21,rswitch);
+             d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+             d2               = _fjsp_mul_v2r8(d,d);
+             sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+             dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+             /* Evaluate switch function */
+             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+             felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv21,_fjsp_mul_v2r8(velec,dsw)) );
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq21,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+             
+             fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq22,rcutoff2))
+             {
+             r22              = _fjsp_mul_v2r8(rsq22,rinv22);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r22,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq22,_fjsp_sub_v2r8(rinv22,velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq22,rinv22),_fjsp_sub_v2r8(rinvsq22,felec));
+             d                = _fjsp_sub_v2r8(r22,rswitch);
+             d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+             d2               = _fjsp_mul_v2r8(d,d);
+             sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+             dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+             /* Evaluate switch function */
+             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+             felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv22,_fjsp_mul_v2r8(velec,dsw)) );
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq22,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+             
+             fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+             }
+             gmx_fjsp_decrement_3rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+             /* Inner loop uses 585 flops */
+         }
+         /* End of innermost loop */
+         gmx_fjsp_update_iforce_3atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                               f+i_coord_offset,fshift+i_shift_offset);
+         /* Increment number of inner iterations */
+         inneriter                  += j_index_end - j_index_start;
+         /* Outer loop uses 18 flops */
+     }
+     /* Increment number of outer iterations */
+     outeriter        += nri;
+     /* Update outer/inner flops */
+     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3W3_F,outeriter*18 + inneriter*585);
+ }
index 0000000000000000000000000000000000000000,ff1300a8d96526cfec5ee13be949405c01368e5a..ff1300a8d96526cfec5ee13be949405c01368e5a
mode 000000,100644..100644
--- /dev/null
@@@ -1,0 -1,1288 +1,1288 @@@
+ /*
+  * This file is part of the GROMACS molecular simulation package.
+  *
+  * Copyright (c) 2012, by the GROMACS development team, led by
+  * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+  * others, as listed in the AUTHORS file in the top-level source
+  * directory and at http://www.gromacs.org.
+  *
+  * GROMACS is free software; you can redistribute it and/or
+  * modify it under the terms of the GNU Lesser General Public License
+  * as published by the Free Software Foundation; either version 2.1
+  * of the License, or (at your option) any later version.
+  *
+  * GROMACS is distributed in the hope that it will be useful,
+  * but WITHOUT ANY WARRANTY; without even the implied warranty of
+  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  * Lesser General Public License for more details.
+  *
+  * You should have received a copy of the GNU Lesser General Public
+  * License along with GROMACS; if not, see
+  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+  *
+  * If you want to redistribute modifications to GROMACS, please
+  * consider that scientific software is very special. Version
+  * control is crucial - bugs must be traceable. We will be happy to
+  * consider code for inclusion in the official distribution, but
+  * derived work must not be called official GROMACS. Details are found
+  * in the README & COPYING files - if they are missing, get the
+  * official version at http://www.gromacs.org.
+  *
+  * To help us fund GROMACS development, we humbly ask that you cite
+  * the research papers on the package. Check out http://www.gromacs.org.
+  */
+ /*
+  * Note: this file was generated by the GROMACS sparc64_hpc_ace_double kernel generator.
+  */
+ #ifdef HAVE_CONFIG_H
+ #include <config.h>
+ #endif
+ #include <math.h>
+ #include "../nb_kernel.h"
+ #include "types/simple.h"
+ #include "vec.h"
+ #include "nrnb.h"
+ #include "kernelutil_sparc64_hpc_ace_double.h"
+ /*
+  * Gromacs nonbonded kernel:   nb_kernel_ElecEwSw_VdwNone_GeomW4P1_VF_sparc64_hpc_ace_double
+  * Electrostatics interaction: Ewald
+  * VdW interaction:            None
+  * Geometry:                   Water4-Particle
+  * Calculate force/pot:        PotentialAndForce
+  */
+ void
+ nb_kernel_ElecEwSw_VdwNone_GeomW4P1_VF_sparc64_hpc_ace_double
+                     (t_nblist * gmx_restrict                nlist,
+                      rvec * gmx_restrict                    xx,
+                      rvec * gmx_restrict                    ff,
+                      t_forcerec * gmx_restrict              fr,
+                      t_mdatoms * gmx_restrict               mdatoms,
+                      nb_kernel_data_t * gmx_restrict        kernel_data,
+                      t_nrnb * gmx_restrict                  nrnb)
+ {
+     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+      * just 0 for non-waters.
+      * Suffixes A,B refer to j loop unrolling done with double precision SIMD, i.e. the two different
+      * jnr indices corresponding to the data put in the two positions of the SIMD register.
+      */
+     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+     int              jnrA,jnrB;
+     int              j_coord_offsetA,j_coord_offsetB;
+     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+     real             rcutoff_scalar;
+     real             *shiftvec,*fshift,*x,*f;
+     _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+     int              vdwioffset1;
+     _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+     int              vdwioffset2;
+     _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+     int              vdwioffset3;
+     _fjsp_v2r8       ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+     int              vdwjidx0A,vdwjidx0B;
+     _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+     _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+     _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+     _fjsp_v2r8       dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30;
+     _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+     real             *charge;
+     _fjsp_v2r8       ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+     real             *ewtab;
+     _fjsp_v2r8       rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
+     real             rswitch_scalar,d_scalar;
+     _fjsp_v2r8       itab_tmp;
+     _fjsp_v2r8       dummy_mask,cutoff_mask;
+     _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+     _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+     union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+     x                = xx[0];
+     f                = ff[0];
+     nri              = nlist->nri;
+     iinr             = nlist->iinr;
+     jindex           = nlist->jindex;
+     jjnr             = nlist->jjnr;
+     shiftidx         = nlist->shift;
+     gid              = nlist->gid;
+     shiftvec         = fr->shift_vec[0];
+     fshift           = fr->fshift[0];
+     facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+     charge           = mdatoms->chargeA;
+     sh_ewald         = gmx_fjsp_set1_v2r8(fr->ic->sh_ewald);
+     ewtab            = fr->ic->tabq_coul_FDV0;
+     ewtabscale       = gmx_fjsp_set1_v2r8(fr->ic->tabq_scale);
+     ewtabhalfspace   = gmx_fjsp_set1_v2r8(0.5/fr->ic->tabq_scale);
+     /* Setup water-specific parameters */
+     inr              = nlist->iinr[0];
+     iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+     iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+     iq3              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+3]));
+     /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+     rcutoff_scalar   = fr->rcoulomb;
+     rcutoff          = gmx_fjsp_set1_v2r8(rcutoff_scalar);
+     rcutoff2         = _fjsp_mul_v2r8(rcutoff,rcutoff);
+     rswitch_scalar   = fr->rcoulomb_switch;
+     rswitch          = gmx_fjsp_set1_v2r8(rswitch_scalar);
+     /* Setup switch parameters */
+     d_scalar         = rcutoff_scalar-rswitch_scalar;
+     d                = gmx_fjsp_set1_v2r8(d_scalar);
+     swV3             = gmx_fjsp_set1_v2r8(-10.0/(d_scalar*d_scalar*d_scalar));
+     swV4             = gmx_fjsp_set1_v2r8( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
+     swV5             = gmx_fjsp_set1_v2r8( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
+     swF2             = gmx_fjsp_set1_v2r8(-30.0/(d_scalar*d_scalar*d_scalar));
+     swF3             = gmx_fjsp_set1_v2r8( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
+     swF4             = gmx_fjsp_set1_v2r8(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
+     /* Avoid stupid compiler warnings */
+     jnrA = jnrB = 0;
+     j_coord_offsetA = 0;
+     j_coord_offsetB = 0;
+     outeriter        = 0;
+     inneriter        = 0;
+     /* Start outer loop over neighborlists */
+     for(iidx=0; iidx<nri; iidx++)
+     {
+         /* Load shift vector for this list */
+         i_shift_offset   = DIM*shiftidx[iidx];
+         /* Load limits for loop over neighbors */
+         j_index_start    = jindex[iidx];
+         j_index_end      = jindex[iidx+1];
+         /* Get outer coordinate index */
+         inr              = iinr[iidx];
+         i_coord_offset   = DIM*inr;
+         /* Load i particle coords and add shift vector */
+         gmx_fjsp_load_shift_and_3rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset+DIM,
+                                                  &ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+         fix1             = _fjsp_setzero_v2r8();
+         fiy1             = _fjsp_setzero_v2r8();
+         fiz1             = _fjsp_setzero_v2r8();
+         fix2             = _fjsp_setzero_v2r8();
+         fiy2             = _fjsp_setzero_v2r8();
+         fiz2             = _fjsp_setzero_v2r8();
+         fix3             = _fjsp_setzero_v2r8();
+         fiy3             = _fjsp_setzero_v2r8();
+         fiz3             = _fjsp_setzero_v2r8();
+         /* Reset potential sums */
+         velecsum         = _fjsp_setzero_v2r8();
+         /* Start inner kernel loop */
+         for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+         {
+             /* Get j neighbor index, and coordinate index */
+             jnrA             = jjnr[jidx];
+             jnrB             = jjnr[jidx+1];
+             j_coord_offsetA  = DIM*jnrA;
+             j_coord_offsetB  = DIM*jnrB;
+             /* load j atom coordinates */
+             gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                               &jx0,&jy0,&jz0);
+             /* Calculate displacement vector */
+             dx10             = _fjsp_sub_v2r8(ix1,jx0);
+             dy10             = _fjsp_sub_v2r8(iy1,jy0);
+             dz10             = _fjsp_sub_v2r8(iz1,jz0);
+             dx20             = _fjsp_sub_v2r8(ix2,jx0);
+             dy20             = _fjsp_sub_v2r8(iy2,jy0);
+             dz20             = _fjsp_sub_v2r8(iz2,jz0);
+             dx30             = _fjsp_sub_v2r8(ix3,jx0);
+             dy30             = _fjsp_sub_v2r8(iy3,jy0);
+             dz30             = _fjsp_sub_v2r8(iz3,jz0);
+             /* Calculate squared distance and things based on it */
+             rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+             rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+             rsq30            = gmx_fjsp_calc_rsq_v2r8(dx30,dy30,dz30);
+             rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+             rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+             rinv30           = gmx_fjsp_invsqrt_v2r8(rsq30);
+             rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+             rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+             rinvsq30         = _fjsp_mul_v2r8(rinv30,rinv30);
+             /* Load parameters for j particles */
+             jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+             fjx0             = _fjsp_setzero_v2r8();
+             fjy0             = _fjsp_setzero_v2r8();
+             fjz0             = _fjsp_setzero_v2r8();
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq10,rcutoff2))
+             {
+             r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+             /* Compute parameters for interactions between i and j atoms */
+             qq10             = _fjsp_mul_v2r8(iq1,jq0);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r10,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq10,_fjsp_sub_v2r8(rinv10,velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,rinv10),_fjsp_sub_v2r8(rinvsq10,felec));
+             d                = _fjsp_sub_v2r8(r10,rswitch);
+             d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+             d2               = _fjsp_mul_v2r8(d,d);
+             sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+             dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+             /* Evaluate switch function */
+             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+             felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv10,_fjsp_mul_v2r8(velec,dsw)) );
+             velec            = _fjsp_mul_v2r8(velec,sw);
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq10,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+             
+             fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq20,rcutoff2))
+             {
+             r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+             /* Compute parameters for interactions between i and j atoms */
+             qq20             = _fjsp_mul_v2r8(iq2,jq0);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r20,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq20,_fjsp_sub_v2r8(rinv20,velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,rinv20),_fjsp_sub_v2r8(rinvsq20,felec));
+             d                = _fjsp_sub_v2r8(r20,rswitch);
+             d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+             d2               = _fjsp_mul_v2r8(d,d);
+             sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+             dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+             /* Evaluate switch function */
+             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+             felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv20,_fjsp_mul_v2r8(velec,dsw)) );
+             velec            = _fjsp_mul_v2r8(velec,sw);
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq20,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+             
+             fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq30,rcutoff2))
+             {
+             r30              = _fjsp_mul_v2r8(rsq30,rinv30);
+             /* Compute parameters for interactions between i and j atoms */
+             qq30             = _fjsp_mul_v2r8(iq3,jq0);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r30,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq30,_fjsp_sub_v2r8(rinv30,velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq30,rinv30),_fjsp_sub_v2r8(rinvsq30,felec));
+             d                = _fjsp_sub_v2r8(r30,rswitch);
+             d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+             d2               = _fjsp_mul_v2r8(d,d);
+             sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+             dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+             /* Evaluate switch function */
+             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+             felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv30,_fjsp_mul_v2r8(velec,dsw)) );
+             velec            = _fjsp_mul_v2r8(velec,sw);
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq30,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix3             = _fjsp_madd_v2r8(dx30,fscal,fix3);
+             fiy3             = _fjsp_madd_v2r8(dy30,fscal,fiy3);
+             fiz3             = _fjsp_madd_v2r8(dz30,fscal,fiz3);
+             
+             fjx0             = _fjsp_madd_v2r8(dx30,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy30,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz30,fscal,fjz0);
+             }
+             gmx_fjsp_decrement_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0);
+             /* Inner loop uses 207 flops */
+         }
+         if(jidx<j_index_end)
+         {
+             jnrA             = jjnr[jidx];
+             j_coord_offsetA  = DIM*jnrA;
+             /* load j atom coordinates */
+             gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                               &jx0,&jy0,&jz0);
+             /* Calculate displacement vector */
+             dx10             = _fjsp_sub_v2r8(ix1,jx0);
+             dy10             = _fjsp_sub_v2r8(iy1,jy0);
+             dz10             = _fjsp_sub_v2r8(iz1,jz0);
+             dx20             = _fjsp_sub_v2r8(ix2,jx0);
+             dy20             = _fjsp_sub_v2r8(iy2,jy0);
+             dz20             = _fjsp_sub_v2r8(iz2,jz0);
+             dx30             = _fjsp_sub_v2r8(ix3,jx0);
+             dy30             = _fjsp_sub_v2r8(iy3,jy0);
+             dz30             = _fjsp_sub_v2r8(iz3,jz0);
+             /* Calculate squared distance and things based on it */
+             rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+             rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+             rsq30            = gmx_fjsp_calc_rsq_v2r8(dx30,dy30,dz30);
+             rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+             rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+             rinv30           = gmx_fjsp_invsqrt_v2r8(rsq30);
+             rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+             rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+             rinvsq30         = _fjsp_mul_v2r8(rinv30,rinv30);
+             /* Load parameters for j particles */
+             jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
+             fjx0             = _fjsp_setzero_v2r8();
+             fjy0             = _fjsp_setzero_v2r8();
+             fjz0             = _fjsp_setzero_v2r8();
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq10,rcutoff2))
+             {
+             r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+             /* Compute parameters for interactions between i and j atoms */
+             qq10             = _fjsp_mul_v2r8(iq1,jq0);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r10,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq10,_fjsp_sub_v2r8(rinv10,velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,rinv10),_fjsp_sub_v2r8(rinvsq10,felec));
+             d                = _fjsp_sub_v2r8(r10,rswitch);
+             d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+             d2               = _fjsp_mul_v2r8(d,d);
+             sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+             dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+             /* Evaluate switch function */
+             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+             felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv10,_fjsp_mul_v2r8(velec,dsw)) );
+             velec            = _fjsp_mul_v2r8(velec,sw);
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq10,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+             
+             fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq20,rcutoff2))
+             {
+             r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+             /* Compute parameters for interactions between i and j atoms */
+             qq20             = _fjsp_mul_v2r8(iq2,jq0);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r20,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq20,_fjsp_sub_v2r8(rinv20,velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,rinv20),_fjsp_sub_v2r8(rinvsq20,felec));
+             d                = _fjsp_sub_v2r8(r20,rswitch);
+             d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+             d2               = _fjsp_mul_v2r8(d,d);
+             sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+             dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+             /* Evaluate switch function */
+             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+             felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv20,_fjsp_mul_v2r8(velec,dsw)) );
+             velec            = _fjsp_mul_v2r8(velec,sw);
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq20,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+             
+             fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq30,rcutoff2))
+             {
+             r30              = _fjsp_mul_v2r8(rsq30,rinv30);
+             /* Compute parameters for interactions between i and j atoms */
+             qq30             = _fjsp_mul_v2r8(iq3,jq0);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r30,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq30,_fjsp_sub_v2r8(rinv30,velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq30,rinv30),_fjsp_sub_v2r8(rinvsq30,felec));
+             d                = _fjsp_sub_v2r8(r30,rswitch);
+             d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+             d2               = _fjsp_mul_v2r8(d,d);
+             sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+             dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+             /* Evaluate switch function */
+             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+             felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv30,_fjsp_mul_v2r8(velec,dsw)) );
+             velec            = _fjsp_mul_v2r8(velec,sw);
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq30,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix3             = _fjsp_madd_v2r8(dx30,fscal,fix3);
+             fiy3             = _fjsp_madd_v2r8(dy30,fscal,fiy3);
+             fiz3             = _fjsp_madd_v2r8(dz30,fscal,fiz3);
+             
+             fjx0             = _fjsp_madd_v2r8(dx30,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy30,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz30,fscal,fjz0);
+             }
+             gmx_fjsp_decrement_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0);
+             /* Inner loop uses 207 flops */
+         }
+         /* End of innermost loop */
+         gmx_fjsp_update_iforce_3atom_swizzle_v2r8(fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                               f+i_coord_offset+DIM,fshift+i_shift_offset);
+         ggid                        = gid[iidx];
+         /* Update potential energies */
+         gmx_fjsp_update_1pot_v2r8(velecsum,kernel_data->energygrp_elec+ggid);
+         /* Increment number of inner iterations */
+         inneriter                  += j_index_end - j_index_start;
+         /* Outer loop uses 19 flops */
+     }
+     /* Increment number of outer iterations */
+     outeriter        += nri;
+     /* Update outer/inner flops */
+     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4_VF,outeriter*19 + inneriter*207);
+ }
+ /*
+  * Gromacs nonbonded kernel:   nb_kernel_ElecEwSw_VdwNone_GeomW4P1_F_sparc64_hpc_ace_double
+  * Electrostatics interaction: Ewald (with potential switch)
+  * VdW interaction:            None
+  * Geometry:                   Water4-Particle
+  * Calculate force/pot:        Force
+  */
+ void
+ nb_kernel_ElecEwSw_VdwNone_GeomW4P1_F_sparc64_hpc_ace_double
+                     (t_nblist * gmx_restrict                nlist,
+                      rvec * gmx_restrict                    xx,
+                      rvec * gmx_restrict                    ff,
+                      t_forcerec * gmx_restrict              fr,
+                      t_mdatoms * gmx_restrict               mdatoms,
+                      nb_kernel_data_t * gmx_restrict        kernel_data,
+                      t_nrnb * gmx_restrict                  nrnb)
+ {
+     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+      * just 0 for non-waters.
+      * Suffixes A,B refer to j loop unrolling done with double precision SIMD, i.e. the two different
+      * jnr indices corresponding to data put in the two positions in the SIMD register.
+      */
+     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+     int              jnrA,jnrB;
+     int              j_coord_offsetA,j_coord_offsetB;
+     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+     real             rcutoff_scalar;
+     real             *shiftvec,*fshift,*x,*f;
+     _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+     int              vdwioffset1;
+     _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+     int              vdwioffset2;
+     _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+     int              vdwioffset3;
+     _fjsp_v2r8       ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+     int              vdwjidx0A,vdwjidx0B;
+     _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+     _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+     _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+     _fjsp_v2r8       dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30;
+     _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+     real             *charge;
+     _fjsp_v2r8       ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+     real             *ewtab;
+     _fjsp_v2r8       rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
+     real             rswitch_scalar,d_scalar;
+     _fjsp_v2r8       itab_tmp;
+     _fjsp_v2r8       dummy_mask,cutoff_mask;
+     _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+     _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+     union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+     x                = xx[0];
+     f                = ff[0];
+     nri              = nlist->nri;
+     iinr             = nlist->iinr;
+     jindex           = nlist->jindex;
+     jjnr             = nlist->jjnr;
+     shiftidx         = nlist->shift;
+     gid              = nlist->gid;
+     shiftvec         = fr->shift_vec[0];
+     fshift           = fr->fshift[0];
+     facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+     charge           = mdatoms->chargeA;
+     sh_ewald         = gmx_fjsp_set1_v2r8(fr->ic->sh_ewald);
+     ewtab            = fr->ic->tabq_coul_FDV0;
+     ewtabscale       = gmx_fjsp_set1_v2r8(fr->ic->tabq_scale);
+     ewtabhalfspace   = gmx_fjsp_set1_v2r8(0.5/fr->ic->tabq_scale);
+     /* Setup water-specific parameters */
+     inr              = nlist->iinr[0];
+     iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+     iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+     iq3              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+3]));
+     /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+     rcutoff_scalar   = fr->rcoulomb;
+     rcutoff          = gmx_fjsp_set1_v2r8(rcutoff_scalar);
+     rcutoff2         = _fjsp_mul_v2r8(rcutoff,rcutoff);
+     rswitch_scalar   = fr->rcoulomb_switch;
+     rswitch          = gmx_fjsp_set1_v2r8(rswitch_scalar);
+     /* Setup switch parameters */
+     d_scalar         = rcutoff_scalar-rswitch_scalar;
+     d                = gmx_fjsp_set1_v2r8(d_scalar);
+     swV3             = gmx_fjsp_set1_v2r8(-10.0/(d_scalar*d_scalar*d_scalar));
+     swV4             = gmx_fjsp_set1_v2r8( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
+     swV5             = gmx_fjsp_set1_v2r8( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
+     swF2             = gmx_fjsp_set1_v2r8(-30.0/(d_scalar*d_scalar*d_scalar));
+     swF3             = gmx_fjsp_set1_v2r8( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
+     swF4             = gmx_fjsp_set1_v2r8(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
+     /* Avoid stupid compiler warnings */
+     jnrA = jnrB = 0;
+     j_coord_offsetA = 0;
+     j_coord_offsetB = 0;
+     outeriter        = 0;
+     inneriter        = 0;
+     /* Start outer loop over neighborlists */
+     for(iidx=0; iidx<nri; iidx++)
+     {
+         /* Load shift vector for this list */
+         i_shift_offset   = DIM*shiftidx[iidx];
+         /* Load limits for loop over neighbors */
+         j_index_start    = jindex[iidx];
+         j_index_end      = jindex[iidx+1];
+         /* Get outer coordinate index */
+         inr              = iinr[iidx];
+         i_coord_offset   = DIM*inr;
+         /* Load i particle coords and add shift vector */
+         gmx_fjsp_load_shift_and_3rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset+DIM,
+                                                  &ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+         fix1             = _fjsp_setzero_v2r8();
+         fiy1             = _fjsp_setzero_v2r8();
+         fiz1             = _fjsp_setzero_v2r8();
+         fix2             = _fjsp_setzero_v2r8();
+         fiy2             = _fjsp_setzero_v2r8();
+         fiz2             = _fjsp_setzero_v2r8();
+         fix3             = _fjsp_setzero_v2r8();
+         fiy3             = _fjsp_setzero_v2r8();
+         fiz3             = _fjsp_setzero_v2r8();
+         /* Start inner kernel loop */
+         for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+         {
+             /* Get j neighbor index, and coordinate index */
+             jnrA             = jjnr[jidx];
+             jnrB             = jjnr[jidx+1];
+             j_coord_offsetA  = DIM*jnrA;
+             j_coord_offsetB  = DIM*jnrB;
+             /* load j atom coordinates */
+             gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                               &jx0,&jy0,&jz0);
+             /* Calculate displacement vector */
+             dx10             = _fjsp_sub_v2r8(ix1,jx0);
+             dy10             = _fjsp_sub_v2r8(iy1,jy0);
+             dz10             = _fjsp_sub_v2r8(iz1,jz0);
+             dx20             = _fjsp_sub_v2r8(ix2,jx0);
+             dy20             = _fjsp_sub_v2r8(iy2,jy0);
+             dz20             = _fjsp_sub_v2r8(iz2,jz0);
+             dx30             = _fjsp_sub_v2r8(ix3,jx0);
+             dy30             = _fjsp_sub_v2r8(iy3,jy0);
+             dz30             = _fjsp_sub_v2r8(iz3,jz0);
+             /* Calculate squared distance and things based on it */
+             rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+             rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+             rsq30            = gmx_fjsp_calc_rsq_v2r8(dx30,dy30,dz30);
+             rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+             rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+             rinv30           = gmx_fjsp_invsqrt_v2r8(rsq30);
+             rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+             rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+             rinvsq30         = _fjsp_mul_v2r8(rinv30,rinv30);
+             /* Load parameters for j particles */
+             jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+             fjx0             = _fjsp_setzero_v2r8();
+             fjy0             = _fjsp_setzero_v2r8();
+             fjz0             = _fjsp_setzero_v2r8();
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq10,rcutoff2))
+             {
+             r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+             /* Compute parameters for interactions between i and j atoms */
+             qq10             = _fjsp_mul_v2r8(iq1,jq0);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r10,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq10,_fjsp_sub_v2r8(rinv10,velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,rinv10),_fjsp_sub_v2r8(rinvsq10,felec));
+             d                = _fjsp_sub_v2r8(r10,rswitch);
+             d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+             d2               = _fjsp_mul_v2r8(d,d);
+             sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+             dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+             /* Evaluate switch function */
+             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+             felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv10,_fjsp_mul_v2r8(velec,dsw)) );
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq10,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+             
+             fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq20,rcutoff2))
+             {
+             r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+             /* Compute parameters for interactions between i and j atoms */
+             qq20             = _fjsp_mul_v2r8(iq2,jq0);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r20,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq20,_fjsp_sub_v2r8(rinv20,velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,rinv20),_fjsp_sub_v2r8(rinvsq20,felec));
+             d                = _fjsp_sub_v2r8(r20,rswitch);
+             d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+             d2               = _fjsp_mul_v2r8(d,d);
+             sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+             dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+             /* Evaluate switch function */
+             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+             felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv20,_fjsp_mul_v2r8(velec,dsw)) );
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq20,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+             
+             fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq30,rcutoff2))
+             {
+             r30              = _fjsp_mul_v2r8(rsq30,rinv30);
+             /* Compute parameters for interactions between i and j atoms */
+             qq30             = _fjsp_mul_v2r8(iq3,jq0);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r30,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq30,_fjsp_sub_v2r8(rinv30,velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq30,rinv30),_fjsp_sub_v2r8(rinvsq30,felec));
+             d                = _fjsp_sub_v2r8(r30,rswitch);
+             d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+             d2               = _fjsp_mul_v2r8(d,d);
+             sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+             dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+             /* Evaluate switch function */
+             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+             felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv30,_fjsp_mul_v2r8(velec,dsw)) );
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq30,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix3             = _fjsp_madd_v2r8(dx30,fscal,fix3);
+             fiy3             = _fjsp_madd_v2r8(dy30,fscal,fiy3);
+             fiz3             = _fjsp_madd_v2r8(dz30,fscal,fiz3);
+             
+             fjx0             = _fjsp_madd_v2r8(dx30,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy30,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz30,fscal,fjz0);
+             }
+             gmx_fjsp_decrement_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0);
+             /* Inner loop uses 198 flops */
+         }
+         if(jidx<j_index_end)
+         {
+             jnrA             = jjnr[jidx];
+             j_coord_offsetA  = DIM*jnrA;
+             /* load j atom coordinates */
+             gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                               &jx0,&jy0,&jz0);
+             /* Calculate displacement vector */
+             dx10             = _fjsp_sub_v2r8(ix1,jx0);
+             dy10             = _fjsp_sub_v2r8(iy1,jy0);
+             dz10             = _fjsp_sub_v2r8(iz1,jz0);
+             dx20             = _fjsp_sub_v2r8(ix2,jx0);
+             dy20             = _fjsp_sub_v2r8(iy2,jy0);
+             dz20             = _fjsp_sub_v2r8(iz2,jz0);
+             dx30             = _fjsp_sub_v2r8(ix3,jx0);
+             dy30             = _fjsp_sub_v2r8(iy3,jy0);
+             dz30             = _fjsp_sub_v2r8(iz3,jz0);
+             /* Calculate squared distance and things based on it */
+             rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+             rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+             rsq30            = gmx_fjsp_calc_rsq_v2r8(dx30,dy30,dz30);
+             rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+             rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+             rinv30           = gmx_fjsp_invsqrt_v2r8(rsq30);
+             rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+             rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+             rinvsq30         = _fjsp_mul_v2r8(rinv30,rinv30);
+             /* Load parameters for j particles */
+             jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
+             fjx0             = _fjsp_setzero_v2r8();
+             fjy0             = _fjsp_setzero_v2r8();
+             fjz0             = _fjsp_setzero_v2r8();
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq10,rcutoff2))
+             {
+             r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+             /* Compute parameters for interactions between i and j atoms */
+             qq10             = _fjsp_mul_v2r8(iq1,jq0);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r10,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq10,_fjsp_sub_v2r8(rinv10,velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,rinv10),_fjsp_sub_v2r8(rinvsq10,felec));
+             d                = _fjsp_sub_v2r8(r10,rswitch);
+             d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+             d2               = _fjsp_mul_v2r8(d,d);
+             sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+             dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+             /* Evaluate switch function */
+             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+             felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv10,_fjsp_mul_v2r8(velec,dsw)) );
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq10,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+             
+             fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq20,rcutoff2))
+             {
+             r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+             /* Compute parameters for interactions between i and j atoms */
+             qq20             = _fjsp_mul_v2r8(iq2,jq0);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r20,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq20,_fjsp_sub_v2r8(rinv20,velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,rinv20),_fjsp_sub_v2r8(rinvsq20,felec));
+             d                = _fjsp_sub_v2r8(r20,rswitch);
+             d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+             d2               = _fjsp_mul_v2r8(d,d);
+             sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+             dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+             /* Evaluate switch function */
+             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+             felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv20,_fjsp_mul_v2r8(velec,dsw)) );
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq20,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+             
+             fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq30,rcutoff2))
+             {
+             r30              = _fjsp_mul_v2r8(rsq30,rinv30);
+             /* Compute parameters for interactions between i and j atoms */
+             qq30             = _fjsp_mul_v2r8(iq3,jq0);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r30,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq30,_fjsp_sub_v2r8(rinv30,velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq30,rinv30),_fjsp_sub_v2r8(rinvsq30,felec));
+             d                = _fjsp_sub_v2r8(r30,rswitch);
+             d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+             d2               = _fjsp_mul_v2r8(d,d);
+             sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+             dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+             /* Evaluate switch function */
+             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+             felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv30,_fjsp_mul_v2r8(velec,dsw)) );
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq30,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix3             = _fjsp_madd_v2r8(dx30,fscal,fix3);
+             fiy3             = _fjsp_madd_v2r8(dy30,fscal,fiy3);
+             fiz3             = _fjsp_madd_v2r8(dz30,fscal,fiz3);
+             
+             fjx0             = _fjsp_madd_v2r8(dx30,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy30,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz30,fscal,fjz0);
+             }
+             gmx_fjsp_decrement_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0);
+             /* Inner loop uses 198 flops */
+         }
+         /* End of innermost loop */
+         gmx_fjsp_update_iforce_3atom_swizzle_v2r8(fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                               f+i_coord_offset+DIM,fshift+i_shift_offset);
+         /* Increment number of inner iterations */
+         inneriter                  += j_index_end - j_index_start;
+         /* Outer loop uses 18 flops */
+     }
+     /* Increment number of outer iterations */
+     outeriter        += nri;
+     /* Update outer/inner flops */
+     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4_F,outeriter*18 + inneriter*198);
+ }
index 0000000000000000000000000000000000000000,66de2dde34a0358d7facbf2b0ffc5b21925452b2..66de2dde34a0358d7facbf2b0ffc5b21925452b2
mode 000000,100644..100644
--- /dev/null
@@@ -1,0 -1,2864 +1,2864 @@@
+ /*
+  * This file is part of the GROMACS molecular simulation package.
+  *
+  * Copyright (c) 2012, by the GROMACS development team, led by
+  * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+  * others, as listed in the AUTHORS file in the top-level source
+  * directory and at http://www.gromacs.org.
+  *
+  * GROMACS is free software; you can redistribute it and/or
+  * modify it under the terms of the GNU Lesser General Public License
+  * as published by the Free Software Foundation; either version 2.1
+  * of the License, or (at your option) any later version.
+  *
+  * GROMACS is distributed in the hope that it will be useful,
+  * but WITHOUT ANY WARRANTY; without even the implied warranty of
+  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  * Lesser General Public License for more details.
+  *
+  * You should have received a copy of the GNU Lesser General Public
+  * License along with GROMACS; if not, see
+  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+  *
+  * If you want to redistribute modifications to GROMACS, please
+  * consider that scientific software is very special. Version
+  * control is crucial - bugs must be traceable. We will be happy to
+  * consider code for inclusion in the official distribution, but
+  * derived work must not be called official GROMACS. Details are found
+  * in the README & COPYING files - if they are missing, get the
+  * official version at http://www.gromacs.org.
+  *
+  * To help us fund GROMACS development, we humbly ask that you cite
+  * the research papers on the package. Check out http://www.gromacs.org.
+  */
+ /*
+  * Note: this file was generated by the GROMACS sparc64_hpc_ace_double kernel generator.
+  */
+ #ifdef HAVE_CONFIG_H
+ #include <config.h>
+ #endif
+ #include <math.h>
+ #include "../nb_kernel.h"
+ #include "types/simple.h"
+ #include "vec.h"
+ #include "nrnb.h"
+ #include "kernelutil_sparc64_hpc_ace_double.h"
+ /*
+  * Gromacs nonbonded kernel:   nb_kernel_ElecEwSw_VdwNone_GeomW4W4_VF_sparc64_hpc_ace_double
+  * Electrostatics interaction: Ewald
+  * VdW interaction:            None
+  * Geometry:                   Water4-Water4
+  * Calculate force/pot:        PotentialAndForce
+  */
+ void
+ nb_kernel_ElecEwSw_VdwNone_GeomW4W4_VF_sparc64_hpc_ace_double
+                     (t_nblist * gmx_restrict                nlist,
+                      rvec * gmx_restrict                    xx,
+                      rvec * gmx_restrict                    ff,
+                      t_forcerec * gmx_restrict              fr,
+                      t_mdatoms * gmx_restrict               mdatoms,
+                      nb_kernel_data_t * gmx_restrict        kernel_data,
+                      t_nrnb * gmx_restrict                  nrnb)
+ {
+     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+      * just 0 for non-waters.
+      * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+      * jnr indices corresponding to data put in the two positions in the SIMD register.
+      */
+     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+     int              jnrA,jnrB;
+     int              j_coord_offsetA,j_coord_offsetB;
+     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+     real             rcutoff_scalar;
+     real             *shiftvec,*fshift,*x,*f;
+     _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+     int              vdwioffset1;
+     _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+     int              vdwioffset2;
+     _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+     int              vdwioffset3;
+     _fjsp_v2r8       ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+     int              vdwjidx1A,vdwjidx1B;
+     _fjsp_v2r8       jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+     int              vdwjidx2A,vdwjidx2B;
+     _fjsp_v2r8       jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+     int              vdwjidx3A,vdwjidx3B;
+     _fjsp_v2r8       jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
+     _fjsp_v2r8       dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+     _fjsp_v2r8       dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+     _fjsp_v2r8       dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
+     _fjsp_v2r8       dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+     _fjsp_v2r8       dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+     _fjsp_v2r8       dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
+     _fjsp_v2r8       dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
+     _fjsp_v2r8       dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
+     _fjsp_v2r8       dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
+     _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+     real             *charge;
+     _fjsp_v2r8       ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+     real             *ewtab;
+     _fjsp_v2r8       rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
+     real             rswitch_scalar,d_scalar;
+     _fjsp_v2r8       itab_tmp;
+     _fjsp_v2r8       dummy_mask,cutoff_mask;
+     _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+     _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+     union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+     x                = xx[0];
+     f                = ff[0];
+     nri              = nlist->nri;
+     iinr             = nlist->iinr;
+     jindex           = nlist->jindex;
+     jjnr             = nlist->jjnr;
+     shiftidx         = nlist->shift;
+     gid              = nlist->gid;
+     shiftvec         = fr->shift_vec[0];
+     fshift           = fr->fshift[0];
+     facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+     charge           = mdatoms->chargeA;
+     sh_ewald         = gmx_fjsp_set1_v2r8(fr->ic->sh_ewald);
+     ewtab            = fr->ic->tabq_coul_FDV0;
+     ewtabscale       = gmx_fjsp_set1_v2r8(fr->ic->tabq_scale);
+     ewtabhalfspace   = gmx_fjsp_set1_v2r8(0.5/fr->ic->tabq_scale);
+     /* Setup water-specific parameters */
+     inr              = nlist->iinr[0];
+     iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+     iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+     iq3              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+3]));
+     jq1              = gmx_fjsp_set1_v2r8(charge[inr+1]);
+     jq2              = gmx_fjsp_set1_v2r8(charge[inr+2]);
+     jq3              = gmx_fjsp_set1_v2r8(charge[inr+3]);
+     qq11             = _fjsp_mul_v2r8(iq1,jq1);
+     qq12             = _fjsp_mul_v2r8(iq1,jq2);
+     qq13             = _fjsp_mul_v2r8(iq1,jq3);
+     qq21             = _fjsp_mul_v2r8(iq2,jq1);
+     qq22             = _fjsp_mul_v2r8(iq2,jq2);
+     qq23             = _fjsp_mul_v2r8(iq2,jq3);
+     qq31             = _fjsp_mul_v2r8(iq3,jq1);
+     qq32             = _fjsp_mul_v2r8(iq3,jq2);
+     qq33             = _fjsp_mul_v2r8(iq3,jq3);
+     /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+     rcutoff_scalar   = fr->rcoulomb;
+     rcutoff          = gmx_fjsp_set1_v2r8(rcutoff_scalar);
+     rcutoff2         = _fjsp_mul_v2r8(rcutoff,rcutoff);
+     rswitch_scalar   = fr->rcoulomb_switch;
+     rswitch          = gmx_fjsp_set1_v2r8(rswitch_scalar);
+     /* Setup switch parameters */
+     d_scalar         = rcutoff_scalar-rswitch_scalar;
+     d                = gmx_fjsp_set1_v2r8(d_scalar);
+     swV3             = gmx_fjsp_set1_v2r8(-10.0/(d_scalar*d_scalar*d_scalar));
+     swV4             = gmx_fjsp_set1_v2r8( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
+     swV5             = gmx_fjsp_set1_v2r8( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
+     swF2             = gmx_fjsp_set1_v2r8(-30.0/(d_scalar*d_scalar*d_scalar));
+     swF3             = gmx_fjsp_set1_v2r8( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
+     swF4             = gmx_fjsp_set1_v2r8(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
+     /* Avoid stupid compiler warnings */
+     jnrA = jnrB = 0;
+     j_coord_offsetA = 0;
+     j_coord_offsetB = 0;
+     outeriter        = 0;
+     inneriter        = 0;
+     /* Start outer loop over neighborlists */
+     for(iidx=0; iidx<nri; iidx++)
+     {
+         /* Load shift vector for this list */
+         i_shift_offset   = DIM*shiftidx[iidx];
+         /* Load limits for loop over neighbors */
+         j_index_start    = jindex[iidx];
+         j_index_end      = jindex[iidx+1];
+         /* Get outer coordinate index */
+         inr              = iinr[iidx];
+         i_coord_offset   = DIM*inr;
+         /* Load i particle coords and add shift vector */
+         gmx_fjsp_load_shift_and_3rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset+DIM,
+                                                  &ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+         fix1             = _fjsp_setzero_v2r8();
+         fiy1             = _fjsp_setzero_v2r8();
+         fiz1             = _fjsp_setzero_v2r8();
+         fix2             = _fjsp_setzero_v2r8();
+         fiy2             = _fjsp_setzero_v2r8();
+         fiz2             = _fjsp_setzero_v2r8();
+         fix3             = _fjsp_setzero_v2r8();
+         fiy3             = _fjsp_setzero_v2r8();
+         fiz3             = _fjsp_setzero_v2r8();
+         /* Reset potential sums */
+         velecsum         = _fjsp_setzero_v2r8();
+         /* Start inner kernel loop */
+         for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+         {
+             /* Get j neighbor index, and coordinate index */
+             jnrA             = jjnr[jidx];
+             jnrB             = jjnr[jidx+1];
+             j_coord_offsetA  = DIM*jnrA;
+             j_coord_offsetB  = DIM*jnrB;
+             /* load j atom coordinates */
+             gmx_fjsp_load_3rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA+DIM,x+j_coord_offsetB+DIM,
+                                               &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
+             /* Calculate displacement vector */
+             dx11             = _fjsp_sub_v2r8(ix1,jx1);
+             dy11             = _fjsp_sub_v2r8(iy1,jy1);
+             dz11             = _fjsp_sub_v2r8(iz1,jz1);
+             dx12             = _fjsp_sub_v2r8(ix1,jx2);
+             dy12             = _fjsp_sub_v2r8(iy1,jy2);
+             dz12             = _fjsp_sub_v2r8(iz1,jz2);
+             dx13             = _fjsp_sub_v2r8(ix1,jx3);
+             dy13             = _fjsp_sub_v2r8(iy1,jy3);
+             dz13             = _fjsp_sub_v2r8(iz1,jz3);
+             dx21             = _fjsp_sub_v2r8(ix2,jx1);
+             dy21             = _fjsp_sub_v2r8(iy2,jy1);
+             dz21             = _fjsp_sub_v2r8(iz2,jz1);
+             dx22             = _fjsp_sub_v2r8(ix2,jx2);
+             dy22             = _fjsp_sub_v2r8(iy2,jy2);
+             dz22             = _fjsp_sub_v2r8(iz2,jz2);
+             dx23             = _fjsp_sub_v2r8(ix2,jx3);
+             dy23             = _fjsp_sub_v2r8(iy2,jy3);
+             dz23             = _fjsp_sub_v2r8(iz2,jz3);
+             dx31             = _fjsp_sub_v2r8(ix3,jx1);
+             dy31             = _fjsp_sub_v2r8(iy3,jy1);
+             dz31             = _fjsp_sub_v2r8(iz3,jz1);
+             dx32             = _fjsp_sub_v2r8(ix3,jx2);
+             dy32             = _fjsp_sub_v2r8(iy3,jy2);
+             dz32             = _fjsp_sub_v2r8(iz3,jz2);
+             dx33             = _fjsp_sub_v2r8(ix3,jx3);
+             dy33             = _fjsp_sub_v2r8(iy3,jy3);
+             dz33             = _fjsp_sub_v2r8(iz3,jz3);
+             /* Calculate squared distance and things based on it */
+             rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+             rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+             rsq13            = gmx_fjsp_calc_rsq_v2r8(dx13,dy13,dz13);
+             rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+             rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+             rsq23            = gmx_fjsp_calc_rsq_v2r8(dx23,dy23,dz23);
+             rsq31            = gmx_fjsp_calc_rsq_v2r8(dx31,dy31,dz31);
+             rsq32            = gmx_fjsp_calc_rsq_v2r8(dx32,dy32,dz32);
+             rsq33            = gmx_fjsp_calc_rsq_v2r8(dx33,dy33,dz33);
+             rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+             rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+             rinv13           = gmx_fjsp_invsqrt_v2r8(rsq13);
+             rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+             rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+             rinv23           = gmx_fjsp_invsqrt_v2r8(rsq23);
+             rinv31           = gmx_fjsp_invsqrt_v2r8(rsq31);
+             rinv32           = gmx_fjsp_invsqrt_v2r8(rsq32);
+             rinv33           = gmx_fjsp_invsqrt_v2r8(rsq33);
+             rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+             rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+             rinvsq13         = _fjsp_mul_v2r8(rinv13,rinv13);
+             rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+             rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+             rinvsq23         = _fjsp_mul_v2r8(rinv23,rinv23);
+             rinvsq31         = _fjsp_mul_v2r8(rinv31,rinv31);
+             rinvsq32         = _fjsp_mul_v2r8(rinv32,rinv32);
+             rinvsq33         = _fjsp_mul_v2r8(rinv33,rinv33);
+             fjx1             = _fjsp_setzero_v2r8();
+             fjy1             = _fjsp_setzero_v2r8();
+             fjz1             = _fjsp_setzero_v2r8();
+             fjx2             = _fjsp_setzero_v2r8();
+             fjy2             = _fjsp_setzero_v2r8();
+             fjz2             = _fjsp_setzero_v2r8();
+             fjx3             = _fjsp_setzero_v2r8();
+             fjy3             = _fjsp_setzero_v2r8();
+             fjz3             = _fjsp_setzero_v2r8();
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq11,rcutoff2))
+             {
+             r11              = _fjsp_mul_v2r8(rsq11,rinv11);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r11,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq11,_fjsp_sub_v2r8(rinv11,velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq11,rinv11),_fjsp_sub_v2r8(rinvsq11,felec));
+             d                = _fjsp_sub_v2r8(r11,rswitch);
+             d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+             d2               = _fjsp_mul_v2r8(d,d);
+             sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+             dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+             /* Evaluate switch function */
+             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+             felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv11,_fjsp_mul_v2r8(velec,dsw)) );
+             velec            = _fjsp_mul_v2r8(velec,sw);
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq11,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+             
+             fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq12,rcutoff2))
+             {
+             r12              = _fjsp_mul_v2r8(rsq12,rinv12);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r12,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq12,_fjsp_sub_v2r8(rinv12,velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq12,rinv12),_fjsp_sub_v2r8(rinvsq12,felec));
+             d                = _fjsp_sub_v2r8(r12,rswitch);
+             d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+             d2               = _fjsp_mul_v2r8(d,d);
+             sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+             dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+             /* Evaluate switch function */
+             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+             felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv12,_fjsp_mul_v2r8(velec,dsw)) );
+             velec            = _fjsp_mul_v2r8(velec,sw);
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq12,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+             
+             fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq13,rcutoff2))
+             {
+             r13              = _fjsp_mul_v2r8(rsq13,rinv13);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r13,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq13,_fjsp_sub_v2r8(rinv13,velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq13,rinv13),_fjsp_sub_v2r8(rinvsq13,felec));
+             d                = _fjsp_sub_v2r8(r13,rswitch);
+             d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+             d2               = _fjsp_mul_v2r8(d,d);
+             sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+             dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+             /* Evaluate switch function */
+             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+             felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv13,_fjsp_mul_v2r8(velec,dsw)) );
+             velec            = _fjsp_mul_v2r8(velec,sw);
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq13,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx13,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy13,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz13,fscal,fiz1);
+             
+             fjx3             = _fjsp_madd_v2r8(dx13,fscal,fjx3);
+             fjy3             = _fjsp_madd_v2r8(dy13,fscal,fjy3);
+             fjz3             = _fjsp_madd_v2r8(dz13,fscal,fjz3);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq21,rcutoff2))
+             {
+             r21              = _fjsp_mul_v2r8(rsq21,rinv21);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r21,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq21,_fjsp_sub_v2r8(rinv21,velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq21,rinv21),_fjsp_sub_v2r8(rinvsq21,felec));
+             d                = _fjsp_sub_v2r8(r21,rswitch);
+             d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+             d2               = _fjsp_mul_v2r8(d,d);
+             sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+             dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+             /* Evaluate switch function */
+             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+             felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv21,_fjsp_mul_v2r8(velec,dsw)) );
+             velec            = _fjsp_mul_v2r8(velec,sw);
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq21,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+             
+             fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq22,rcutoff2))
+             {
+             r22              = _fjsp_mul_v2r8(rsq22,rinv22);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r22,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq22,_fjsp_sub_v2r8(rinv22,velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq22,rinv22),_fjsp_sub_v2r8(rinvsq22,felec));
+             d                = _fjsp_sub_v2r8(r22,rswitch);
+             d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+             d2               = _fjsp_mul_v2r8(d,d);
+             sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+             dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+             /* Evaluate switch function */
+             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+             felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv22,_fjsp_mul_v2r8(velec,dsw)) );
+             velec            = _fjsp_mul_v2r8(velec,sw);
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq22,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+             
+             fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq23,rcutoff2))
+             {
+             r23              = _fjsp_mul_v2r8(rsq23,rinv23);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r23,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq23,_fjsp_sub_v2r8(rinv23,velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq23,rinv23),_fjsp_sub_v2r8(rinvsq23,felec));
+             d                = _fjsp_sub_v2r8(r23,rswitch);
+             d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+             d2               = _fjsp_mul_v2r8(d,d);
+             sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+             dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+             /* Evaluate switch function */
+             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+             felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv23,_fjsp_mul_v2r8(velec,dsw)) );
+             velec            = _fjsp_mul_v2r8(velec,sw);
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq23,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx23,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy23,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz23,fscal,fiz2);
+             
+             fjx3             = _fjsp_madd_v2r8(dx23,fscal,fjx3);
+             fjy3             = _fjsp_madd_v2r8(dy23,fscal,fjy3);
+             fjz3             = _fjsp_madd_v2r8(dz23,fscal,fjz3);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq31,rcutoff2))
+             {
+             r31              = _fjsp_mul_v2r8(rsq31,rinv31);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r31,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq31,_fjsp_sub_v2r8(rinv31,velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq31,rinv31),_fjsp_sub_v2r8(rinvsq31,felec));
+             d                = _fjsp_sub_v2r8(r31,rswitch);
+             d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+             d2               = _fjsp_mul_v2r8(d,d);
+             sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+             dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+             /* Evaluate switch function */
+             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+             felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv31,_fjsp_mul_v2r8(velec,dsw)) );
+             velec            = _fjsp_mul_v2r8(velec,sw);
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq31,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix3             = _fjsp_madd_v2r8(dx31,fscal,fix3);
+             fiy3             = _fjsp_madd_v2r8(dy31,fscal,fiy3);
+             fiz3             = _fjsp_madd_v2r8(dz31,fscal,fiz3);
+             
+             fjx1             = _fjsp_madd_v2r8(dx31,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy31,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz31,fscal,fjz1);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq32,rcutoff2))
+             {
+             r32              = _fjsp_mul_v2r8(rsq32,rinv32);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r32,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq32,_fjsp_sub_v2r8(rinv32,velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq32,rinv32),_fjsp_sub_v2r8(rinvsq32,felec));
+             d                = _fjsp_sub_v2r8(r32,rswitch);
+             d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+             d2               = _fjsp_mul_v2r8(d,d);
+             sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+             dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+             /* Evaluate switch function */
+             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+             felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv32,_fjsp_mul_v2r8(velec,dsw)) );
+             velec            = _fjsp_mul_v2r8(velec,sw);
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq32,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix3             = _fjsp_madd_v2r8(dx32,fscal,fix3);
+             fiy3             = _fjsp_madd_v2r8(dy32,fscal,fiy3);
+             fiz3             = _fjsp_madd_v2r8(dz32,fscal,fiz3);
+             
+             fjx2             = _fjsp_madd_v2r8(dx32,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy32,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz32,fscal,fjz2);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq33,rcutoff2))
+             {
+             r33              = _fjsp_mul_v2r8(rsq33,rinv33);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r33,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq33,_fjsp_sub_v2r8(rinv33,velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq33,rinv33),_fjsp_sub_v2r8(rinvsq33,felec));
+             d                = _fjsp_sub_v2r8(r33,rswitch);
+             d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+             d2               = _fjsp_mul_v2r8(d,d);
+             sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+             dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+             /* Evaluate switch function */
+             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+             felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv33,_fjsp_mul_v2r8(velec,dsw)) );
+             velec            = _fjsp_mul_v2r8(velec,sw);
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq33,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix3             = _fjsp_madd_v2r8(dx33,fscal,fix3);
+             fiy3             = _fjsp_madd_v2r8(dy33,fscal,fiy3);
+             fiz3             = _fjsp_madd_v2r8(dz33,fscal,fiz3);
+             
+             fjx3             = _fjsp_madd_v2r8(dx33,fscal,fjx3);
+             fjy3             = _fjsp_madd_v2r8(dy33,fscal,fjy3);
+             fjz3             = _fjsp_madd_v2r8(dz33,fscal,fjz3);
+             }
+             gmx_fjsp_decrement_3rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA+DIM,f+j_coord_offsetB+DIM,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+             /* Inner loop uses 612 flops */
+         }
+         if(jidx<j_index_end)
+         {
+             jnrA             = jjnr[jidx];
+             j_coord_offsetA  = DIM*jnrA;
+             /* load j atom coordinates */
+             gmx_fjsp_load_3rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA+DIM,
+                                               &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
+             /* Calculate displacement vector */
+             dx11             = _fjsp_sub_v2r8(ix1,jx1);
+             dy11             = _fjsp_sub_v2r8(iy1,jy1);
+             dz11             = _fjsp_sub_v2r8(iz1,jz1);
+             dx12             = _fjsp_sub_v2r8(ix1,jx2);
+             dy12             = _fjsp_sub_v2r8(iy1,jy2);
+             dz12             = _fjsp_sub_v2r8(iz1,jz2);
+             dx13             = _fjsp_sub_v2r8(ix1,jx3);
+             dy13             = _fjsp_sub_v2r8(iy1,jy3);
+             dz13             = _fjsp_sub_v2r8(iz1,jz3);
+             dx21             = _fjsp_sub_v2r8(ix2,jx1);
+             dy21             = _fjsp_sub_v2r8(iy2,jy1);
+             dz21             = _fjsp_sub_v2r8(iz2,jz1);
+             dx22             = _fjsp_sub_v2r8(ix2,jx2);
+             dy22             = _fjsp_sub_v2r8(iy2,jy2);
+             dz22             = _fjsp_sub_v2r8(iz2,jz2);
+             dx23             = _fjsp_sub_v2r8(ix2,jx3);
+             dy23             = _fjsp_sub_v2r8(iy2,jy3);
+             dz23             = _fjsp_sub_v2r8(iz2,jz3);
+             dx31             = _fjsp_sub_v2r8(ix3,jx1);
+             dy31             = _fjsp_sub_v2r8(iy3,jy1);
+             dz31             = _fjsp_sub_v2r8(iz3,jz1);
+             dx32             = _fjsp_sub_v2r8(ix3,jx2);
+             dy32             = _fjsp_sub_v2r8(iy3,jy2);
+             dz32             = _fjsp_sub_v2r8(iz3,jz2);
+             dx33             = _fjsp_sub_v2r8(ix3,jx3);
+             dy33             = _fjsp_sub_v2r8(iy3,jy3);
+             dz33             = _fjsp_sub_v2r8(iz3,jz3);
+             /* Calculate squared distance and things based on it */
+             rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+             rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+             rsq13            = gmx_fjsp_calc_rsq_v2r8(dx13,dy13,dz13);
+             rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+             rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+             rsq23            = gmx_fjsp_calc_rsq_v2r8(dx23,dy23,dz23);
+             rsq31            = gmx_fjsp_calc_rsq_v2r8(dx31,dy31,dz31);
+             rsq32            = gmx_fjsp_calc_rsq_v2r8(dx32,dy32,dz32);
+             rsq33            = gmx_fjsp_calc_rsq_v2r8(dx33,dy33,dz33);
+             rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+             rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+             rinv13           = gmx_fjsp_invsqrt_v2r8(rsq13);
+             rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+             rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+             rinv23           = gmx_fjsp_invsqrt_v2r8(rsq23);
+             rinv31           = gmx_fjsp_invsqrt_v2r8(rsq31);
+             rinv32           = gmx_fjsp_invsqrt_v2r8(rsq32);
+             rinv33           = gmx_fjsp_invsqrt_v2r8(rsq33);
+             rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+             rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+             rinvsq13         = _fjsp_mul_v2r8(rinv13,rinv13);
+             rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+             rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+             rinvsq23         = _fjsp_mul_v2r8(rinv23,rinv23);
+             rinvsq31         = _fjsp_mul_v2r8(rinv31,rinv31);
+             rinvsq32         = _fjsp_mul_v2r8(rinv32,rinv32);
+             rinvsq33         = _fjsp_mul_v2r8(rinv33,rinv33);
+             fjx1             = _fjsp_setzero_v2r8();
+             fjy1             = _fjsp_setzero_v2r8();
+             fjz1             = _fjsp_setzero_v2r8();
+             fjx2             = _fjsp_setzero_v2r8();
+             fjy2             = _fjsp_setzero_v2r8();
+             fjz2             = _fjsp_setzero_v2r8();
+             fjx3             = _fjsp_setzero_v2r8();
+             fjy3             = _fjsp_setzero_v2r8();
+             fjz3             = _fjsp_setzero_v2r8();
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq11,rcutoff2))
+             {
+             r11              = _fjsp_mul_v2r8(rsq11,rinv11);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r11,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq11,_fjsp_sub_v2r8(rinv11,velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq11,rinv11),_fjsp_sub_v2r8(rinvsq11,felec));
+             d                = _fjsp_sub_v2r8(r11,rswitch);
+             d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+             d2               = _fjsp_mul_v2r8(d,d);
+             sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+             dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+             /* Evaluate switch function */
+             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+             felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv11,_fjsp_mul_v2r8(velec,dsw)) );
+             velec            = _fjsp_mul_v2r8(velec,sw);
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq11,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+             
+             fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq12,rcutoff2))
+             {
+             r12              = _fjsp_mul_v2r8(rsq12,rinv12);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r12,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq12,_fjsp_sub_v2r8(rinv12,velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq12,rinv12),_fjsp_sub_v2r8(rinvsq12,felec));
+             d                = _fjsp_sub_v2r8(r12,rswitch);
+             d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+             d2               = _fjsp_mul_v2r8(d,d);
+             sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+             dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+             /* Evaluate switch function */
+             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+             felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv12,_fjsp_mul_v2r8(velec,dsw)) );
+             velec            = _fjsp_mul_v2r8(velec,sw);
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq12,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+             
+             fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq13,rcutoff2))
+             {
+             r13              = _fjsp_mul_v2r8(rsq13,rinv13);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r13,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq13,_fjsp_sub_v2r8(rinv13,velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq13,rinv13),_fjsp_sub_v2r8(rinvsq13,felec));
+             d                = _fjsp_sub_v2r8(r13,rswitch);
+             d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+             d2               = _fjsp_mul_v2r8(d,d);
+             sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+             dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+             /* Evaluate switch function */
+             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+             felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv13,_fjsp_mul_v2r8(velec,dsw)) );
+             velec            = _fjsp_mul_v2r8(velec,sw);
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq13,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx13,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy13,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz13,fscal,fiz1);
+             
+             fjx3             = _fjsp_madd_v2r8(dx13,fscal,fjx3);
+             fjy3             = _fjsp_madd_v2r8(dy13,fscal,fjy3);
+             fjz3             = _fjsp_madd_v2r8(dz13,fscal,fjz3);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq21,rcutoff2))
+             {
+             r21              = _fjsp_mul_v2r8(rsq21,rinv21);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r21,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq21,_fjsp_sub_v2r8(rinv21,velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq21,rinv21),_fjsp_sub_v2r8(rinvsq21,felec));
+             d                = _fjsp_sub_v2r8(r21,rswitch);
+             d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+             d2               = _fjsp_mul_v2r8(d,d);
+             sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+             dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+             /* Evaluate switch function */
+             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+             felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv21,_fjsp_mul_v2r8(velec,dsw)) );
+             velec            = _fjsp_mul_v2r8(velec,sw);
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq21,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+             
+             fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq22,rcutoff2))
+             {
+             r22              = _fjsp_mul_v2r8(rsq22,rinv22);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r22,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq22,_fjsp_sub_v2r8(rinv22,velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq22,rinv22),_fjsp_sub_v2r8(rinvsq22,felec));
+             d                = _fjsp_sub_v2r8(r22,rswitch);
+             d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+             d2               = _fjsp_mul_v2r8(d,d);
+             sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+             dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+             /* Evaluate switch function */
+             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+             felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv22,_fjsp_mul_v2r8(velec,dsw)) );
+             velec            = _fjsp_mul_v2r8(velec,sw);
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq22,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+             
+             fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq23,rcutoff2))
+             {
+             r23              = _fjsp_mul_v2r8(rsq23,rinv23);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r23,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq23,_fjsp_sub_v2r8(rinv23,velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq23,rinv23),_fjsp_sub_v2r8(rinvsq23,felec));
+             d                = _fjsp_sub_v2r8(r23,rswitch);
+             d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+             d2               = _fjsp_mul_v2r8(d,d);
+             sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+             dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+             /* Evaluate switch function */
+             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+             felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv23,_fjsp_mul_v2r8(velec,dsw)) );
+             velec            = _fjsp_mul_v2r8(velec,sw);
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq23,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx23,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy23,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz23,fscal,fiz2);
+             
+             fjx3             = _fjsp_madd_v2r8(dx23,fscal,fjx3);
+             fjy3             = _fjsp_madd_v2r8(dy23,fscal,fjy3);
+             fjz3             = _fjsp_madd_v2r8(dz23,fscal,fjz3);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq31,rcutoff2))
+             {
+             r31              = _fjsp_mul_v2r8(rsq31,rinv31);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r31,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq31,_fjsp_sub_v2r8(rinv31,velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq31,rinv31),_fjsp_sub_v2r8(rinvsq31,felec));
+             d                = _fjsp_sub_v2r8(r31,rswitch);
+             d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+             d2               = _fjsp_mul_v2r8(d,d);
+             sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+             dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+             /* Evaluate switch function */
+             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+             felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv31,_fjsp_mul_v2r8(velec,dsw)) );
+             velec            = _fjsp_mul_v2r8(velec,sw);
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq31,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix3             = _fjsp_madd_v2r8(dx31,fscal,fix3);
+             fiy3             = _fjsp_madd_v2r8(dy31,fscal,fiy3);
+             fiz3             = _fjsp_madd_v2r8(dz31,fscal,fiz3);
+             
+             fjx1             = _fjsp_madd_v2r8(dx31,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy31,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz31,fscal,fjz1);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq32,rcutoff2))
+             {
+             r32              = _fjsp_mul_v2r8(rsq32,rinv32);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r32,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq32,_fjsp_sub_v2r8(rinv32,velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq32,rinv32),_fjsp_sub_v2r8(rinvsq32,felec));
+             d                = _fjsp_sub_v2r8(r32,rswitch);
+             d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+             d2               = _fjsp_mul_v2r8(d,d);
+             sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+             dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+             /* Evaluate switch function */
+             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+             felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv32,_fjsp_mul_v2r8(velec,dsw)) );
+             velec            = _fjsp_mul_v2r8(velec,sw);
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq32,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix3             = _fjsp_madd_v2r8(dx32,fscal,fix3);
+             fiy3             = _fjsp_madd_v2r8(dy32,fscal,fiy3);
+             fiz3             = _fjsp_madd_v2r8(dz32,fscal,fiz3);
+             
+             fjx2             = _fjsp_madd_v2r8(dx32,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy32,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz32,fscal,fjz2);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq33,rcutoff2))
+             {
+             r33              = _fjsp_mul_v2r8(rsq33,rinv33);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r33,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq33,_fjsp_sub_v2r8(rinv33,velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq33,rinv33),_fjsp_sub_v2r8(rinvsq33,felec));
+             d                = _fjsp_sub_v2r8(r33,rswitch);
+             d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+             d2               = _fjsp_mul_v2r8(d,d);
+             sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+             dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+             /* Evaluate switch function */
+             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+             felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv33,_fjsp_mul_v2r8(velec,dsw)) );
+             velec            = _fjsp_mul_v2r8(velec,sw);
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq33,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix3             = _fjsp_madd_v2r8(dx33,fscal,fix3);
+             fiy3             = _fjsp_madd_v2r8(dy33,fscal,fiy3);
+             fiz3             = _fjsp_madd_v2r8(dz33,fscal,fiz3);
+             
+             fjx3             = _fjsp_madd_v2r8(dx33,fscal,fjx3);
+             fjy3             = _fjsp_madd_v2r8(dy33,fscal,fjy3);
+             fjz3             = _fjsp_madd_v2r8(dz33,fscal,fjz3);
+             }
+             gmx_fjsp_decrement_3rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA+DIM,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+             /* Inner loop uses 612 flops */
+         }
+         /* End of innermost loop */
+         gmx_fjsp_update_iforce_3atom_swizzle_v2r8(fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                               f+i_coord_offset+DIM,fshift+i_shift_offset);
+         ggid                        = gid[iidx];
+         /* Update potential energies */
+         gmx_fjsp_update_1pot_v2r8(velecsum,kernel_data->energygrp_elec+ggid);
+         /* Increment number of inner iterations */
+         inneriter                  += j_index_end - j_index_start;
+         /* Outer loop uses 19 flops */
+     }
+     /* Increment number of outer iterations */
+     outeriter        += nri;
+     /* Update outer/inner flops */
+     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4W4_VF,outeriter*19 + inneriter*612);
+ }
+ /*
+  * Gromacs nonbonded kernel:   nb_kernel_ElecEwSw_VdwNone_GeomW4W4_F_sparc64_hpc_ace_double
+  * Electrostatics interaction: Ewald
+  * VdW interaction:            None
+  * Geometry:                   Water4-Water4
+  * Calculate force/pot:        Force
+  */
+ void
+ nb_kernel_ElecEwSw_VdwNone_GeomW4W4_F_sparc64_hpc_ace_double
+                     (t_nblist * gmx_restrict                nlist,
+                      rvec * gmx_restrict                    xx,
+                      rvec * gmx_restrict                    ff,
+                      t_forcerec * gmx_restrict              fr,
+                      t_mdatoms * gmx_restrict               mdatoms,
+                      nb_kernel_data_t * gmx_restrict        kernel_data,
+                      t_nrnb * gmx_restrict                  nrnb)
+ {
+     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+      * just 0 for non-waters.
+      * Suffixes A,B refer to j loop unrolling done with double precision SIMD, i.e. the two different
+      * jnr indices corresponding to data put in the two positions in the SIMD register.
+      */
+     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+     int              jnrA,jnrB;
+     int              j_coord_offsetA,j_coord_offsetB;
+     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+     real             rcutoff_scalar;
+     real             *shiftvec,*fshift,*x,*f;
+     _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+     int              vdwioffset1;
+     _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+     int              vdwioffset2;
+     _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+     int              vdwioffset3;
+     _fjsp_v2r8       ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+     int              vdwjidx1A,vdwjidx1B;
+     _fjsp_v2r8       jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+     int              vdwjidx2A,vdwjidx2B;
+     _fjsp_v2r8       jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+     int              vdwjidx3A,vdwjidx3B;
+     _fjsp_v2r8       jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
+     _fjsp_v2r8       dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+     _fjsp_v2r8       dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+     _fjsp_v2r8       dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
+     _fjsp_v2r8       dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+     _fjsp_v2r8       dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+     _fjsp_v2r8       dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
+     _fjsp_v2r8       dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
+     _fjsp_v2r8       dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
+     _fjsp_v2r8       dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
+     _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+     real             *charge;
+     _fjsp_v2r8       ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+     real             *ewtab;
+     _fjsp_v2r8       rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
+     real             rswitch_scalar,d_scalar;
+     _fjsp_v2r8       itab_tmp;
+     _fjsp_v2r8       dummy_mask,cutoff_mask;
+     _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+     _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+     union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+     x                = xx[0];
+     f                = ff[0];
+     nri              = nlist->nri;
+     iinr             = nlist->iinr;
+     jindex           = nlist->jindex;
+     jjnr             = nlist->jjnr;
+     shiftidx         = nlist->shift;
+     gid              = nlist->gid;
+     shiftvec         = fr->shift_vec[0];
+     fshift           = fr->fshift[0];
+     facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+     charge           = mdatoms->chargeA;
+     sh_ewald         = gmx_fjsp_set1_v2r8(fr->ic->sh_ewald);
+     ewtab            = fr->ic->tabq_coul_FDV0;
+     ewtabscale       = gmx_fjsp_set1_v2r8(fr->ic->tabq_scale);
+     ewtabhalfspace   = gmx_fjsp_set1_v2r8(0.5/fr->ic->tabq_scale);
+     /* Setup water-specific parameters */
+     inr              = nlist->iinr[0];
+     iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+     iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+     iq3              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+3]));
+     jq1              = gmx_fjsp_set1_v2r8(charge[inr+1]);
+     jq2              = gmx_fjsp_set1_v2r8(charge[inr+2]);
+     jq3              = gmx_fjsp_set1_v2r8(charge[inr+3]);
+     qq11             = _fjsp_mul_v2r8(iq1,jq1);
+     qq12             = _fjsp_mul_v2r8(iq1,jq2);
+     qq13             = _fjsp_mul_v2r8(iq1,jq3);
+     qq21             = _fjsp_mul_v2r8(iq2,jq1);
+     qq22             = _fjsp_mul_v2r8(iq2,jq2);
+     qq23             = _fjsp_mul_v2r8(iq2,jq3);
+     qq31             = _fjsp_mul_v2r8(iq3,jq1);
+     qq32             = _fjsp_mul_v2r8(iq3,jq2);
+     qq33             = _fjsp_mul_v2r8(iq3,jq3);
+     /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+     rcutoff_scalar   = fr->rcoulomb;
+     rcutoff          = gmx_fjsp_set1_v2r8(rcutoff_scalar);
+     rcutoff2         = _fjsp_mul_v2r8(rcutoff,rcutoff);
+     rswitch_scalar   = fr->rcoulomb_switch;
+     rswitch          = gmx_fjsp_set1_v2r8(rswitch_scalar);
+     /* Setup switch parameters */
+     d_scalar         = rcutoff_scalar-rswitch_scalar;
+     d                = gmx_fjsp_set1_v2r8(d_scalar);
+     swV3             = gmx_fjsp_set1_v2r8(-10.0/(d_scalar*d_scalar*d_scalar));
+     swV4             = gmx_fjsp_set1_v2r8( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
+     swV5             = gmx_fjsp_set1_v2r8( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
+     swF2             = gmx_fjsp_set1_v2r8(-30.0/(d_scalar*d_scalar*d_scalar));
+     swF3             = gmx_fjsp_set1_v2r8( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
+     swF4             = gmx_fjsp_set1_v2r8(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
+     /* Avoid stupid compiler warnings */
+     jnrA = jnrB = 0;
+     j_coord_offsetA = 0;
+     j_coord_offsetB = 0;
+     outeriter        = 0;
+     inneriter        = 0;
+     /* Start outer loop over neighborlists */
+     for(iidx=0; iidx<nri; iidx++)
+     {
+         /* Load shift vector for this list */
+         i_shift_offset   = DIM*shiftidx[iidx];
+         /* Load limits for loop over neighbors */
+         j_index_start    = jindex[iidx];
+         j_index_end      = jindex[iidx+1];
+         /* Get outer coordinate index */
+         inr              = iinr[iidx];
+         i_coord_offset   = DIM*inr;
+         /* Load i particle coords and add shift vector */
+         gmx_fjsp_load_shift_and_3rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset+DIM,
+                                                  &ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+         fix1             = _fjsp_setzero_v2r8();
+         fiy1             = _fjsp_setzero_v2r8();
+         fiz1             = _fjsp_setzero_v2r8();
+         fix2             = _fjsp_setzero_v2r8();
+         fiy2             = _fjsp_setzero_v2r8();
+         fiz2             = _fjsp_setzero_v2r8();
+         fix3             = _fjsp_setzero_v2r8();
+         fiy3             = _fjsp_setzero_v2r8();
+         fiz3             = _fjsp_setzero_v2r8();
+         /* Start inner kernel loop */
+         for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+         {
+             /* Get j neighbor index, and coordinate index */
+             jnrA             = jjnr[jidx];
+             jnrB             = jjnr[jidx+1];
+             j_coord_offsetA  = DIM*jnrA;
+             j_coord_offsetB  = DIM*jnrB;
+             /* load j atom coordinates */
+             gmx_fjsp_load_3rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA+DIM,x+j_coord_offsetB+DIM,
+                                               &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
+             /* Calculate displacement vector */
+             dx11             = _fjsp_sub_v2r8(ix1,jx1);
+             dy11             = _fjsp_sub_v2r8(iy1,jy1);
+             dz11             = _fjsp_sub_v2r8(iz1,jz1);
+             dx12             = _fjsp_sub_v2r8(ix1,jx2);
+             dy12             = _fjsp_sub_v2r8(iy1,jy2);
+             dz12             = _fjsp_sub_v2r8(iz1,jz2);
+             dx13             = _fjsp_sub_v2r8(ix1,jx3);
+             dy13             = _fjsp_sub_v2r8(iy1,jy3);
+             dz13             = _fjsp_sub_v2r8(iz1,jz3);
+             dx21             = _fjsp_sub_v2r8(ix2,jx1);
+             dy21             = _fjsp_sub_v2r8(iy2,jy1);
+             dz21             = _fjsp_sub_v2r8(iz2,jz1);
+             dx22             = _fjsp_sub_v2r8(ix2,jx2);
+             dy22             = _fjsp_sub_v2r8(iy2,jy2);
+             dz22             = _fjsp_sub_v2r8(iz2,jz2);
+             dx23             = _fjsp_sub_v2r8(ix2,jx3);
+             dy23             = _fjsp_sub_v2r8(iy2,jy3);
+             dz23             = _fjsp_sub_v2r8(iz2,jz3);
+             dx31             = _fjsp_sub_v2r8(ix3,jx1);
+             dy31             = _fjsp_sub_v2r8(iy3,jy1);
+             dz31             = _fjsp_sub_v2r8(iz3,jz1);
+             dx32             = _fjsp_sub_v2r8(ix3,jx2);
+             dy32             = _fjsp_sub_v2r8(iy3,jy2);
+             dz32             = _fjsp_sub_v2r8(iz3,jz2);
+             dx33             = _fjsp_sub_v2r8(ix3,jx3);
+             dy33             = _fjsp_sub_v2r8(iy3,jy3);
+             dz33             = _fjsp_sub_v2r8(iz3,jz3);
+             /* Calculate squared distance and things based on it */
+             rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+             rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+             rsq13            = gmx_fjsp_calc_rsq_v2r8(dx13,dy13,dz13);
+             rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+             rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+             rsq23            = gmx_fjsp_calc_rsq_v2r8(dx23,dy23,dz23);
+             rsq31            = gmx_fjsp_calc_rsq_v2r8(dx31,dy31,dz31);
+             rsq32            = gmx_fjsp_calc_rsq_v2r8(dx32,dy32,dz32);
+             rsq33            = gmx_fjsp_calc_rsq_v2r8(dx33,dy33,dz33);
+             rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+             rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+             rinv13           = gmx_fjsp_invsqrt_v2r8(rsq13);
+             rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+             rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+             rinv23           = gmx_fjsp_invsqrt_v2r8(rsq23);
+             rinv31           = gmx_fjsp_invsqrt_v2r8(rsq31);
+             rinv32           = gmx_fjsp_invsqrt_v2r8(rsq32);
+             rinv33           = gmx_fjsp_invsqrt_v2r8(rsq33);
+             rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+             rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+             rinvsq13         = _fjsp_mul_v2r8(rinv13,rinv13);
+             rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+             rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+             rinvsq23         = _fjsp_mul_v2r8(rinv23,rinv23);
+             rinvsq31         = _fjsp_mul_v2r8(rinv31,rinv31);
+             rinvsq32         = _fjsp_mul_v2r8(rinv32,rinv32);
+             rinvsq33         = _fjsp_mul_v2r8(rinv33,rinv33);
+             fjx1             = _fjsp_setzero_v2r8();
+             fjy1             = _fjsp_setzero_v2r8();
+             fjz1             = _fjsp_setzero_v2r8();
+             fjx2             = _fjsp_setzero_v2r8();
+             fjy2             = _fjsp_setzero_v2r8();
+             fjz2             = _fjsp_setzero_v2r8();
+             fjx3             = _fjsp_setzero_v2r8();
+             fjy3             = _fjsp_setzero_v2r8();
+             fjz3             = _fjsp_setzero_v2r8();
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq11,rcutoff2))
+             {
+             r11              = _fjsp_mul_v2r8(rsq11,rinv11);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r11,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq11,_fjsp_sub_v2r8(rinv11,velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq11,rinv11),_fjsp_sub_v2r8(rinvsq11,felec));
+             d                = _fjsp_sub_v2r8(r11,rswitch);
+             d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+             d2               = _fjsp_mul_v2r8(d,d);
+             sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+             dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+             /* Evaluate switch function */
+             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+             felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv11,_fjsp_mul_v2r8(velec,dsw)) );
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq11,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+             
+             fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq12,rcutoff2))
+             {
+             r12              = _fjsp_mul_v2r8(rsq12,rinv12);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r12,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq12,_fjsp_sub_v2r8(rinv12,velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq12,rinv12),_fjsp_sub_v2r8(rinvsq12,felec));
+             d                = _fjsp_sub_v2r8(r12,rswitch);
+             d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+             d2               = _fjsp_mul_v2r8(d,d);
+             sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+             dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+             /* Evaluate switch function */
+             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+             felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv12,_fjsp_mul_v2r8(velec,dsw)) );
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq12,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+             
+             fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq13,rcutoff2))
+             {
+             r13              = _fjsp_mul_v2r8(rsq13,rinv13);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r13,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq13,_fjsp_sub_v2r8(rinv13,velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq13,rinv13),_fjsp_sub_v2r8(rinvsq13,felec));
+             d                = _fjsp_sub_v2r8(r13,rswitch);
+             d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+             d2               = _fjsp_mul_v2r8(d,d);
+             sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+             dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+             /* Evaluate switch function */
+             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+             felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv13,_fjsp_mul_v2r8(velec,dsw)) );
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq13,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx13,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy13,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz13,fscal,fiz1);
+             
+             fjx3             = _fjsp_madd_v2r8(dx13,fscal,fjx3);
+             fjy3             = _fjsp_madd_v2r8(dy13,fscal,fjy3);
+             fjz3             = _fjsp_madd_v2r8(dz13,fscal,fjz3);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq21,rcutoff2))
+             {
+             r21              = _fjsp_mul_v2r8(rsq21,rinv21);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r21,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq21,_fjsp_sub_v2r8(rinv21,velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq21,rinv21),_fjsp_sub_v2r8(rinvsq21,felec));
+             d                = _fjsp_sub_v2r8(r21,rswitch);
+             d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+             d2               = _fjsp_mul_v2r8(d,d);
+             sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+             dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+             /* Evaluate switch function */
+             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+             felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv21,_fjsp_mul_v2r8(velec,dsw)) );
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq21,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+             
+             fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq22,rcutoff2))
+             {
+             r22              = _fjsp_mul_v2r8(rsq22,rinv22);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r22,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq22,_fjsp_sub_v2r8(rinv22,velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq22,rinv22),_fjsp_sub_v2r8(rinvsq22,felec));
+             d                = _fjsp_sub_v2r8(r22,rswitch);
+             d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+             d2               = _fjsp_mul_v2r8(d,d);
+             sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+             dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+             /* Evaluate switch function */
+             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+             felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv22,_fjsp_mul_v2r8(velec,dsw)) );
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq22,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+             
+             fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq23,rcutoff2))
+             {
+             r23              = _fjsp_mul_v2r8(rsq23,rinv23);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r23,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq23,_fjsp_sub_v2r8(rinv23,velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq23,rinv23),_fjsp_sub_v2r8(rinvsq23,felec));
+             d                = _fjsp_sub_v2r8(r23,rswitch);
+             d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+             d2               = _fjsp_mul_v2r8(d,d);
+             sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+             dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+             /* Evaluate switch function */
+             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+             felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv23,_fjsp_mul_v2r8(velec,dsw)) );
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq23,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx23,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy23,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz23,fscal,fiz2);
+             
+             fjx3             = _fjsp_madd_v2r8(dx23,fscal,fjx3);
+             fjy3             = _fjsp_madd_v2r8(dy23,fscal,fjy3);
+             fjz3             = _fjsp_madd_v2r8(dz23,fscal,fjz3);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq31,rcutoff2))
+             {
+             r31              = _fjsp_mul_v2r8(rsq31,rinv31);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r31,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq31,_fjsp_sub_v2r8(rinv31,velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq31,rinv31),_fjsp_sub_v2r8(rinvsq31,felec));
+             d                = _fjsp_sub_v2r8(r31,rswitch);
+             d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+             d2               = _fjsp_mul_v2r8(d,d);
+             sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+             dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+             /* Evaluate switch function */
+             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+             felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv31,_fjsp_mul_v2r8(velec,dsw)) );
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq31,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix3             = _fjsp_madd_v2r8(dx31,fscal,fix3);
+             fiy3             = _fjsp_madd_v2r8(dy31,fscal,fiy3);
+             fiz3             = _fjsp_madd_v2r8(dz31,fscal,fiz3);
+             
+             fjx1             = _fjsp_madd_v2r8(dx31,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy31,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz31,fscal,fjz1);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq32,rcutoff2))
+             {
+             r32              = _fjsp_mul_v2r8(rsq32,rinv32);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r32,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq32,_fjsp_sub_v2r8(rinv32,velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq32,rinv32),_fjsp_sub_v2r8(rinvsq32,felec));
+             d                = _fjsp_sub_v2r8(r32,rswitch);
+             d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+             d2               = _fjsp_mul_v2r8(d,d);
+             sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+             dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+             /* Evaluate switch function */
+             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+             felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv32,_fjsp_mul_v2r8(velec,dsw)) );
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq32,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix3             = _fjsp_madd_v2r8(dx32,fscal,fix3);
+             fiy3             = _fjsp_madd_v2r8(dy32,fscal,fiy3);
+             fiz3             = _fjsp_madd_v2r8(dz32,fscal,fiz3);
+             
+             fjx2             = _fjsp_madd_v2r8(dx32,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy32,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz32,fscal,fjz2);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq33,rcutoff2))
+             {
+             r33              = _fjsp_mul_v2r8(rsq33,rinv33);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r33,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq33,_fjsp_sub_v2r8(rinv33,velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq33,rinv33),_fjsp_sub_v2r8(rinvsq33,felec));
+             d                = _fjsp_sub_v2r8(r33,rswitch);
+             d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+             d2               = _fjsp_mul_v2r8(d,d);
+             sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+             dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+             /* Evaluate switch function */
+             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+             felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv33,_fjsp_mul_v2r8(velec,dsw)) );
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq33,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix3             = _fjsp_madd_v2r8(dx33,fscal,fix3);
+             fiy3             = _fjsp_madd_v2r8(dy33,fscal,fiy3);
+             fiz3             = _fjsp_madd_v2r8(dz33,fscal,fiz3);
+             
+             fjx3             = _fjsp_madd_v2r8(dx33,fscal,fjx3);
+             fjy3             = _fjsp_madd_v2r8(dy33,fscal,fjy3);
+             fjz3             = _fjsp_madd_v2r8(dz33,fscal,fjz3);
+             }
+             gmx_fjsp_decrement_3rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA+DIM,f+j_coord_offsetB+DIM,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+             /* Inner loop uses 585 flops */
+         }
+         if(jidx<j_index_end)
+         {
+             jnrA             = jjnr[jidx];
+             j_coord_offsetA  = DIM*jnrA;
+             /* load j atom coordinates */
+             gmx_fjsp_load_3rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA+DIM,
+                                               &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
+             /* Calculate displacement vector */
+             dx11             = _fjsp_sub_v2r8(ix1,jx1);
+             dy11             = _fjsp_sub_v2r8(iy1,jy1);
+             dz11             = _fjsp_sub_v2r8(iz1,jz1);
+             dx12             = _fjsp_sub_v2r8(ix1,jx2);
+             dy12             = _fjsp_sub_v2r8(iy1,jy2);
+             dz12             = _fjsp_sub_v2r8(iz1,jz2);
+             dx13             = _fjsp_sub_v2r8(ix1,jx3);
+             dy13             = _fjsp_sub_v2r8(iy1,jy3);
+             dz13             = _fjsp_sub_v2r8(iz1,jz3);
+             dx21             = _fjsp_sub_v2r8(ix2,jx1);
+             dy21             = _fjsp_sub_v2r8(iy2,jy1);
+             dz21             = _fjsp_sub_v2r8(iz2,jz1);
+             dx22             = _fjsp_sub_v2r8(ix2,jx2);
+             dy22             = _fjsp_sub_v2r8(iy2,jy2);
+             dz22             = _fjsp_sub_v2r8(iz2,jz2);
+             dx23             = _fjsp_sub_v2r8(ix2,jx3);
+             dy23             = _fjsp_sub_v2r8(iy2,jy3);
+             dz23             = _fjsp_sub_v2r8(iz2,jz3);
+             dx31             = _fjsp_sub_v2r8(ix3,jx1);
+             dy31             = _fjsp_sub_v2r8(iy3,jy1);
+             dz31             = _fjsp_sub_v2r8(iz3,jz1);
+             dx32             = _fjsp_sub_v2r8(ix3,jx2);
+             dy32             = _fjsp_sub_v2r8(iy3,jy2);
+             dz32             = _fjsp_sub_v2r8(iz3,jz2);
+             dx33             = _fjsp_sub_v2r8(ix3,jx3);
+             dy33             = _fjsp_sub_v2r8(iy3,jy3);
+             dz33             = _fjsp_sub_v2r8(iz3,jz3);
+             /* Calculate squared distance and things based on it */
+             rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+             rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+             rsq13            = gmx_fjsp_calc_rsq_v2r8(dx13,dy13,dz13);
+             rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+             rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+             rsq23            = gmx_fjsp_calc_rsq_v2r8(dx23,dy23,dz23);
+             rsq31            = gmx_fjsp_calc_rsq_v2r8(dx31,dy31,dz31);
+             rsq32            = gmx_fjsp_calc_rsq_v2r8(dx32,dy32,dz32);
+             rsq33            = gmx_fjsp_calc_rsq_v2r8(dx33,dy33,dz33);
+             rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+             rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+             rinv13           = gmx_fjsp_invsqrt_v2r8(rsq13);
+             rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+             rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+             rinv23           = gmx_fjsp_invsqrt_v2r8(rsq23);
+             rinv31           = gmx_fjsp_invsqrt_v2r8(rsq31);
+             rinv32           = gmx_fjsp_invsqrt_v2r8(rsq32);
+             rinv33           = gmx_fjsp_invsqrt_v2r8(rsq33);
+             rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+             rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+             rinvsq13         = _fjsp_mul_v2r8(rinv13,rinv13);
+             rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+             rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+             rinvsq23         = _fjsp_mul_v2r8(rinv23,rinv23);
+             rinvsq31         = _fjsp_mul_v2r8(rinv31,rinv31);
+             rinvsq32         = _fjsp_mul_v2r8(rinv32,rinv32);
+             rinvsq33         = _fjsp_mul_v2r8(rinv33,rinv33);
+             fjx1             = _fjsp_setzero_v2r8();
+             fjy1             = _fjsp_setzero_v2r8();
+             fjz1             = _fjsp_setzero_v2r8();
+             fjx2             = _fjsp_setzero_v2r8();
+             fjy2             = _fjsp_setzero_v2r8();
+             fjz2             = _fjsp_setzero_v2r8();
+             fjx3             = _fjsp_setzero_v2r8();
+             fjy3             = _fjsp_setzero_v2r8();
+             fjz3             = _fjsp_setzero_v2r8();
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq11,rcutoff2))
+             {
+             r11              = _fjsp_mul_v2r8(rsq11,rinv11);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r11,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq11,_fjsp_sub_v2r8(rinv11,velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq11,rinv11),_fjsp_sub_v2r8(rinvsq11,felec));
+             d                = _fjsp_sub_v2r8(r11,rswitch);
+             d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+             d2               = _fjsp_mul_v2r8(d,d);
+             sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+             dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+             /* Evaluate switch function */
+             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+             felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv11,_fjsp_mul_v2r8(velec,dsw)) );
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq11,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+             
+             fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq12,rcutoff2))
+             {
+             r12              = _fjsp_mul_v2r8(rsq12,rinv12);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r12,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq12,_fjsp_sub_v2r8(rinv12,velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq12,rinv12),_fjsp_sub_v2r8(rinvsq12,felec));
+             d                = _fjsp_sub_v2r8(r12,rswitch);
+             d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+             d2               = _fjsp_mul_v2r8(d,d);
+             sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+             dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+             /* Evaluate switch function */
+             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+             felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv12,_fjsp_mul_v2r8(velec,dsw)) );
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq12,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+             
+             fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq13,rcutoff2))
+             {
+             r13              = _fjsp_mul_v2r8(rsq13,rinv13);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r13,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq13,_fjsp_sub_v2r8(rinv13,velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq13,rinv13),_fjsp_sub_v2r8(rinvsq13,felec));
+             d                = _fjsp_sub_v2r8(r13,rswitch);
+             d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+             d2               = _fjsp_mul_v2r8(d,d);
+             sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+             dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+             /* Evaluate switch function */
+             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+             felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv13,_fjsp_mul_v2r8(velec,dsw)) );
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq13,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx13,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy13,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz13,fscal,fiz1);
+             
+             fjx3             = _fjsp_madd_v2r8(dx13,fscal,fjx3);
+             fjy3             = _fjsp_madd_v2r8(dy13,fscal,fjy3);
+             fjz3             = _fjsp_madd_v2r8(dz13,fscal,fjz3);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq21,rcutoff2))
+             {
+             r21              = _fjsp_mul_v2r8(rsq21,rinv21);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r21,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq21,_fjsp_sub_v2r8(rinv21,velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq21,rinv21),_fjsp_sub_v2r8(rinvsq21,felec));
+             d                = _fjsp_sub_v2r8(r21,rswitch);
+             d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+             d2               = _fjsp_mul_v2r8(d,d);
+             sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+             dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+             /* Evaluate switch function */
+             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+             felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv21,_fjsp_mul_v2r8(velec,dsw)) );
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq21,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+             
+             fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq22,rcutoff2))
+             {
+             r22              = _fjsp_mul_v2r8(rsq22,rinv22);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r22,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq22,_fjsp_sub_v2r8(rinv22,velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq22,rinv22),_fjsp_sub_v2r8(rinvsq22,felec));
+             d                = _fjsp_sub_v2r8(r22,rswitch);
+             d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+             d2               = _fjsp_mul_v2r8(d,d);
+             sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+             dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+             /* Evaluate switch function */
+             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+             felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv22,_fjsp_mul_v2r8(velec,dsw)) );
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq22,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+             
+             fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq23,rcutoff2))
+             {
+             r23              = _fjsp_mul_v2r8(rsq23,rinv23);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r23,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq23,_fjsp_sub_v2r8(rinv23,velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq23,rinv23),_fjsp_sub_v2r8(rinvsq23,felec));
+             d                = _fjsp_sub_v2r8(r23,rswitch);
+             d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+             d2               = _fjsp_mul_v2r8(d,d);
+             sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+             dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+             /* Evaluate switch function */
+             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+             felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv23,_fjsp_mul_v2r8(velec,dsw)) );
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq23,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx23,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy23,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz23,fscal,fiz2);
+             
+             fjx3             = _fjsp_madd_v2r8(dx23,fscal,fjx3);
+             fjy3             = _fjsp_madd_v2r8(dy23,fscal,fjy3);
+             fjz3             = _fjsp_madd_v2r8(dz23,fscal,fjz3);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq31,rcutoff2))
+             {
+             r31              = _fjsp_mul_v2r8(rsq31,rinv31);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r31,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq31,_fjsp_sub_v2r8(rinv31,velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq31,rinv31),_fjsp_sub_v2r8(rinvsq31,felec));
+             d                = _fjsp_sub_v2r8(r31,rswitch);
+             d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+             d2               = _fjsp_mul_v2r8(d,d);
+             sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+             dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+             /* Evaluate switch function */
+             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+             felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv31,_fjsp_mul_v2r8(velec,dsw)) );
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq31,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix3             = _fjsp_madd_v2r8(dx31,fscal,fix3);
+             fiy3             = _fjsp_madd_v2r8(dy31,fscal,fiy3);
+             fiz3             = _fjsp_madd_v2r8(dz31,fscal,fiz3);
+             
+             fjx1             = _fjsp_madd_v2r8(dx31,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy31,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz31,fscal,fjz1);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq32,rcutoff2))
+             {
+             r32              = _fjsp_mul_v2r8(rsq32,rinv32);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r32,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq32,_fjsp_sub_v2r8(rinv32,velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq32,rinv32),_fjsp_sub_v2r8(rinvsq32,felec));
+             d                = _fjsp_sub_v2r8(r32,rswitch);
+             d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+             d2               = _fjsp_mul_v2r8(d,d);
+             sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+             dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+             /* Evaluate switch function */
+             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+             felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv32,_fjsp_mul_v2r8(velec,dsw)) );
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq32,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix3             = _fjsp_madd_v2r8(dx32,fscal,fix3);
+             fiy3             = _fjsp_madd_v2r8(dy32,fscal,fiy3);
+             fiz3             = _fjsp_madd_v2r8(dz32,fscal,fiz3);
+             
+             fjx2             = _fjsp_madd_v2r8(dx32,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy32,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz32,fscal,fjz2);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq33,rcutoff2))
+             {
+             r33              = _fjsp_mul_v2r8(rsq33,rinv33);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r33,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq33,_fjsp_sub_v2r8(rinv33,velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq33,rinv33),_fjsp_sub_v2r8(rinvsq33,felec));
+             d                = _fjsp_sub_v2r8(r33,rswitch);
+             d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+             d2               = _fjsp_mul_v2r8(d,d);
+             sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+             dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+             /* Evaluate switch function */
+             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+             felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv33,_fjsp_mul_v2r8(velec,dsw)) );
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq33,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix3             = _fjsp_madd_v2r8(dx33,fscal,fix3);
+             fiy3             = _fjsp_madd_v2r8(dy33,fscal,fiy3);
+             fiz3             = _fjsp_madd_v2r8(dz33,fscal,fiz3);
+             
+             fjx3             = _fjsp_madd_v2r8(dx33,fscal,fjx3);
+             fjy3             = _fjsp_madd_v2r8(dy33,fscal,fjy3);
+             fjz3             = _fjsp_madd_v2r8(dz33,fscal,fjz3);
+             }
+             gmx_fjsp_decrement_3rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA+DIM,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+             /* Inner loop uses 585 flops */
+         }
+         /* End of innermost loop */
+         gmx_fjsp_update_iforce_3atom_swizzle_v2r8(fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                               f+i_coord_offset+DIM,fshift+i_shift_offset);
+         /* Increment number of inner iterations */
+         inneriter                  += j_index_end - j_index_start;
+         /* Outer loop uses 18 flops */
+     }
+     /* Increment number of outer iterations */
+     outeriter        += nri;
+     /* Update outer/inner flops */
+     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4W4_F,outeriter*18 + inneriter*585);
+ }
index 0000000000000000000000000000000000000000,3f30f96c89670843af7d6eb6aeb2d756a81b0cc6..3f30f96c89670843af7d6eb6aeb2d756a81b0cc6
mode 000000,100644..100644
--- /dev/null
@@@ -1,0 -1,740 +1,740 @@@
+ /*
+  * This file is part of the GROMACS molecular simulation package.
+  *
+  * Copyright (c) 2012, by the GROMACS development team, led by
+  * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+  * others, as listed in the AUTHORS file in the top-level source
+  * directory and at http://www.gromacs.org.
+  *
+  * GROMACS is free software; you can redistribute it and/or
+  * modify it under the terms of the GNU Lesser General Public License
+  * as published by the Free Software Foundation; either version 2.1
+  * of the License, or (at your option) any later version.
+  *
+  * GROMACS is distributed in the hope that it will be useful,
+  * but WITHOUT ANY WARRANTY; without even the implied warranty of
+  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  * Lesser General Public License for more details.
+  *
+  * You should have received a copy of the GNU Lesser General Public
+  * License along with GROMACS; if not, see
+  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+  *
+  * If you want to redistribute modifications to GROMACS, please
+  * consider that scientific software is very special. Version
+  * control is crucial - bugs must be traceable. We will be happy to
+  * consider code for inclusion in the official distribution, but
+  * derived work must not be called official GROMACS. Details are found
+  * in the README & COPYING files - if they are missing, get the
+  * official version at http://www.gromacs.org.
+  *
+  * To help us fund GROMACS development, we humbly ask that you cite
+  * the research papers on the package. Check out http://www.gromacs.org.
+  */
+ /*
+  * Note: this file was generated by the GROMACS sparc64_hpc_ace_double kernel generator.
+  */
+ #ifdef HAVE_CONFIG_H
+ #include <config.h>
+ #endif
+ #include <math.h>
+ #include "../nb_kernel.h"
+ #include "types/simple.h"
+ #include "vec.h"
+ #include "nrnb.h"
+ #include "kernelutil_sparc64_hpc_ace_double.h"
+ /*
+  * Gromacs nonbonded kernel:   nb_kernel_ElecEw_VdwCSTab_GeomP1P1_VF_sparc64_hpc_ace_double
+  * Electrostatics interaction: Ewald
+  * VdW interaction:            CubicSplineTable
+  * Geometry:                   Particle-Particle
+  * Calculate force/pot:        PotentialAndForce
+  *
+  * Walks the neighbor list: for each i particle the j particles are processed
+  * two at a time (lanes A and B of a _fjsp_v2r8), with a separate code path
+  * for a single leftover j particle. For every pair the kernel evaluates
+  * table-based Ewald electrostatics and cubic-spline-table dispersion and
+  * repulsion, accumulates the two potentials, and uses the combined scalar
+  * force together with the displacement vector to update i and j forces.
+  *
+  * Parameters (as used by the code below):
+  *   nlist        neighbor list; nri, iinr, jindex, jjnr, shift and gid are read
+  *   xx           coordinates, addressed as a flat real array through xx[0]
+  *   ff           forces, addressed as a flat real array through ff[0]
+  *   fr           provides shift_vec, fshift, epsfac, ntype, nbfp and the Ewald
+  *                data in fr->ic (sh_ewald, tabq_coul_FDV0, tabq_scale)
+  *   mdatoms      provides the per-atom arrays chargeA and typeA
+  *   kernel_data  provides table_vdw (data, scale); energygrp_elec and
+  *                energygrp_vdw are handed to gmx_fjsp_update_1pot_v2r8()
+  *                together with the per-i potential sums
+  *   nrnb         flop counter, updated once at the end through inc_nrnb()
+  */
+ void
+ nb_kernel_ElecEw_VdwCSTab_GeomP1P1_VF_sparc64_hpc_ace_double
+                     (t_nblist * gmx_restrict                nlist,
+                      rvec * gmx_restrict                    xx,
+                      rvec * gmx_restrict                    ff,
+                      t_forcerec * gmx_restrict              fr,
+                      t_mdatoms * gmx_restrict               mdatoms,
+                      nb_kernel_data_t * gmx_restrict        kernel_data,
+                      t_nrnb * gmx_restrict                  nrnb)
+ {
+     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+      * just 0 for non-waters.
+      * Suffixes A,B refer to j loop unrolling done with double precision SIMD, i.e. the two different
+      * jnr indices whose data is put in the two positions of the SIMD register.
+      * NOTE(review): several locals declared below (e.g. rcutoff_scalar, tx, ty, tz, rcutoff,
+      * krf, gbconv, dummy_mask) are never referenced in this function.
+      */
+     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+     int              jnrA,jnrB;
+     int              j_coord_offsetA,j_coord_offsetB;
+     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+     real             rcutoff_scalar;
+     real             *shiftvec,*fshift,*x,*f;
+     _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+     int              vdwioffset0;
+     _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+     int              vdwjidx0A,vdwjidx0B;
+     _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+     _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+     _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+     real             *charge;
+     int              nvdwtype;
+     _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+     int              *vdwtype;
+     real             *vdwparam;
+     _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+     _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+     _fjsp_v2r8       rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF,twovfeps;
+     real             *vftab;
+     _fjsp_v2r8       ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+     real             *ewtab;
+     _fjsp_v2r8       itab_tmp;
+     _fjsp_v2r8       dummy_mask,cutoff_mask;
+     _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+     _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+     union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+     x                = xx[0];
+     f                = ff[0];
+     nri              = nlist->nri;
+     iinr             = nlist->iinr;
+     jindex           = nlist->jindex;
+     jjnr             = nlist->jjnr;
+     shiftidx         = nlist->shift;
+     gid              = nlist->gid;
+     shiftvec         = fr->shift_vec[0];
+     fshift           = fr->fshift[0];
+     facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+     charge           = mdatoms->chargeA;
+     nvdwtype         = fr->ntype;
+     vdwparam         = fr->nbfp;
+     vdwtype          = mdatoms->typeA;
+     vftab            = kernel_data->table_vdw->data;
+     vftabscale       = gmx_fjsp_set1_v2r8(kernel_data->table_vdw->scale);
+     sh_ewald         = gmx_fjsp_set1_v2r8(fr->ic->sh_ewald);
+     ewtab            = fr->ic->tabq_coul_FDV0;
+     ewtabscale       = gmx_fjsp_set1_v2r8(fr->ic->tabq_scale);
+     ewtabhalfspace   = gmx_fjsp_set1_v2r8(0.5/fr->ic->tabq_scale);
+     /* Pre-initialize the j indices and offsets so compilers do not warn about
+      * possibly uninitialized use; the iteration counters start at zero.
+      */
+     jnrA = jnrB = 0;
+     j_coord_offsetA = 0;
+     j_coord_offsetB = 0;
+     outeriter        = 0;
+     inneriter        = 0;
+     /* Start outer loop over neighborlists */
+     for(iidx=0; iidx<nri; iidx++)
+     {
+         /* Load shift vector for this list */
+         i_shift_offset   = DIM*shiftidx[iidx];
+         /* Load limits for loop over neighbors */
+         j_index_start    = jindex[iidx];
+         j_index_end      = jindex[iidx+1];
+         /* Get outer coordinate index */
+         inr              = iinr[iidx];
+         i_coord_offset   = DIM*inr;
+         /* Load i particle coords and add shift vector */
+         gmx_fjsp_load_shift_and_1rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
+         fix0             = _fjsp_setzero_v2r8();
+         fiy0             = _fjsp_setzero_v2r8();
+         fiz0             = _fjsp_setzero_v2r8();
+         /* Load parameters for i particles: the charge is pre-multiplied by facel
+          * (fr->epsfac), and vdwioffset0 selects the row of i's type in vdwparam.
+          */
+         iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_load1_v2r8(charge+inr+0));
+         vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+         /* Reset potential sums */
+         velecsum         = _fjsp_setzero_v2r8();
+         vvdwsum          = _fjsp_setzero_v2r8();
+         /* Start inner kernel loop: two j particles (A and B) per iteration.
+          * The bound j_index_end-1 leaves an odd last entry for the block after the loop.
+          */
+         for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+         {
+             /* Get j neighbor index, and coordinate index */
+             jnrA             = jjnr[jidx];
+             jnrB             = jjnr[jidx+1];
+             j_coord_offsetA  = DIM*jnrA;
+             j_coord_offsetB  = DIM*jnrB;
+             /* load j atom coordinates */
+             gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                               &jx0,&jy0,&jz0);
+             /* Calculate displacement vector */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             /* Calculate squared distance and things based on it */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+             rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+             /* Load parameters for j particles */
+             jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+             vdwjidx0A        = 2*vdwtype[jnrA+0];
+             vdwjidx0B        = 2*vdwtype[jnrB+0];
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+             /* Compute parameters for interactions between i and j atoms */
+             qq00             = _fjsp_mul_v2r8(iq0,jq0);
+             gmx_fjsp_load_2pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,
+                                          vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
+             /* Calculate table index by multiplying r with table scale and truncate to integer.
+              * vfeps is the remainder of rt after subtracting the converted index. The stored
+              * indices are multiplied by 8 because the loads below address 8 reals per table
+              * point (offsets 0 and 2 for dispersion, 4 and 6 for repulsion, one vector each).
+              */
+             rt               = _fjsp_mul_v2r8(r00,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 8;
+             vfconv.i[1]     *= 8;
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer.
+              * The table is addressed with a stride of 4 reals per point: one vector load at
+              * offset 0 per lane (named F and D after the transpose) and a low-half load at
+              * offset 2 (V). NOTE(review): velec presumably evaluates
+              * V - (0.5/tabq_scale)*eweps*(F+felec); confirm the _fjsp_nmsub_v2r8 operand order.
+              */
+             ewrt             = _fjsp_mul_v2r8(r00,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq00,_fjsp_sub_v2r8(rinv00,velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,rinv00),_fjsp_sub_v2r8(rinvsq00,felec));
+             /* CUBIC SPLINE TABLE DISPERSION
+              * Loads per-lane pairs at offsets 0 and 2 and transposes them into Y,F and G,H;
+              * the potential is scaled by c6_00 and the force term FF likewise.
+              */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 2 );
+             H                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 2 );
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+             VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+             vvdw6            = _fjsp_mul_v2r8(c6_00,VV);
+             FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+             fvdw6            = _fjsp_mul_v2r8(c6_00,FF);
+             /* CUBIC SPLINE TABLE REPULSION
+              * Same evaluation with the table entries at offsets 4 and 6, scaled by c12_00.
+              */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 4 );
+             F                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 4 );
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 6 );
+             H                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 6 );
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+             VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+             vvdw12           = _fjsp_mul_v2r8(c12_00,VV);
+             FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+             fvdw12           = _fjsp_mul_v2r8(c12_00,FF);
+             vvdw             = _fjsp_add_v2r8(vvdw12,vvdw6);
+             fvdw             = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_add_v2r8(fvdw6,fvdw12),_fjsp_mul_v2r8(vftabscale,rinv00)));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+             fscal            = _fjsp_add_v2r8(felec,fvdw);
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             
+             gmx_fjsp_decrement_fma_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fscal,dx00,dy00,dz00);
+             /* Inner loop uses 78 flops */
+         }
+         if(jidx<j_index_end)
+         {
+             jnrA             = jjnr[jidx];
+             j_coord_offsetA  = DIM*jnrA;
+             /* Remainder path: one leftover j particle, so only the A slot is loaded.
+              * load j atom coordinates
+              */
+             gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                               &jx0,&jy0,&jz0);
+             /* Calculate displacement vector */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             /* Calculate squared distance and things based on it */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+             rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+             /* Load parameters for j particles */
+             jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
+             vdwjidx0A        = 2*vdwtype[jnrA+0];
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+             /* Compute parameters for interactions between i and j atoms */
+             qq00             = _fjsp_mul_v2r8(iq0,jq0);
+             gmx_fjsp_load_1pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
+             /* Calculate table index by multiplying r with table scale and truncate to integer.
+              * Both stored indices are scaled by 8, but only vfconv.i[0] is used for loads here.
+              */
+             rt               = _fjsp_mul_v2r8(r00,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 8;
+             vfconv.i[1]     *= 8;
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer.
+              * Only ewconv.i[0] addresses the table; the second transpose operand is a zero vector.
+              */
+             ewrt             = _fjsp_mul_v2r8(r00,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq00,_fjsp_sub_v2r8(rinv00,velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,rinv00),_fjsp_sub_v2r8(rinvsq00,felec));
+             /* CUBIC SPLINE TABLE DISPERSION */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 2 );
+             H                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+             VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+             vvdw6            = _fjsp_mul_v2r8(c6_00,VV);
+             FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+             fvdw6            = _fjsp_mul_v2r8(c6_00,FF);
+             /* CUBIC SPLINE TABLE REPULSION */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 4 );
+             F                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 6 );
+             H                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+             VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+             vvdw12           = _fjsp_mul_v2r8(c12_00,VV);
+             FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+             fvdw12           = _fjsp_mul_v2r8(c12_00,FF);
+             vvdw             = _fjsp_add_v2r8(vvdw12,vvdw6);
+             fvdw             = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_add_v2r8(fvdw6,fvdw12),_fjsp_mul_v2r8(vftabscale,rinv00)));
+             /* Update potential sum for this i atom from the interaction with this j atom.
+              * NOTE(review): the unpacklo with a zero vector presumably keeps only the A-slot
+              * value so the unused second slot adds nothing to the sums and forces; confirm
+              * against the _fjsp_unpacklo_v2r8 definition.
+              */
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             vvdw             = _fjsp_unpacklo_v2r8(vvdw,_fjsp_setzero_v2r8());
+             vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+             fscal            = _fjsp_add_v2r8(felec,fvdw);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             
+             gmx_fjsp_decrement_fma_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fscal,dx00,dy00,dz00);
+             /* Inner loop uses 78 flops */
+         }
+         /* End of innermost loop: hand the accumulated i force to the helper together
+          * with the i particle's slot in f and the shift-force slot in fshift.
+          */
+         gmx_fjsp_update_iforce_1atom_swizzle_v2r8(fix0,fiy0,fiz0,
+                                               f+i_coord_offset,fshift+i_shift_offset);
+         ggid                        = gid[iidx];
+         /* Update potential energies, indexed by the energy group id of this list entry */
+         gmx_fjsp_update_1pot_v2r8(velecsum,kernel_data->energygrp_elec+ggid);
+         gmx_fjsp_update_1pot_v2r8(vvdwsum,kernel_data->energygrp_vdw+ggid);
+         /* Increment number of inner iterations */
+         inneriter                  += j_index_end - j_index_start;
+         /* Outer loop uses 9 flops */
+     }
+     /* Increment number of outer iterations */
+     outeriter        += nri;
+     /* Update outer/inner flops */
+     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_VF,outeriter*9 + inneriter*78);
+ }
+ /*
+  * Gromacs nonbonded kernel:   nb_kernel_ElecEw_VdwCSTab_GeomP1P1_F_sparc64_hpc_ace_double
+  * Electrostatics interaction: Ewald
+  * VdW interaction:            CubicSplineTable
+  * Geometry:                   Particle-Particle
+  * Calculate force/pot:        Force
+  *
+  * Force-only kernel: for each i particle of the neighbor list the j particles
+  * are processed two at a time (lanes A and B of a _fjsp_v2r8), with a
+  * separate code path for a single leftover j particle. Per pair it evaluates
+  * the table-based Ewald force term and the cubic-spline-table dispersion and
+  * repulsion force terms; no potentials are accumulated and the energy-group
+  * arrays of kernel_data are not touched.
+  *
+  * Parameters (as used by the code below):
+  *   nlist        neighbor list; nri, iinr, jindex, jjnr, shift and gid are read
+  *   xx           coordinates, addressed as a flat real array through xx[0]
+  *   ff           forces, addressed as a flat real array through ff[0]
+  *   fr           provides shift_vec, fshift, epsfac, ntype, nbfp and the Ewald
+  *                data in fr->ic (sh_ewald, tabq_coul_F, tabq_scale)
+  *   mdatoms      provides the per-atom arrays chargeA and typeA
+  *   kernel_data  provides table_vdw (data, scale)
+  *   nrnb         flop counter, updated once at the end through inc_nrnb()
+  */
+ void
+ nb_kernel_ElecEw_VdwCSTab_GeomP1P1_F_sparc64_hpc_ace_double
+                     (t_nblist * gmx_restrict                nlist,
+                      rvec * gmx_restrict                    xx,
+                      rvec * gmx_restrict                    ff,
+                      t_forcerec * gmx_restrict              fr,
+                      t_mdatoms * gmx_restrict               mdatoms,
+                      nb_kernel_data_t * gmx_restrict        kernel_data,
+                      t_nrnb * gmx_restrict                  nrnb)
+ {
+     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+      * just 0 for non-waters.
+      * Suffixes A,B refer to j loop unrolling done with double precision SIMD, i.e. the two different
+      * jnr indices whose data is put in the two positions of the SIMD register.
+      * NOTE(review): several locals declared below (e.g. ggid, rcutoff_scalar, tx, ty, tz,
+      * velecsum, vvdwsum, gbconv, dummy_mask) are never referenced in this function.
+      */
+     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+     int              jnrA,jnrB;
+     int              j_coord_offsetA,j_coord_offsetB;
+     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+     real             rcutoff_scalar;
+     real             *shiftvec,*fshift,*x,*f;
+     _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+     int              vdwioffset0;
+     _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+     int              vdwjidx0A,vdwjidx0B;
+     _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+     _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+     _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+     real             *charge;
+     int              nvdwtype;
+     _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+     int              *vdwtype;
+     real             *vdwparam;
+     _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+     _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+     _fjsp_v2r8       rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF,twovfeps;
+     real             *vftab;
+     _fjsp_v2r8       ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+     real             *ewtab;
+     _fjsp_v2r8       itab_tmp;
+     _fjsp_v2r8       dummy_mask,cutoff_mask;
+     _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+     _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+     union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+     x                = xx[0];
+     f                = ff[0];
+     nri              = nlist->nri;
+     iinr             = nlist->iinr;
+     jindex           = nlist->jindex;
+     jjnr             = nlist->jjnr;
+     shiftidx         = nlist->shift;
+     gid              = nlist->gid;
+     shiftvec         = fr->shift_vec[0];
+     fshift           = fr->fshift[0];
+     facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+     charge           = mdatoms->chargeA;
+     nvdwtype         = fr->ntype;
+     vdwparam         = fr->nbfp;
+     vdwtype          = mdatoms->typeA;
+     vftab            = kernel_data->table_vdw->data;
+     vftabscale       = gmx_fjsp_set1_v2r8(kernel_data->table_vdw->scale);
+     sh_ewald         = gmx_fjsp_set1_v2r8(fr->ic->sh_ewald);
+     ewtab            = fr->ic->tabq_coul_F;
+     ewtabscale       = gmx_fjsp_set1_v2r8(fr->ic->tabq_scale);
+     ewtabhalfspace   = gmx_fjsp_set1_v2r8(0.5/fr->ic->tabq_scale);
+     /* Pre-initialize the j indices and offsets so compilers do not warn about
+      * possibly uninitialized use; the iteration counters start at zero.
+      */
+     jnrA = jnrB = 0;
+     j_coord_offsetA = 0;
+     j_coord_offsetB = 0;
+     outeriter        = 0;
+     inneriter        = 0;
+     /* Start outer loop over neighborlists */
+     for(iidx=0; iidx<nri; iidx++)
+     {
+         /* Load shift vector for this list */
+         i_shift_offset   = DIM*shiftidx[iidx];
+         /* Load limits for loop over neighbors */
+         j_index_start    = jindex[iidx];
+         j_index_end      = jindex[iidx+1];
+         /* Get outer coordinate index */
+         inr              = iinr[iidx];
+         i_coord_offset   = DIM*inr;
+         /* Load i particle coords and add shift vector */
+         gmx_fjsp_load_shift_and_1rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
+         fix0             = _fjsp_setzero_v2r8();
+         fiy0             = _fjsp_setzero_v2r8();
+         fiz0             = _fjsp_setzero_v2r8();
+         /* Load parameters for i particles: the charge is pre-multiplied by facel
+          * (fr->epsfac), and vdwioffset0 selects the row of i's type in vdwparam.
+          */
+         iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_load1_v2r8(charge+inr+0));
+         vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+         /* Start inner kernel loop: two j particles (A and B) per iteration.
+          * The bound j_index_end-1 leaves an odd last entry for the block after the loop.
+          */
+         for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+         {
+             /* Get j neighbor index, and coordinate index */
+             jnrA             = jjnr[jidx];
+             jnrB             = jjnr[jidx+1];
+             j_coord_offsetA  = DIM*jnrA;
+             j_coord_offsetB  = DIM*jnrB;
+             /* load j atom coordinates */
+             gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                               &jx0,&jy0,&jz0);
+             /* Calculate displacement vector */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             /* Calculate squared distance and things based on it */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+             rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+             /* Load parameters for j particles */
+             jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+             vdwjidx0A        = 2*vdwtype[jnrA+0];
+             vdwjidx0B        = 2*vdwtype[jnrB+0];
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+             /* Compute parameters for interactions between i and j atoms */
+             qq00             = _fjsp_mul_v2r8(iq0,jq0);
+             gmx_fjsp_load_2pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,
+                                          vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
+             /* Calculate table index by multiplying r with table scale and truncate to integer.
+              * vfeps is the remainder of rt after subtracting the converted index. The stored
+              * indices are multiplied by 8 because the loads below address 8 reals per table
+              * point (offsets 0 and 2 for dispersion, 4 and 6 for repulsion, one vector each).
+              */
+             rt               = _fjsp_mul_v2r8(r00,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 8;
+             vfconv.i[1]     *= 8;
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer.
+              * The force-only table is addressed directly by the index (no stride factor).
+              * NOTE(review): the pair load presumably yields the entry at the index (ewtabF)
+              * and the following entry (ewtabFn), which are then blended with weight eweps;
+              * confirm against gmx_fjsp_load_2pair_swizzle_v2r8 and _fjsp_nmsub_v2r8.
+              */
+             ewrt             = _fjsp_mul_v2r8(r00,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                          &ewtabF,&ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,rinv00),_fjsp_sub_v2r8(rinvsq00,felec));
+             /* CUBIC SPLINE TABLE DISPERSION
+              * Loads per-lane pairs at offsets 0 and 2 and transposes them into Y,F and G,H;
+              * only the force term FF is evaluated, scaled by c6_00.
+              */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 2 );
+             H                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 2 );
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+             FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+             fvdw6            = _fjsp_mul_v2r8(c6_00,FF);
+             /* CUBIC SPLINE TABLE REPULSION
+              * Same evaluation with the table entries at offsets 4 and 6, scaled by c12_00.
+              */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 4 );
+             F                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 4 );
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 6 );
+             H                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 6 );
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+             FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+             fvdw12           = _fjsp_mul_v2r8(c12_00,FF);
+             fvdw             = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_add_v2r8(fvdw6,fvdw12),_fjsp_mul_v2r8(vftabscale,rinv00)));
+             fscal            = _fjsp_add_v2r8(felec,fvdw);
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             
+             gmx_fjsp_decrement_fma_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fscal,dx00,dy00,dz00);
+             /* Inner loop uses 65 flops */
+         }
+         if(jidx<j_index_end)
+         {
+             jnrA             = jjnr[jidx];
+             j_coord_offsetA  = DIM*jnrA;
+             /* Remainder path: one leftover j particle, so only the A slot is loaded.
+              * load j atom coordinates
+              */
+             gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                               &jx0,&jy0,&jz0);
+             /* Calculate displacement vector */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             /* Calculate squared distance and things based on it */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+             rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+             /* Load parameters for j particles */
+             jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
+             vdwjidx0A        = 2*vdwtype[jnrA+0];
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+             /* Compute parameters for interactions between i and j atoms */
+             qq00             = _fjsp_mul_v2r8(iq0,jq0);
+             gmx_fjsp_load_1pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
+             /* Calculate table index by multiplying r with table scale and truncate to integer.
+              * Both stored indices are scaled by 8, but only vfconv.i[0] is used for loads here.
+              */
+             rt               = _fjsp_mul_v2r8(r00,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 8;
+             vfconv.i[1]     *= 8;
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer.
+              * Only ewconv.i[0] addresses the table in this single-particle path.
+              */
+             ewrt             = _fjsp_mul_v2r8(r00,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,rinv00),_fjsp_sub_v2r8(rinvsq00,felec));
+             /* CUBIC SPLINE TABLE DISPERSION */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 2 );
+             H                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+             FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+             fvdw6            = _fjsp_mul_v2r8(c6_00,FF);
+             /* CUBIC SPLINE TABLE REPULSION */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 4 );
+             F                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 6 );
+             H                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+             FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+             fvdw12           = _fjsp_mul_v2r8(c12_00,FF);
+             fvdw             = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_add_v2r8(fvdw6,fvdw12),_fjsp_mul_v2r8(vftabscale,rinv00)));
+             fscal            = _fjsp_add_v2r8(felec,fvdw);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force.
+              * NOTE(review): the unpacklo with a zero vector above presumably keeps only the
+              * A-slot value of fscal so the unused second slot contributes no force; confirm
+              * against the _fjsp_unpacklo_v2r8 definition.
+              */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             
+             gmx_fjsp_decrement_fma_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fscal,dx00,dy00,dz00);
+             /* Inner loop uses 65 flops */
+         }
+         /* End of innermost loop: hand the accumulated i force to the helper together
+          * with the i particle's slot in f and the shift-force slot in fshift.
+          */
+         gmx_fjsp_update_iforce_1atom_swizzle_v2r8(fix0,fiy0,fiz0,
+                                               f+i_coord_offset,fshift+i_shift_offset);
+         /* Increment number of inner iterations */
+         inneriter                  += j_index_end - j_index_start;
+         /* Outer loop uses 7 flops */
+     }
+     /* Increment number of outer iterations */
+     outeriter        += nri;
+     /* Update outer/inner flops */
+     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_F,outeriter*7 + inneriter*65);
+ }
index 0000000000000000000000000000000000000000,2ebbb0dcdfea982c67beb48bbc1c10a6301b4903..2ebbb0dcdfea982c67beb48bbc1c10a6301b4903
mode 000000,100644..100644
--- /dev/null
@@@ -1,0 -1,1160 +1,1160 @@@
+ /*
+  * This file is part of the GROMACS molecular simulation package.
+  *
+  * Copyright (c) 2012, by the GROMACS development team, led by
+  * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+  * others, as listed in the AUTHORS file in the top-level source
+  * directory and at http://www.gromacs.org.
+  *
+  * GROMACS is free software; you can redistribute it and/or
+  * modify it under the terms of the GNU Lesser General Public License
+  * as published by the Free Software Foundation; either version 2.1
+  * of the License, or (at your option) any later version.
+  *
+  * GROMACS is distributed in the hope that it will be useful,
+  * but WITHOUT ANY WARRANTY; without even the implied warranty of
+  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  * Lesser General Public License for more details.
+  *
+  * You should have received a copy of the GNU Lesser General Public
+  * License along with GROMACS; if not, see
+  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+  *
+  * If you want to redistribute modifications to GROMACS, please
+  * consider that scientific software is very special. Version
+  * control is crucial - bugs must be traceable. We will be happy to
+  * consider code for inclusion in the official distribution, but
+  * derived work must not be called official GROMACS. Details are found
+  * in the README & COPYING files - if they are missing, get the
+  * official version at http://www.gromacs.org.
+  *
+  * To help us fund GROMACS development, we humbly ask that you cite
+  * the research papers on the package. Check out http://www.gromacs.org.
+  */
+ /*
+  * Note: this file was generated by the GROMACS sparc64_hpc_ace_double kernel generator.
+  */
+ #ifdef HAVE_CONFIG_H
+ #include <config.h>
+ #endif
+ #include <math.h>
+ #include "../nb_kernel.h"
+ #include "types/simple.h"
+ #include "vec.h"
+ #include "nrnb.h"
+ #include "kernelutil_sparc64_hpc_ace_double.h"
+ /*
+  * Gromacs nonbonded kernel:   nb_kernel_ElecEw_VdwCSTab_GeomW3P1_VF_sparc64_hpc_ace_double
+  * Electrostatics interaction: Ewald
+  * VdW interaction:            CubicSplineTable
+  * Geometry:                   Water3-Particle
+  * Calculate force/pot:        PotentialAndForce
+  *
+  * Overview of what the code below does:
+  *  - For each of the nlist->nri list entries, the coordinates of three i atoms
+  *    (suffixes 0,1,2) are loaded with the entry's shift vector applied, and the
+  *    i atoms interact with the j particles jjnr[jindex[iidx]] .. jjnr[jindex[iidx+1]-1].
+  *  - j particles are handled two per iteration (lanes A and B); when the number of
+  *    j particles is odd, the last one is handled by a separate single-j branch.
+  *  - i atom 0 gets tabulated Ewald electrostatics plus cubic-spline-table
+  *    dispersion and repulsion; i atoms 1 and 2 get Ewald electrostatics only.
+  *  - Per list entry, the summed electrostatic and VdW potentials are passed to
+  *    gmx_fjsp_update_1pot_v2r8 for kernel_data->energygrp_elec / energygrp_vdw at
+  *    index gid[iidx]; i forces and shift forces are written through
+  *    gmx_fjsp_update_iforce_3atom_swizzle_v2r8, j forces through the
+  *    gmx_fjsp_decrement_* helpers.
+  *  - Finally the flop counter eNR_NBKERNEL_ELEC_VDW_W3_VF in nrnb is incremented.
+  *
+  * Inputs as used here:
+  *  nlist       - nri, iinr, jindex, jjnr, shift and gid are read.
+  *  xx, ff      - coordinate and force arrays, accessed as flat real arrays via xx[0] / ff[0].
+  *  fr          - shift_vec, fshift, epsfac, ntype, nbfp and the ic-> Ewald table data are read.
+  *  mdatoms     - chargeA and typeA are read.
+  *  kernel_data - table_vdw (data, scale) is read; energygrp_elec / energygrp_vdw are updated.
+  *  nrnb        - passed to inc_nrnb for flop accounting.
+  *
+  * NOTE(review): the i-atom charges and the VdW type offset are taken from iinr[0]
+  * once, before the outer loop; presumably every i molecule in the list shares the
+  * same parameters and the list is never empty - confirm against the caller.
+  */
+ void
+ nb_kernel_ElecEw_VdwCSTab_GeomW3P1_VF_sparc64_hpc_ace_double
+                     (t_nblist * gmx_restrict                nlist,
+                      rvec * gmx_restrict                    xx,
+                      rvec * gmx_restrict                    ff,
+                      t_forcerec * gmx_restrict              fr,
+                      t_mdatoms * gmx_restrict               mdatoms,
+                      nb_kernel_data_t * gmx_restrict        kernel_data,
+                      t_nrnb * gmx_restrict                  nrnb)
+ {
+     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+      * just 0 for non-waters.
+      * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+      * jnr indices corresponding to data put in the two positions in the SIMD register.
+      * Several of the locals declared below are not referenced by this particular kernel.
+      */
+     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+     int              jnrA,jnrB;
+     int              j_coord_offsetA,j_coord_offsetB;
+     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+     real             rcutoff_scalar;
+     real             *shiftvec,*fshift,*x,*f;
+     _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+     int              vdwioffset0;
+     _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+     int              vdwioffset1;
+     _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+     int              vdwioffset2;
+     _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+     int              vdwjidx0A,vdwjidx0B;
+     _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+     _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+     _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+     _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+     _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+     real             *charge;
+     int              nvdwtype;
+     _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+     int              *vdwtype;
+     real             *vdwparam;
+     _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+     _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+     _fjsp_v2r8       rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF,twovfeps;
+     real             *vftab;
+     _fjsp_v2r8       ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+     real             *ewtab;
+     _fjsp_v2r8       itab_tmp;
+     _fjsp_v2r8       dummy_mask,cutoff_mask;
+     _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+     _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+     union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv; /* SIMD register stored here, then read back as two scalar table indices */
+     x                = xx[0];
+     f                = ff[0];
+     nri              = nlist->nri;
+     iinr             = nlist->iinr;
+     jindex           = nlist->jindex;
+     jjnr             = nlist->jjnr;
+     shiftidx         = nlist->shift;
+     gid              = nlist->gid;
+     shiftvec         = fr->shift_vec[0];
+     fshift           = fr->fshift[0];
+     facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+     charge           = mdatoms->chargeA;
+     nvdwtype         = fr->ntype;
+     vdwparam         = fr->nbfp;
+     vdwtype          = mdatoms->typeA;
+     vftab            = kernel_data->table_vdw->data;
+     vftabscale       = gmx_fjsp_set1_v2r8(kernel_data->table_vdw->scale);
+     sh_ewald         = gmx_fjsp_set1_v2r8(fr->ic->sh_ewald); /* set here but not referenced again in this kernel */
+     ewtab            = fr->ic->tabq_coul_FDV0;
+     ewtabscale       = gmx_fjsp_set1_v2r8(fr->ic->tabq_scale);
+     ewtabhalfspace   = gmx_fjsp_set1_v2r8(0.5/fr->ic->tabq_scale);
+     /* Setup water-specific parameters: read once from the first i entry and reused for all entries */
+     inr              = nlist->iinr[0];
+     iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+0]));
+     iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+     iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+     vdwioffset0      = 2*nvdwtype*vdwtype[inr+0]; /* offset into vdwparam for i atom 0; two reals per type pair, loaded below as c6/c12 */
+     /* Give the j indices/offsets defined values up front to avoid compiler warnings */
+     jnrA = jnrB = 0;
+     j_coord_offsetA = 0;
+     j_coord_offsetB = 0;
+     outeriter        = 0;
+     inneriter        = 0;
+     /* Start outer loop over neighborlists */
+     for(iidx=0; iidx<nri; iidx++)
+     {
+         /* Load shift vector for this list */
+         i_shift_offset   = DIM*shiftidx[iidx];
+         /* Load limits for loop over neighbors */
+         j_index_start    = jindex[iidx];
+         j_index_end      = jindex[iidx+1];
+         /* Get outer coordinate index */
+         inr              = iinr[iidx];
+         i_coord_offset   = DIM*inr;
+         /* Load i particle coords and add shift vector */
+         gmx_fjsp_load_shift_and_3rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                  &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+         fix0             = _fjsp_setzero_v2r8();
+         fiy0             = _fjsp_setzero_v2r8();
+         fiz0             = _fjsp_setzero_v2r8();
+         fix1             = _fjsp_setzero_v2r8();
+         fiy1             = _fjsp_setzero_v2r8();
+         fiz1             = _fjsp_setzero_v2r8();
+         fix2             = _fjsp_setzero_v2r8();
+         fiy2             = _fjsp_setzero_v2r8();
+         fiz2             = _fjsp_setzero_v2r8();
+         /* Reset potential sums */
+         velecsum         = _fjsp_setzero_v2r8();
+         vvdwsum          = _fjsp_setzero_v2r8();
+         /* Start inner kernel loop: two j particles (A and B) per iteration */
+         for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+         {
+             /* Get j neighbor index, and coordinate index */
+             jnrA             = jjnr[jidx];
+             jnrB             = jjnr[jidx+1];
+             j_coord_offsetA  = DIM*jnrA;
+             j_coord_offsetB  = DIM*jnrB;
+             /* load j atom coordinates */
+             gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                               &jx0,&jy0,&jz0);
+             /* Calculate displacement vector */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             dx10             = _fjsp_sub_v2r8(ix1,jx0);
+             dy10             = _fjsp_sub_v2r8(iy1,jy0);
+             dz10             = _fjsp_sub_v2r8(iz1,jz0);
+             dx20             = _fjsp_sub_v2r8(ix2,jx0);
+             dy20             = _fjsp_sub_v2r8(iy2,jy0);
+             dz20             = _fjsp_sub_v2r8(iz2,jz0);
+             /* Calculate squared distance and things based on it */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+             rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+             rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+             rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+             rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+             rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+             rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+             rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+             /* Load parameters for j particles */
+             jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+             vdwjidx0A        = 2*vdwtype[jnrA+0];
+             vdwjidx0B        = 2*vdwtype[jnrB+0];
+             fjx0             = _fjsp_setzero_v2r8();
+             fjy0             = _fjsp_setzero_v2r8();
+             fjz0             = _fjsp_setzero_v2r8();
+             /**************************
+              * CALCULATE INTERACTIONS *
+              * i atom 0 - j: Ewald electrostatics + table VdW
+              **************************/
+             r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+             /* Compute parameters for interactions between i and j atoms */
+             qq00             = _fjsp_mul_v2r8(iq0,jq0);
+             gmx_fjsp_load_2pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,
+                                          vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
+             /* Calculate table index by multiplying r with table scale and truncate to integer */
+             rt               = _fjsp_mul_v2r8(r00,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 8; /* 8 reals per VdW table point: the loads below use offsets +0,+2 (dispersion) and +4,+6 (repulsion) */
+             vfconv.i[1]     *= 8;
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r00,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] ); /* 4 reals per Ewald table point; offsets +0 and +2 are read */
+             ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq00,_fjsp_sub_v2r8(rinv00,velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,rinv00),_fjsp_sub_v2r8(rinvsq00,felec));
+             /* CUBIC SPLINE TABLE DISPERSION */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 2 );
+             H                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 2 );
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+             VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+             vvdw6            = _fjsp_mul_v2r8(c6_00,VV);
+             FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+             fvdw6            = _fjsp_mul_v2r8(c6_00,FF);
+             /* CUBIC SPLINE TABLE REPULSION */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 4 );
+             F                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 4 );
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 6 );
+             H                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 6 );
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+             VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+             vvdw12           = _fjsp_mul_v2r8(c12_00,VV);
+             FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+             fvdw12           = _fjsp_mul_v2r8(c12_00,FF);
+             vvdw             = _fjsp_add_v2r8(vvdw12,vvdw6);
+             fvdw             = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_add_v2r8(fvdw6,fvdw12),_fjsp_mul_v2r8(vftabscale,rinv00)));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+             fscal            = _fjsp_add_v2r8(felec,fvdw);
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             
+             fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              * i atom 1 - j: Ewald electrostatics only
+              **************************/
+             r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+             /* Compute parameters for interactions between i and j atoms */
+             qq10             = _fjsp_mul_v2r8(iq1,jq0);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r10,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq10,_fjsp_sub_v2r8(rinv10,velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,rinv10),_fjsp_sub_v2r8(rinvsq10,felec));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+             
+             fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              * i atom 2 - j: Ewald electrostatics only
+              **************************/
+             r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+             /* Compute parameters for interactions between i and j atoms */
+             qq20             = _fjsp_mul_v2r8(iq2,jq0);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r20,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq20,_fjsp_sub_v2r8(rinv20,velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,rinv20),_fjsp_sub_v2r8(rinvsq20,felec));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+             
+             fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+             gmx_fjsp_decrement_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0);
+             /* Inner loop uses 169 flops */
+         }
+         if(jidx<j_index_end) /* odd number of j particles: one is left over, handled with index A only */
+         {
+             jnrA             = jjnr[jidx];
+             j_coord_offsetA  = DIM*jnrA;
+             /* load j atom coordinates */
+             gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                               &jx0,&jy0,&jz0);
+             /* Calculate displacement vector */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             dx10             = _fjsp_sub_v2r8(ix1,jx0);
+             dy10             = _fjsp_sub_v2r8(iy1,jy0);
+             dz10             = _fjsp_sub_v2r8(iz1,jz0);
+             dx20             = _fjsp_sub_v2r8(ix2,jx0);
+             dy20             = _fjsp_sub_v2r8(iy2,jy0);
+             dz20             = _fjsp_sub_v2r8(iz2,jz0);
+             /* Calculate squared distance and things based on it */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+             rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+             rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+             rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+             rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+             rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+             rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+             rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+             /* Load parameters for j particles */
+             jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
+             vdwjidx0A        = 2*vdwtype[jnrA+0];
+             fjx0             = _fjsp_setzero_v2r8();
+             fjy0             = _fjsp_setzero_v2r8();
+             fjz0             = _fjsp_setzero_v2r8();
+             /**************************
+              * CALCULATE INTERACTIONS *
+              * i atom 0 - j: Ewald electrostatics + table VdW
+              **************************/
+             r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+             /* Compute parameters for interactions between i and j atoms */
+             qq00             = _fjsp_mul_v2r8(iq0,jq0);
+             gmx_fjsp_load_1pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
+             /* Calculate table index by multiplying r with table scale and truncate to integer */
+             rt               = _fjsp_mul_v2r8(r00,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 8;
+             vfconv.i[1]     *= 8;
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r00,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] ); /* only index 0 is used for table loads in this branch; the B-side register is zeroed instead */
+             ewtabD           = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq00,_fjsp_sub_v2r8(rinv00,velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,rinv00),_fjsp_sub_v2r8(rinvsq00,felec));
+             /* CUBIC SPLINE TABLE DISPERSION */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 2 );
+             H                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+             VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+             vvdw6            = _fjsp_mul_v2r8(c6_00,VV);
+             FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+             fvdw6            = _fjsp_mul_v2r8(c6_00,FF);
+             /* CUBIC SPLINE TABLE REPULSION */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 4 );
+             F                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 6 );
+             H                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+             VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+             vvdw12           = _fjsp_mul_v2r8(c12_00,VV);
+             FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+             fvdw12           = _fjsp_mul_v2r8(c12_00,FF);
+             vvdw             = _fjsp_add_v2r8(vvdw12,vvdw6);
+             fvdw             = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_add_v2r8(fvdw6,fvdw12),_fjsp_mul_v2r8(vftabscale,rinv00)));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8()); /* NOTE(review): presumably keeps the first lane and zeroes the unused second lane - confirm intrinsic semantics */
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             vvdw             = _fjsp_unpacklo_v2r8(vvdw,_fjsp_setzero_v2r8());
+             vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+             fscal            = _fjsp_add_v2r8(felec,fvdw);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             
+             fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              * i atom 1 - j: Ewald electrostatics only
+              **************************/
+             r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+             /* Compute parameters for interactions between i and j atoms */
+             qq10             = _fjsp_mul_v2r8(iq1,jq0);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r10,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq10,_fjsp_sub_v2r8(rinv10,velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,rinv10),_fjsp_sub_v2r8(rinvsq10,felec));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+             
+             fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              * i atom 2 - j: Ewald electrostatics only
+              **************************/
+             r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+             /* Compute parameters for interactions between i and j atoms */
+             qq20             = _fjsp_mul_v2r8(iq2,jq0);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r20,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq20,_fjsp_sub_v2r8(rinv20,velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,rinv20),_fjsp_sub_v2r8(rinvsq20,felec));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+             
+             fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+             gmx_fjsp_decrement_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0);
+             /* Inner loop uses 169 flops */
+         }
+         /* End of innermost loop */
+         gmx_fjsp_update_iforce_3atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                               f+i_coord_offset,fshift+i_shift_offset);
+         ggid                        = gid[iidx];
+         /* Update potential energies for the energy group of this list entry */
+         gmx_fjsp_update_1pot_v2r8(velecsum,kernel_data->energygrp_elec+ggid);
+         gmx_fjsp_update_1pot_v2r8(vvdwsum,kernel_data->energygrp_vdw+ggid);
+         /* Increment number of inner iterations (counts j particles, including the odd leftover one) */
+         inneriter                  += j_index_end - j_index_start;
+         /* Outer loop uses 20 flops */
+     }
+     /* Increment number of outer iterations */
+     outeriter        += nri;
+     /* Update outer/inner flops: 20 per outer iteration, 169 per j particle */
+     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3_VF,outeriter*20 + inneriter*169);
+ }
+ /*
+  * Gromacs nonbonded kernel:   nb_kernel_ElecEw_VdwCSTab_GeomW3P1_F_sparc64_hpc_ace_double
+  * Electrostatics interaction: Ewald
+  * VdW interaction:            CubicSplineTable
+  * Geometry:                   Water3-Particle
+  * Calculate force/pot:        Force
+  *
+  * For every entry of the neighbor list this kernel accumulates the forces
+  * between a three-site i water and its j particles. The inner loop handles
+  * two j particles (A and B) per iteration; an unpaired last j particle is
+  * handled by a separate tail block. Only i site 0 gets the tabulated VdW
+  * term, sites 1 and 2 get Ewald electrostatics only. No energies are
+  * accumulated in this force-only variant.
+  *
+  * Data read in this function:
+  *   nlist        nri, iinr, jindex, jjnr, shift (gid is fetched but not used)
+  *   xx, ff       coordinates and forces, addressed as flat real arrays
+  *   fr           shift_vec, fshift, epsfac, ntype, nbfp and the Ewald
+  *                table in ic (tabq_coul_F, tabq_scale)
+  *   mdatoms      chargeA, typeA
+  *   kernel_data  table_vdw (data, scale)
+  *   nrnb         flop accounting, updated through inc_nrnb()
+  *
+  * NOTE(review): the i-water charges and the VdW type of site 0 are taken
+  * from iinr[0] only and reused for every list entry, so all i waters are
+  * presumably identical and nri is presumably > 0 on entry - confirm against
+  * the caller.
+  */
+ void
+ nb_kernel_ElecEw_VdwCSTab_GeomW3P1_F_sparc64_hpc_ace_double
+                     (t_nblist * gmx_restrict                nlist,
+                      rvec * gmx_restrict                    xx,
+                      rvec * gmx_restrict                    ff,
+                      t_forcerec * gmx_restrict              fr,
+                      t_mdatoms * gmx_restrict               mdatoms,
+                      nb_kernel_data_t * gmx_restrict        kernel_data,
+                      t_nrnb * gmx_restrict                  nrnb)
+ {
+     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+      * just 0 for non-waters.
+      * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+      * jnr indices corresponding to data put in the two positions in the SIMD register.
+      * Many of the locals declared below are never referenced in this kernel variant.
+      */
+     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+     int              jnrA,jnrB;
+     int              j_coord_offsetA,j_coord_offsetB;
+     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+     real             rcutoff_scalar;
+     real             *shiftvec,*fshift,*x,*f;
+     _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+     int              vdwioffset0;
+     _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+     int              vdwioffset1;
+     _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+     int              vdwioffset2;
+     _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+     int              vdwjidx0A,vdwjidx0B;
+     _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+     _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+     _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+     _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+     _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+     real             *charge;
+     int              nvdwtype;
+     _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+     int              *vdwtype;
+     real             *vdwparam;
+     _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+     _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+     _fjsp_v2r8       rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF,twovfeps;
+     real             *vftab;
+     _fjsp_v2r8       ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+     real             *ewtab;
+     _fjsp_v2r8       itab_tmp;
+     _fjsp_v2r8       dummy_mask,cutoff_mask;
+     _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+     _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+     union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv; /* used to read table indices back out of a SIMD register as integers */
+     x                = xx[0];
+     f                = ff[0];
+     nri              = nlist->nri;
+     iinr             = nlist->iinr;
+     jindex           = nlist->jindex;
+     jjnr             = nlist->jjnr;
+     shiftidx         = nlist->shift;
+     gid              = nlist->gid; /* not referenced again in this force-only kernel */
+     shiftvec         = fr->shift_vec[0];
+     fshift           = fr->fshift[0];
+     facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+     charge           = mdatoms->chargeA;
+     nvdwtype         = fr->ntype;
+     vdwparam         = fr->nbfp;
+     vdwtype          = mdatoms->typeA;
+     vftab            = kernel_data->table_vdw->data;
+     vftabscale       = gmx_fjsp_set1_v2r8(kernel_data->table_vdw->scale);
+     sh_ewald         = gmx_fjsp_set1_v2r8(fr->ic->sh_ewald); /* set but not used in this kernel */
+     ewtab            = fr->ic->tabq_coul_F;
+     ewtabscale       = gmx_fjsp_set1_v2r8(fr->ic->tabq_scale);
+     ewtabhalfspace   = gmx_fjsp_set1_v2r8(0.5/fr->ic->tabq_scale); /* set but not used in this kernel */
+     /* Setup water-specific parameters.
+      * They are read once, from the first i entry, and reused for all list entries.
+      * The i charges are pre-multiplied by facel so that qq = iq*jq needs no further scaling.
+      */
+     inr              = nlist->iinr[0];
+     iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+0]));
+     iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+     iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+     vdwioffset0      = 2*nvdwtype*vdwtype[inr+0]; /* offset into vdwparam for i site 0; two reals (c6,c12) per type pair */
+     /* Give these a defined value to avoid compiler warnings */
+     jnrA = jnrB = 0;
+     j_coord_offsetA = 0;
+     j_coord_offsetB = 0;
+     outeriter        = 0;
+     inneriter        = 0;
+     /* Start outer loop over neighborlists */
+     for(iidx=0; iidx<nri; iidx++)
+     {
+         /* Load shift vector for this list */
+         i_shift_offset   = DIM*shiftidx[iidx];
+         /* Load limits for loop over neighbors */
+         j_index_start    = jindex[iidx];
+         j_index_end      = jindex[iidx+1];
+         /* Get outer coordinate index */
+         inr              = iinr[iidx];
+         i_coord_offset   = DIM*inr;
+         /* Load i particle coords and add shift vector */
+         gmx_fjsp_load_shift_and_3rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                  &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+         fix0             = _fjsp_setzero_v2r8();
+         fiy0             = _fjsp_setzero_v2r8();
+         fiz0             = _fjsp_setzero_v2r8();
+         fix1             = _fjsp_setzero_v2r8();
+         fiy1             = _fjsp_setzero_v2r8();
+         fiz1             = _fjsp_setzero_v2r8();
+         fix2             = _fjsp_setzero_v2r8();
+         fiy2             = _fjsp_setzero_v2r8();
+         fiz2             = _fjsp_setzero_v2r8();
+         /* Start inner kernel loop.
+          * Two j particles (A and B) are processed per iteration; the loop stops before an
+          * unpaired last entry, which is handled by the tail block after the loop.
+          */
+         for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+         {
+             /* Get j neighbor index, and coordinate index */
+             jnrA             = jjnr[jidx];
+             jnrB             = jjnr[jidx+1];
+             j_coord_offsetA  = DIM*jnrA;
+             j_coord_offsetB  = DIM*jnrB;
+             /* load j atom coordinates */
+             gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                               &jx0,&jy0,&jz0);
+             /* Calculate displacement vector */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             dx10             = _fjsp_sub_v2r8(ix1,jx0);
+             dy10             = _fjsp_sub_v2r8(iy1,jy0);
+             dz10             = _fjsp_sub_v2r8(iz1,jz0);
+             dx20             = _fjsp_sub_v2r8(ix2,jx0);
+             dy20             = _fjsp_sub_v2r8(iy2,jy0);
+             dz20             = _fjsp_sub_v2r8(iz2,jz0);
+             /* Calculate squared distance and things based on it */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+             rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+             rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+             rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+             rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+             rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+             rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+             rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+             /* Load parameters for j particles */
+             jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+             vdwjidx0A        = 2*vdwtype[jnrA+0];
+             vdwjidx0B        = 2*vdwtype[jnrB+0];
+             fjx0             = _fjsp_setzero_v2r8();
+             fjy0             = _fjsp_setzero_v2r8();
+             fjz0             = _fjsp_setzero_v2r8();
+             /**************************
+              * CALCULATE INTERACTIONS *
+              * i site 0 - j: Ewald electrostatics plus tabulated VdW
+              **************************/
+             r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+             /* Compute parameters for interactions between i and j atoms */
+             qq00             = _fjsp_mul_v2r8(iq0,jq0);
+             gmx_fjsp_load_2pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,
+                                          vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
+             /* Calculate table index by multiplying r with table scale and truncate to integer.
+              * The integer index is then scaled by 8: the table loads below read the reals at
+              * offsets 0..7 from that position (dispersion at 0..3, repulsion at 4..7).
+              */
+             rt               = _fjsp_mul_v2r8(r00,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 8;
+             vfconv.i[1]     *= 8;
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer.
+              * The first felec line combines the two neighboring table entries ewtabF and ewtabFn
+              * with weight eweps; presumably this is the linear interpolation
+              * (1-eweps)*ewtabF + eweps*ewtabFn, assuming _fjsp_madd_v2r8(a,b,c) = a*b+c and
+              * _fjsp_nmsub_v2r8(a,b,c) = c-a*b - TODO confirm against the intrinsics reference.
+              * The second line forms qq*rinv*(rinvsq - felec).
+              */
+             ewrt             = _fjsp_mul_v2r8(r00,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                          &ewtabF,&ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,rinv00),_fjsp_sub_v2r8(rinvsq00,felec));
+             /* CUBIC SPLINE TABLE DISPERSION
+              * One pair of reals is loaded per j particle and the pairs are transposed,
+              * presumably so that each of Y,F (and G,H) holds one coefficient for both A and B.
+              * Only the force polynomial FF is evaluated; Y is loaded but not used afterwards.
+              */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 2 );
+             H                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 2 );
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+             FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+             fvdw6            = _fjsp_mul_v2r8(c6_00,FF);
+             /* CUBIC SPLINE TABLE REPULSION (same scheme, table offsets 4..7, weighted by c12) */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 4 );
+             F                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 4 );
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 6 );
+             H                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 6 );
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+             FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+             fvdw12           = _fjsp_mul_v2r8(c12_00,FF);
+             fvdw             = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_add_v2r8(fvdw6,fvdw12),_fjsp_mul_v2r8(vftabscale,rinv00)));
+             fscal            = _fjsp_add_v2r8(felec,fvdw);
+             /* Update vectorial force: the same d*fscal term is accumulated for the i site and for j */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             
+             fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              * i site 1 - j: Ewald electrostatics only
+              **************************/
+             r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+             /* Compute parameters for interactions between i and j atoms */
+             qq10             = _fjsp_mul_v2r8(iq1,jq0);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r10,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                          &ewtabF,&ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,rinv10),_fjsp_sub_v2r8(rinvsq10,felec));
+             fscal            = felec;
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+             
+             fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              * i site 2 - j: Ewald electrostatics only
+              **************************/
+             r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+             /* Compute parameters for interactions between i and j atoms */
+             qq20             = _fjsp_mul_v2r8(iq2,jq0);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r20,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                          &ewtabF,&ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,rinv20),_fjsp_sub_v2r8(rinvsq20,felec));
+             fscal            = felec;
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+             
+             fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+             gmx_fjsp_decrement_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0);
+             /* Inner loop uses 146 flops */
+         }
+         if(jidx<j_index_end) /* an odd number of j entries leaves one unpaired j particle */
+         {
+             jnrA             = jjnr[jidx];
+             j_coord_offsetA  = DIM*jnrA;
+             /* load j atom coordinates */
+             gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                               &jx0,&jy0,&jz0);
+             /* Calculate displacement vector */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             dx10             = _fjsp_sub_v2r8(ix1,jx0);
+             dy10             = _fjsp_sub_v2r8(iy1,jy0);
+             dz10             = _fjsp_sub_v2r8(iz1,jz0);
+             dx20             = _fjsp_sub_v2r8(ix2,jx0);
+             dy20             = _fjsp_sub_v2r8(iy2,jy0);
+             dz20             = _fjsp_sub_v2r8(iz2,jz0);
+             /* Calculate squared distance and things based on it */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+             rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+             rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+             rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+             rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+             rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+             rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+             rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+             /* Load parameters for j particles */
+             jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
+             vdwjidx0A        = 2*vdwtype[jnrA+0];
+             fjx0             = _fjsp_setzero_v2r8();
+             fjy0             = _fjsp_setzero_v2r8();
+             fjz0             = _fjsp_setzero_v2r8();
+             /**************************
+              * CALCULATE INTERACTIONS *
+              * i site 0 - j: Ewald electrostatics plus tabulated VdW
+              **************************/
+             r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+             /* Compute parameters for interactions between i and j atoms */
+             qq00             = _fjsp_mul_v2r8(iq0,jq0);
+             gmx_fjsp_load_1pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
+             /* Calculate table index by multiplying r with table scale and truncate to integer.
+              * Both integer slots are scaled, but only vfconv.i[0] is used for table loads here.
+              */
+             rt               = _fjsp_mul_v2r8(r00,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 8;
+             vfconv.i[1]     *= 8;
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r00,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,rinv00),_fjsp_sub_v2r8(rinvsq00,felec));
+             /* CUBIC SPLINE TABLE DISPERSION
+              * Only one j particle: the second operand of each transpose is a zero register
+              * instead of a second table load.
+              */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 2 );
+             H                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+             FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+             fvdw6            = _fjsp_mul_v2r8(c6_00,FF);
+             /* CUBIC SPLINE TABLE REPULSION */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 4 );
+             F                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 6 );
+             H                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+             FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+             fvdw12           = _fjsp_mul_v2r8(c12_00,FF);
+             fvdw             = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_add_v2r8(fvdw6,fvdw12),_fjsp_mul_v2r8(vftabscale,rinv00)));
+             fscal            = _fjsp_add_v2r8(felec,fvdw);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8()); /* presumably zeroes the unused second slot so it adds no force - TODO confirm */
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             
+             fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              * i site 1 - j: Ewald electrostatics only
+              **************************/
+             r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+             /* Compute parameters for interactions between i and j atoms */
+             qq10             = _fjsp_mul_v2r8(iq1,jq0);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r10,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,rinv10),_fjsp_sub_v2r8(rinvsq10,felec));
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+             
+             fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              * i site 2 - j: Ewald electrostatics only
+              **************************/
+             r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+             /* Compute parameters for interactions between i and j atoms */
+             qq20             = _fjsp_mul_v2r8(iq2,jq0);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r20,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,rinv20),_fjsp_sub_v2r8(rinvsq20,felec));
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+             
+             fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+             gmx_fjsp_decrement_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0);
+             /* Inner loop uses 146 flops */
+         }
+         /* End of innermost loop.
+          * Hand the accumulated forces of the three i sites to the helper together with the
+          * force and shift-force locations for this list entry.
+          */
+         gmx_fjsp_update_iforce_3atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                               f+i_coord_offset,fshift+i_shift_offset);
+         /* Increment number of inner iterations (counted per j particle, not per SIMD pair) */
+         inneriter                  += j_index_end - j_index_start;
+         /* Outer loop uses 18 flops */
+     }
+     /* Increment number of outer iterations */
+     outeriter        += nri;
+     /* Update outer/inner flops */
+     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3_F,outeriter*18 + inneriter*146);
+ }
index 0000000000000000000000000000000000000000,6ab689af2ba49842873258cf83b165b7f0f555f5..6ab689af2ba49842873258cf83b165b7f0f555f5
mode 000000,100644..100644
--- /dev/null
@@@ -1,0 -1,2172 +1,2172 @@@
+ /*
+  * This file is part of the GROMACS molecular simulation package.
+  *
+  * Copyright (c) 2012, by the GROMACS development team, led by
+  * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+  * others, as listed in the AUTHORS file in the top-level source
+  * directory and at http://www.gromacs.org.
+  *
+  * GROMACS is free software; you can redistribute it and/or
+  * modify it under the terms of the GNU Lesser General Public License
+  * as published by the Free Software Foundation; either version 2.1
+  * of the License, or (at your option) any later version.
+  *
+  * GROMACS is distributed in the hope that it will be useful,
+  * but WITHOUT ANY WARRANTY; without even the implied warranty of
+  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  * Lesser General Public License for more details.
+  *
+  * You should have received a copy of the GNU Lesser General Public
+  * License along with GROMACS; if not, see
+  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+  *
+  * If you want to redistribute modifications to GROMACS, please
+  * consider that scientific software is very special. Version
+  * control is crucial - bugs must be traceable. We will be happy to
+  * consider code for inclusion in the official distribution, but
+  * derived work must not be called official GROMACS. Details are found
+  * in the README & COPYING files - if they are missing, get the
+  * official version at http://www.gromacs.org.
+  *
+  * To help us fund GROMACS development, we humbly ask that you cite
+  * the research papers on the package. Check out http://www.gromacs.org.
+  */
+ /*
+  * Note: this file was generated by the GROMACS sparc64_hpc_ace_double kernel generator.
+  */
+ #ifdef HAVE_CONFIG_H
+ #include <config.h>
+ #endif
+ #include <math.h>
+ #include "../nb_kernel.h"
+ #include "types/simple.h"
+ #include "vec.h"
+ #include "nrnb.h"
+ #include "kernelutil_sparc64_hpc_ace_double.h"
+ /*
+  * Gromacs nonbonded kernel:   nb_kernel_ElecEw_VdwCSTab_GeomW3W3_VF_sparc64_hpc_ace_double
+  * Electrostatics interaction: Ewald
+  * VdW interaction:            CubicSplineTable
+  * Geometry:                   Water3-Water3
+  * Calculate force/pot:        PotentialAndForce
+  */
+ void
+ nb_kernel_ElecEw_VdwCSTab_GeomW3W3_VF_sparc64_hpc_ace_double
+                     (t_nblist * gmx_restrict                nlist,
+                      rvec * gmx_restrict                    xx,
+                      rvec * gmx_restrict                    ff,
+                      t_forcerec * gmx_restrict              fr,
+                      t_mdatoms * gmx_restrict               mdatoms,
+                      nb_kernel_data_t * gmx_restrict        kernel_data,
+                      t_nrnb * gmx_restrict                  nrnb)
+ {
+     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+      * just 0 for non-waters.
+      * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+      * jnr indices corresponding to data put in the four positions in the SIMD register.
+      */
+     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+     int              jnrA,jnrB;
+     int              j_coord_offsetA,j_coord_offsetB;
+     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+     real             rcutoff_scalar;
+     real             *shiftvec,*fshift,*x,*f;
+     _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+     int              vdwioffset0;
+     _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+     int              vdwioffset1;
+     _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+     int              vdwioffset2;
+     _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+     int              vdwjidx0A,vdwjidx0B;
+     _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+     int              vdwjidx1A,vdwjidx1B;
+     _fjsp_v2r8       jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+     int              vdwjidx2A,vdwjidx2B;
+     _fjsp_v2r8       jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+     _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+     _fjsp_v2r8       dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
+     _fjsp_v2r8       dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
+     _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+     _fjsp_v2r8       dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+     _fjsp_v2r8       dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+     _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+     _fjsp_v2r8       dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+     _fjsp_v2r8       dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+     _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+     real             *charge;
+     int              nvdwtype;
+     _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+     int              *vdwtype;
+     real             *vdwparam;
+     _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+     _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+     _fjsp_v2r8       rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF,twovfeps;
+     real             *vftab;
+     _fjsp_v2r8       ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+     real             *ewtab;
+     _fjsp_v2r8       itab_tmp;
+     _fjsp_v2r8       dummy_mask,cutoff_mask;
+     _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+     _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+     union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+     x                = xx[0];
+     f                = ff[0];
+     nri              = nlist->nri;
+     iinr             = nlist->iinr;
+     jindex           = nlist->jindex;
+     jjnr             = nlist->jjnr;
+     shiftidx         = nlist->shift;
+     gid              = nlist->gid;
+     shiftvec         = fr->shift_vec[0];
+     fshift           = fr->fshift[0];
+     facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+     charge           = mdatoms->chargeA;
+     nvdwtype         = fr->ntype;
+     vdwparam         = fr->nbfp;
+     vdwtype          = mdatoms->typeA;
+     vftab            = kernel_data->table_vdw->data;
+     vftabscale       = gmx_fjsp_set1_v2r8(kernel_data->table_vdw->scale);
+     sh_ewald         = gmx_fjsp_set1_v2r8(fr->ic->sh_ewald);
+     ewtab            = fr->ic->tabq_coul_FDV0;
+     ewtabscale       = gmx_fjsp_set1_v2r8(fr->ic->tabq_scale);
+     ewtabhalfspace   = gmx_fjsp_set1_v2r8(0.5/fr->ic->tabq_scale);
+     /* Setup water-specific parameters */
+     inr              = nlist->iinr[0];
+     iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+0]));
+     iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+     iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+     vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+     jq0              = gmx_fjsp_set1_v2r8(charge[inr+0]);
+     jq1              = gmx_fjsp_set1_v2r8(charge[inr+1]);
+     jq2              = gmx_fjsp_set1_v2r8(charge[inr+2]);
+     vdwjidx0A        = 2*vdwtype[inr+0];
+     qq00             = _fjsp_mul_v2r8(iq0,jq0);
+     c6_00            = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset0+vdwjidx0A]);
+     c12_00           = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset0+vdwjidx0A+1]);
+     qq01             = _fjsp_mul_v2r8(iq0,jq1);
+     qq02             = _fjsp_mul_v2r8(iq0,jq2);
+     qq10             = _fjsp_mul_v2r8(iq1,jq0);
+     qq11             = _fjsp_mul_v2r8(iq1,jq1);
+     qq12             = _fjsp_mul_v2r8(iq1,jq2);
+     qq20             = _fjsp_mul_v2r8(iq2,jq0);
+     qq21             = _fjsp_mul_v2r8(iq2,jq1);
+     qq22             = _fjsp_mul_v2r8(iq2,jq2);
+     /* Avoid stupid compiler warnings */
+     jnrA = jnrB = 0;
+     j_coord_offsetA = 0;
+     j_coord_offsetB = 0;
+     outeriter        = 0;
+     inneriter        = 0;
+     /* Start outer loop over neighborlists */
+     for(iidx=0; iidx<nri; iidx++)
+     {
+         /* Load shift vector for this list */
+         i_shift_offset   = DIM*shiftidx[iidx];
+         /* Load limits for loop over neighbors */
+         j_index_start    = jindex[iidx];
+         j_index_end      = jindex[iidx+1];
+         /* Get outer coordinate index */
+         inr              = iinr[iidx];
+         i_coord_offset   = DIM*inr;
+         /* Load i particle coords and add shift vector */
+         gmx_fjsp_load_shift_and_3rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                  &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+         fix0             = _fjsp_setzero_v2r8();
+         fiy0             = _fjsp_setzero_v2r8();
+         fiz0             = _fjsp_setzero_v2r8();
+         fix1             = _fjsp_setzero_v2r8();
+         fiy1             = _fjsp_setzero_v2r8();
+         fiz1             = _fjsp_setzero_v2r8();
+         fix2             = _fjsp_setzero_v2r8();
+         fiy2             = _fjsp_setzero_v2r8();
+         fiz2             = _fjsp_setzero_v2r8();
+         /* Reset potential sums */
+         velecsum         = _fjsp_setzero_v2r8();
+         vvdwsum          = _fjsp_setzero_v2r8();
+         /* Start inner kernel loop */
+         for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+         {
+             /* Get j neighbor index, and coordinate index */
+             jnrA             = jjnr[jidx];
+             jnrB             = jjnr[jidx+1];
+             j_coord_offsetA  = DIM*jnrA;
+             j_coord_offsetB  = DIM*jnrB;
+             /* load j atom coordinates */
+             gmx_fjsp_load_3rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+             /* Calculate displacement vector */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             dx01             = _fjsp_sub_v2r8(ix0,jx1);
+             dy01             = _fjsp_sub_v2r8(iy0,jy1);
+             dz01             = _fjsp_sub_v2r8(iz0,jz1);
+             dx02             = _fjsp_sub_v2r8(ix0,jx2);
+             dy02             = _fjsp_sub_v2r8(iy0,jy2);
+             dz02             = _fjsp_sub_v2r8(iz0,jz2);
+             dx10             = _fjsp_sub_v2r8(ix1,jx0);
+             dy10             = _fjsp_sub_v2r8(iy1,jy0);
+             dz10             = _fjsp_sub_v2r8(iz1,jz0);
+             dx11             = _fjsp_sub_v2r8(ix1,jx1);
+             dy11             = _fjsp_sub_v2r8(iy1,jy1);
+             dz11             = _fjsp_sub_v2r8(iz1,jz1);
+             dx12             = _fjsp_sub_v2r8(ix1,jx2);
+             dy12             = _fjsp_sub_v2r8(iy1,jy2);
+             dz12             = _fjsp_sub_v2r8(iz1,jz2);
+             dx20             = _fjsp_sub_v2r8(ix2,jx0);
+             dy20             = _fjsp_sub_v2r8(iy2,jy0);
+             dz20             = _fjsp_sub_v2r8(iz2,jz0);
+             dx21             = _fjsp_sub_v2r8(ix2,jx1);
+             dy21             = _fjsp_sub_v2r8(iy2,jy1);
+             dz21             = _fjsp_sub_v2r8(iz2,jz1);
+             dx22             = _fjsp_sub_v2r8(ix2,jx2);
+             dy22             = _fjsp_sub_v2r8(iy2,jy2);
+             dz22             = _fjsp_sub_v2r8(iz2,jz2);
+             /* Calculate squared distance and things based on it */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rsq01            = gmx_fjsp_calc_rsq_v2r8(dx01,dy01,dz01);
+             rsq02            = gmx_fjsp_calc_rsq_v2r8(dx02,dy02,dz02);
+             rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+             rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+             rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+             rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+             rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+             rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+             rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+             rinv01           = gmx_fjsp_invsqrt_v2r8(rsq01);
+             rinv02           = gmx_fjsp_invsqrt_v2r8(rsq02);
+             rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+             rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+             rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+             rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+             rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+             rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+             rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+             rinvsq01         = _fjsp_mul_v2r8(rinv01,rinv01);
+             rinvsq02         = _fjsp_mul_v2r8(rinv02,rinv02);
+             rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+             rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+             rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+             rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+             rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+             rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+             fjx0             = _fjsp_setzero_v2r8();
+             fjy0             = _fjsp_setzero_v2r8();
+             fjz0             = _fjsp_setzero_v2r8();
+             fjx1             = _fjsp_setzero_v2r8();
+             fjy1             = _fjsp_setzero_v2r8();
+             fjz1             = _fjsp_setzero_v2r8();
+             fjx2             = _fjsp_setzero_v2r8();
+             fjy2             = _fjsp_setzero_v2r8();
+             fjz2             = _fjsp_setzero_v2r8();
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+             /* Calculate table index by multiplying r with table scale and truncate to integer */
+             rt               = _fjsp_mul_v2r8(r00,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 8;
+             vfconv.i[1]     *= 8;
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r00,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq00,_fjsp_sub_v2r8(rinv00,velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,rinv00),_fjsp_sub_v2r8(rinvsq00,felec));
+             /* CUBIC SPLINE TABLE DISPERSION */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 2 );
+             H                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 2 );
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+             VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+             vvdw6            = _fjsp_mul_v2r8(c6_00,VV);
+             FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+             fvdw6            = _fjsp_mul_v2r8(c6_00,FF);
+             /* CUBIC SPLINE TABLE REPULSION */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 4 );
+             F                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 4 );
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 6 );
+             H                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 6 );
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+             VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+             vvdw12           = _fjsp_mul_v2r8(c12_00,VV);
+             FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+             fvdw12           = _fjsp_mul_v2r8(c12_00,FF);
+             vvdw             = _fjsp_add_v2r8(vvdw12,vvdw6);
+             fvdw             = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_add_v2r8(fvdw6,fvdw12),_fjsp_mul_v2r8(vftabscale,rinv00)));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+             fscal            = _fjsp_add_v2r8(felec,fvdw);
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             
+             fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r01              = _fjsp_mul_v2r8(rsq01,rinv01);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r01,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq01,_fjsp_sub_v2r8(rinv01,velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq01,rinv01),_fjsp_sub_v2r8(rinvsq01,felec));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx01,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy01,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz01,fscal,fiz0);
+             
+             fjx1             = _fjsp_madd_v2r8(dx01,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy01,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz01,fscal,fjz1);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r02              = _fjsp_mul_v2r8(rsq02,rinv02);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r02,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq02,_fjsp_sub_v2r8(rinv02,velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq02,rinv02),_fjsp_sub_v2r8(rinvsq02,felec));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx02,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy02,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz02,fscal,fiz0);
+             
+             fjx2             = _fjsp_madd_v2r8(dx02,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy02,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz02,fscal,fjz2);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r10,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq10,_fjsp_sub_v2r8(rinv10,velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,rinv10),_fjsp_sub_v2r8(rinvsq10,felec));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+             
+             fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r11              = _fjsp_mul_v2r8(rsq11,rinv11);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r11,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq11,_fjsp_sub_v2r8(rinv11,velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq11,rinv11),_fjsp_sub_v2r8(rinvsq11,felec));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+             
+             fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r12              = _fjsp_mul_v2r8(rsq12,rinv12);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r12,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq12,_fjsp_sub_v2r8(rinv12,velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq12,rinv12),_fjsp_sub_v2r8(rinvsq12,felec));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+             
+             fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r20,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq20,_fjsp_sub_v2r8(rinv20,velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,rinv20),_fjsp_sub_v2r8(rinvsq20,felec));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+             
+             fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r21              = _fjsp_mul_v2r8(rsq21,rinv21);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r21,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq21,_fjsp_sub_v2r8(rinv21,velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq21,rinv21),_fjsp_sub_v2r8(rinvsq21,felec));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+             
+             fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r22              = _fjsp_mul_v2r8(rsq22,rinv22);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r22,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq22,_fjsp_sub_v2r8(rinv22,velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq22,rinv22),_fjsp_sub_v2r8(rinvsq22,felec));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+             
+             fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+             gmx_fjsp_decrement_3rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+             /* Inner loop uses 430 flops */
+         }
+         if(jidx<j_index_end)
+         {
+             jnrA             = jjnr[jidx];
+             j_coord_offsetA  = DIM*jnrA;
+             /* load j atom coordinates */
+             gmx_fjsp_load_3rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+             /* Calculate displacement vector */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             dx01             = _fjsp_sub_v2r8(ix0,jx1);
+             dy01             = _fjsp_sub_v2r8(iy0,jy1);
+             dz01             = _fjsp_sub_v2r8(iz0,jz1);
+             dx02             = _fjsp_sub_v2r8(ix0,jx2);
+             dy02             = _fjsp_sub_v2r8(iy0,jy2);
+             dz02             = _fjsp_sub_v2r8(iz0,jz2);
+             dx10             = _fjsp_sub_v2r8(ix1,jx0);
+             dy10             = _fjsp_sub_v2r8(iy1,jy0);
+             dz10             = _fjsp_sub_v2r8(iz1,jz0);
+             dx11             = _fjsp_sub_v2r8(ix1,jx1);
+             dy11             = _fjsp_sub_v2r8(iy1,jy1);
+             dz11             = _fjsp_sub_v2r8(iz1,jz1);
+             dx12             = _fjsp_sub_v2r8(ix1,jx2);
+             dy12             = _fjsp_sub_v2r8(iy1,jy2);
+             dz12             = _fjsp_sub_v2r8(iz1,jz2);
+             dx20             = _fjsp_sub_v2r8(ix2,jx0);
+             dy20             = _fjsp_sub_v2r8(iy2,jy0);
+             dz20             = _fjsp_sub_v2r8(iz2,jz0);
+             dx21             = _fjsp_sub_v2r8(ix2,jx1);
+             dy21             = _fjsp_sub_v2r8(iy2,jy1);
+             dz21             = _fjsp_sub_v2r8(iz2,jz1);
+             dx22             = _fjsp_sub_v2r8(ix2,jx2);
+             dy22             = _fjsp_sub_v2r8(iy2,jy2);
+             dz22             = _fjsp_sub_v2r8(iz2,jz2);
+             /* Calculate squared distance and things based on it */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rsq01            = gmx_fjsp_calc_rsq_v2r8(dx01,dy01,dz01);
+             rsq02            = gmx_fjsp_calc_rsq_v2r8(dx02,dy02,dz02);
+             rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+             rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+             rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+             rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+             rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+             rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+             rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+             rinv01           = gmx_fjsp_invsqrt_v2r8(rsq01);
+             rinv02           = gmx_fjsp_invsqrt_v2r8(rsq02);
+             rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+             rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+             rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+             rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+             rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+             rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+             rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+             rinvsq01         = _fjsp_mul_v2r8(rinv01,rinv01);
+             rinvsq02         = _fjsp_mul_v2r8(rinv02,rinv02);
+             rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+             rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+             rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+             rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+             rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+             rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+             fjx0             = _fjsp_setzero_v2r8();
+             fjy0             = _fjsp_setzero_v2r8();
+             fjz0             = _fjsp_setzero_v2r8();
+             fjx1             = _fjsp_setzero_v2r8();
+             fjy1             = _fjsp_setzero_v2r8();
+             fjz1             = _fjsp_setzero_v2r8();
+             fjx2             = _fjsp_setzero_v2r8();
+             fjy2             = _fjsp_setzero_v2r8();
+             fjz2             = _fjsp_setzero_v2r8();
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+             /* Calculate table index by multiplying r with table scale and truncate to integer */
+             rt               = _fjsp_mul_v2r8(r00,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 8;
+             vfconv.i[1]     *= 8;
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r00,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq00,_fjsp_sub_v2r8(rinv00,velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,rinv00),_fjsp_sub_v2r8(rinvsq00,felec));
+             /* CUBIC SPLINE TABLE DISPERSION */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 2 );
+             H                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+             VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+             vvdw6            = _fjsp_mul_v2r8(c6_00,VV);
+             FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+             fvdw6            = _fjsp_mul_v2r8(c6_00,FF);
+             /* CUBIC SPLINE TABLE REPULSION */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 4 );
+             F                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 6 );
+             H                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+             VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+             vvdw12           = _fjsp_mul_v2r8(c12_00,VV);
+             FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+             fvdw12           = _fjsp_mul_v2r8(c12_00,FF);
+             vvdw             = _fjsp_add_v2r8(vvdw12,vvdw6);
+             fvdw             = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_add_v2r8(fvdw6,fvdw12),_fjsp_mul_v2r8(vftabscale,rinv00)));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             vvdw             = _fjsp_unpacklo_v2r8(vvdw,_fjsp_setzero_v2r8());
+             vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+             fscal            = _fjsp_add_v2r8(felec,fvdw);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             
+             fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r01              = _fjsp_mul_v2r8(rsq01,rinv01);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r01,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq01,_fjsp_sub_v2r8(rinv01,velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq01,rinv01),_fjsp_sub_v2r8(rinvsq01,felec));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx01,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy01,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz01,fscal,fiz0);
+             
+             fjx1             = _fjsp_madd_v2r8(dx01,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy01,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz01,fscal,fjz1);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r02              = _fjsp_mul_v2r8(rsq02,rinv02);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r02,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq02,_fjsp_sub_v2r8(rinv02,velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq02,rinv02),_fjsp_sub_v2r8(rinvsq02,felec));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx02,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy02,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz02,fscal,fiz0);
+             
+             fjx2             = _fjsp_madd_v2r8(dx02,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy02,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz02,fscal,fjz2);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r10,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq10,_fjsp_sub_v2r8(rinv10,velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,rinv10),_fjsp_sub_v2r8(rinvsq10,felec));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+             
+             fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r11              = _fjsp_mul_v2r8(rsq11,rinv11);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r11,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq11,_fjsp_sub_v2r8(rinv11,velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq11,rinv11),_fjsp_sub_v2r8(rinvsq11,felec));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+             
+             fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r12              = _fjsp_mul_v2r8(rsq12,rinv12);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r12,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq12,_fjsp_sub_v2r8(rinv12,velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq12,rinv12),_fjsp_sub_v2r8(rinvsq12,felec));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+             
+             fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r20,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq20,_fjsp_sub_v2r8(rinv20,velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,rinv20),_fjsp_sub_v2r8(rinvsq20,felec));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+             
+             fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r21              = _fjsp_mul_v2r8(rsq21,rinv21);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r21,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq21,_fjsp_sub_v2r8(rinv21,velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq21,rinv21),_fjsp_sub_v2r8(rinvsq21,felec));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+             
+             fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r22              = _fjsp_mul_v2r8(rsq22,rinv22);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r22,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq22,_fjsp_sub_v2r8(rinv22,velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq22,rinv22),_fjsp_sub_v2r8(rinvsq22,felec));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+             
+             fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+             gmx_fjsp_decrement_3rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+             /* Inner loop uses 430 flops */
+         }
+         /* End of innermost loop */
+         gmx_fjsp_update_iforce_3atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                               f+i_coord_offset,fshift+i_shift_offset);
+         ggid                        = gid[iidx];
+         /* Update potential energies */
+         gmx_fjsp_update_1pot_v2r8(velecsum,kernel_data->energygrp_elec+ggid);
+         gmx_fjsp_update_1pot_v2r8(vvdwsum,kernel_data->energygrp_vdw+ggid);
+         /* Increment number of inner iterations */
+         inneriter                  += j_index_end - j_index_start;
+         /* Outer loop uses 20 flops */
+     }
+     /* Increment number of outer iterations */
+     outeriter        += nri;
+     /* Update outer/inner flops */
+     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_VF,outeriter*20 + inneriter*430);
+ }
+ /*
+  * Gromacs nonbonded kernel:   nb_kernel_ElecEw_VdwCSTab_GeomW3W3_F_sparc64_hpc_ace_double
+  * Electrostatics interaction: Ewald
+  * VdW interaction:            CubicSplineTable
+  * Geometry:                   Water3-Water3
+  * Calculate force/pot:        Force
+  */
+ void
+ nb_kernel_ElecEw_VdwCSTab_GeomW3W3_F_sparc64_hpc_ace_double
+                     (t_nblist * gmx_restrict                nlist,
+                      rvec * gmx_restrict                    xx,
+                      rvec * gmx_restrict                    ff,
+                      t_forcerec * gmx_restrict              fr,
+                      t_mdatoms * gmx_restrict               mdatoms,
+                      nb_kernel_data_t * gmx_restrict        kernel_data,
+                      t_nrnb * gmx_restrict                  nrnb)
+ {
+     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+      * just 0 for non-waters.
+      * Suffixes A,B refer to j loop unrolling done with double precision SIMD, i.e. the two different
+      * jnr indices corresponding to data put in the two positions in the SIMD register.
+      */
+     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+     int              jnrA,jnrB;
+     int              j_coord_offsetA,j_coord_offsetB;
+     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+     real             rcutoff_scalar;
+     real             *shiftvec,*fshift,*x,*f;
+     _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+     int              vdwioffset0;
+     _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+     int              vdwioffset1;
+     _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+     int              vdwioffset2;
+     _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+     int              vdwjidx0A,vdwjidx0B;
+     _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+     int              vdwjidx1A,vdwjidx1B;
+     _fjsp_v2r8       jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+     int              vdwjidx2A,vdwjidx2B;
+     _fjsp_v2r8       jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+     _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+     _fjsp_v2r8       dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
+     _fjsp_v2r8       dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
+     _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+     _fjsp_v2r8       dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+     _fjsp_v2r8       dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+     _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+     _fjsp_v2r8       dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+     _fjsp_v2r8       dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+     _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+     real             *charge;
+     int              nvdwtype;
+     _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+     int              *vdwtype;
+     real             *vdwparam;
+     _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+     _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+     _fjsp_v2r8       rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF,twovfeps;
+     real             *vftab;
+     _fjsp_v2r8       ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+     real             *ewtab;
+     _fjsp_v2r8       itab_tmp;
+     _fjsp_v2r8       dummy_mask,cutoff_mask;
+     _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+     _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+     union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+     x                = xx[0];
+     f                = ff[0];
+     nri              = nlist->nri;
+     iinr             = nlist->iinr;
+     jindex           = nlist->jindex;
+     jjnr             = nlist->jjnr;
+     shiftidx         = nlist->shift;
+     gid              = nlist->gid;
+     shiftvec         = fr->shift_vec[0];
+     fshift           = fr->fshift[0];
+     facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+     charge           = mdatoms->chargeA;
+     nvdwtype         = fr->ntype;
+     vdwparam         = fr->nbfp;
+     vdwtype          = mdatoms->typeA;
+     vftab            = kernel_data->table_vdw->data;
+     vftabscale       = gmx_fjsp_set1_v2r8(kernel_data->table_vdw->scale);
+     sh_ewald         = gmx_fjsp_set1_v2r8(fr->ic->sh_ewald);
+     ewtab            = fr->ic->tabq_coul_F;
+     ewtabscale       = gmx_fjsp_set1_v2r8(fr->ic->tabq_scale);
+     ewtabhalfspace   = gmx_fjsp_set1_v2r8(0.5/fr->ic->tabq_scale);
+     /* Setup water-specific parameters */
+     inr              = nlist->iinr[0];
+     iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+0]));
+     iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+     iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+     vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+     jq0              = gmx_fjsp_set1_v2r8(charge[inr+0]);
+     jq1              = gmx_fjsp_set1_v2r8(charge[inr+1]);
+     jq2              = gmx_fjsp_set1_v2r8(charge[inr+2]);
+     vdwjidx0A        = 2*vdwtype[inr+0];
+     qq00             = _fjsp_mul_v2r8(iq0,jq0);
+     c6_00            = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset0+vdwjidx0A]);
+     c12_00           = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset0+vdwjidx0A+1]);
+     qq01             = _fjsp_mul_v2r8(iq0,jq1);
+     qq02             = _fjsp_mul_v2r8(iq0,jq2);
+     qq10             = _fjsp_mul_v2r8(iq1,jq0);
+     qq11             = _fjsp_mul_v2r8(iq1,jq1);
+     qq12             = _fjsp_mul_v2r8(iq1,jq2);
+     qq20             = _fjsp_mul_v2r8(iq2,jq0);
+     qq21             = _fjsp_mul_v2r8(iq2,jq1);
+     qq22             = _fjsp_mul_v2r8(iq2,jq2);
+     /* Avoid stupid compiler warnings */
+     jnrA = jnrB = 0;
+     j_coord_offsetA = 0;
+     j_coord_offsetB = 0;
+     outeriter        = 0;
+     inneriter        = 0;
+     /* Start outer loop over neighborlists */
+     for(iidx=0; iidx<nri; iidx++)
+     {
+         /* Load shift vector for this list */
+         i_shift_offset   = DIM*shiftidx[iidx];
+         /* Load limits for loop over neighbors */
+         j_index_start    = jindex[iidx];
+         j_index_end      = jindex[iidx+1];
+         /* Get outer coordinate index */
+         inr              = iinr[iidx];
+         i_coord_offset   = DIM*inr;
+         /* Load i particle coords and add shift vector */
+         gmx_fjsp_load_shift_and_3rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                  &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+         fix0             = _fjsp_setzero_v2r8();
+         fiy0             = _fjsp_setzero_v2r8();
+         fiz0             = _fjsp_setzero_v2r8();
+         fix1             = _fjsp_setzero_v2r8();
+         fiy1             = _fjsp_setzero_v2r8();
+         fiz1             = _fjsp_setzero_v2r8();
+         fix2             = _fjsp_setzero_v2r8();
+         fiy2             = _fjsp_setzero_v2r8();
+         fiz2             = _fjsp_setzero_v2r8();
+         /* Start inner kernel loop */
+         for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+         {
+             /* Get j neighbor index, and coordinate index */
+             jnrA             = jjnr[jidx];
+             jnrB             = jjnr[jidx+1];
+             j_coord_offsetA  = DIM*jnrA;
+             j_coord_offsetB  = DIM*jnrB;
+             /* load j atom coordinates */
+             gmx_fjsp_load_3rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+             /* Calculate displacement vector */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             dx01             = _fjsp_sub_v2r8(ix0,jx1);
+             dy01             = _fjsp_sub_v2r8(iy0,jy1);
+             dz01             = _fjsp_sub_v2r8(iz0,jz1);
+             dx02             = _fjsp_sub_v2r8(ix0,jx2);
+             dy02             = _fjsp_sub_v2r8(iy0,jy2);
+             dz02             = _fjsp_sub_v2r8(iz0,jz2);
+             dx10             = _fjsp_sub_v2r8(ix1,jx0);
+             dy10             = _fjsp_sub_v2r8(iy1,jy0);
+             dz10             = _fjsp_sub_v2r8(iz1,jz0);
+             dx11             = _fjsp_sub_v2r8(ix1,jx1);
+             dy11             = _fjsp_sub_v2r8(iy1,jy1);
+             dz11             = _fjsp_sub_v2r8(iz1,jz1);
+             dx12             = _fjsp_sub_v2r8(ix1,jx2);
+             dy12             = _fjsp_sub_v2r8(iy1,jy2);
+             dz12             = _fjsp_sub_v2r8(iz1,jz2);
+             dx20             = _fjsp_sub_v2r8(ix2,jx0);
+             dy20             = _fjsp_sub_v2r8(iy2,jy0);
+             dz20             = _fjsp_sub_v2r8(iz2,jz0);
+             dx21             = _fjsp_sub_v2r8(ix2,jx1);
+             dy21             = _fjsp_sub_v2r8(iy2,jy1);
+             dz21             = _fjsp_sub_v2r8(iz2,jz1);
+             dx22             = _fjsp_sub_v2r8(ix2,jx2);
+             dy22             = _fjsp_sub_v2r8(iy2,jy2);
+             dz22             = _fjsp_sub_v2r8(iz2,jz2);
+             /* Calculate squared distance and things based on it */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rsq01            = gmx_fjsp_calc_rsq_v2r8(dx01,dy01,dz01);
+             rsq02            = gmx_fjsp_calc_rsq_v2r8(dx02,dy02,dz02);
+             rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+             rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+             rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+             rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+             rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+             rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+             rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+             rinv01           = gmx_fjsp_invsqrt_v2r8(rsq01);
+             rinv02           = gmx_fjsp_invsqrt_v2r8(rsq02);
+             rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+             rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+             rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+             rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+             rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+             rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+             rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+             rinvsq01         = _fjsp_mul_v2r8(rinv01,rinv01);
+             rinvsq02         = _fjsp_mul_v2r8(rinv02,rinv02);
+             rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+             rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+             rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+             rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+             rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+             rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+             fjx0             = _fjsp_setzero_v2r8();
+             fjy0             = _fjsp_setzero_v2r8();
+             fjz0             = _fjsp_setzero_v2r8();
+             fjx1             = _fjsp_setzero_v2r8();
+             fjy1             = _fjsp_setzero_v2r8();
+             fjz1             = _fjsp_setzero_v2r8();
+             fjx2             = _fjsp_setzero_v2r8();
+             fjy2             = _fjsp_setzero_v2r8();
+             fjz2             = _fjsp_setzero_v2r8();
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+             /* Calculate table index by multiplying r with table scale and truncate to integer */
+             rt               = _fjsp_mul_v2r8(r00,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 8;
+             vfconv.i[1]     *= 8;
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r00,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                          &ewtabF,&ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,rinv00),_fjsp_sub_v2r8(rinvsq00,felec));
+             /* CUBIC SPLINE TABLE DISPERSION */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 2 );
+             H                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 2 );
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+             FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+             fvdw6            = _fjsp_mul_v2r8(c6_00,FF);
+             /* CUBIC SPLINE TABLE REPULSION */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 4 );
+             F                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 4 );
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 6 );
+             H                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 6 );
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+             FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+             fvdw12           = _fjsp_mul_v2r8(c12_00,FF);
+             fvdw             = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_add_v2r8(fvdw6,fvdw12),_fjsp_mul_v2r8(vftabscale,rinv00)));
+             fscal            = _fjsp_add_v2r8(felec,fvdw);
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             
+             fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r01              = _fjsp_mul_v2r8(rsq01,rinv01);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r01,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                          &ewtabF,&ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq01,rinv01),_fjsp_sub_v2r8(rinvsq01,felec));
+             fscal            = felec;
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx01,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy01,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz01,fscal,fiz0);
+             
+             fjx1             = _fjsp_madd_v2r8(dx01,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy01,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz01,fscal,fjz1);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r02              = _fjsp_mul_v2r8(rsq02,rinv02);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r02,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                          &ewtabF,&ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq02,rinv02),_fjsp_sub_v2r8(rinvsq02,felec));
+             fscal            = felec;
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx02,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy02,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz02,fscal,fiz0);
+             
+             fjx2             = _fjsp_madd_v2r8(dx02,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy02,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz02,fscal,fjz2);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r10,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                          &ewtabF,&ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,rinv10),_fjsp_sub_v2r8(rinvsq10,felec));
+             fscal            = felec;
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+             
+             fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r11              = _fjsp_mul_v2r8(rsq11,rinv11);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r11,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                          &ewtabF,&ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq11,rinv11),_fjsp_sub_v2r8(rinvsq11,felec));
+             fscal            = felec;
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+             
+             fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r12              = _fjsp_mul_v2r8(rsq12,rinv12);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r12,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                          &ewtabF,&ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq12,rinv12),_fjsp_sub_v2r8(rinvsq12,felec));
+             fscal            = felec;
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+             
+             fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r20,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                          &ewtabF,&ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,rinv20),_fjsp_sub_v2r8(rinvsq20,felec));
+             fscal            = felec;
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+             
+             fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r21              = _fjsp_mul_v2r8(rsq21,rinv21);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r21,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                          &ewtabF,&ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq21,rinv21),_fjsp_sub_v2r8(rinvsq21,felec));
+             fscal            = felec;
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+             
+             fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r22              = _fjsp_mul_v2r8(rsq22,rinv22);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r22,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                          &ewtabF,&ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq22,rinv22),_fjsp_sub_v2r8(rinvsq22,felec));
+             fscal            = felec;
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+             
+             fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+             gmx_fjsp_decrement_3rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+             /* Inner loop uses 377 flops */
+         }
+         if(jidx<j_index_end)
+         {
+             jnrA             = jjnr[jidx];
+             j_coord_offsetA  = DIM*jnrA;
+             /* load j atom coordinates */
+             gmx_fjsp_load_3rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+             /* Calculate displacement vector */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             dx01             = _fjsp_sub_v2r8(ix0,jx1);
+             dy01             = _fjsp_sub_v2r8(iy0,jy1);
+             dz01             = _fjsp_sub_v2r8(iz0,jz1);
+             dx02             = _fjsp_sub_v2r8(ix0,jx2);
+             dy02             = _fjsp_sub_v2r8(iy0,jy2);
+             dz02             = _fjsp_sub_v2r8(iz0,jz2);
+             dx10             = _fjsp_sub_v2r8(ix1,jx0);
+             dy10             = _fjsp_sub_v2r8(iy1,jy0);
+             dz10             = _fjsp_sub_v2r8(iz1,jz0);
+             dx11             = _fjsp_sub_v2r8(ix1,jx1);
+             dy11             = _fjsp_sub_v2r8(iy1,jy1);
+             dz11             = _fjsp_sub_v2r8(iz1,jz1);
+             dx12             = _fjsp_sub_v2r8(ix1,jx2);
+             dy12             = _fjsp_sub_v2r8(iy1,jy2);
+             dz12             = _fjsp_sub_v2r8(iz1,jz2);
+             dx20             = _fjsp_sub_v2r8(ix2,jx0);
+             dy20             = _fjsp_sub_v2r8(iy2,jy0);
+             dz20             = _fjsp_sub_v2r8(iz2,jz0);
+             dx21             = _fjsp_sub_v2r8(ix2,jx1);
+             dy21             = _fjsp_sub_v2r8(iy2,jy1);
+             dz21             = _fjsp_sub_v2r8(iz2,jz1);
+             dx22             = _fjsp_sub_v2r8(ix2,jx2);
+             dy22             = _fjsp_sub_v2r8(iy2,jy2);
+             dz22             = _fjsp_sub_v2r8(iz2,jz2);
+             /* Calculate squared distance and things based on it */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rsq01            = gmx_fjsp_calc_rsq_v2r8(dx01,dy01,dz01);
+             rsq02            = gmx_fjsp_calc_rsq_v2r8(dx02,dy02,dz02);
+             rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+             rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+             rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+             rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+             rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+             rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+             rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+             rinv01           = gmx_fjsp_invsqrt_v2r8(rsq01);
+             rinv02           = gmx_fjsp_invsqrt_v2r8(rsq02);
+             rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+             rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+             rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+             rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+             rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+             rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+             rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+             rinvsq01         = _fjsp_mul_v2r8(rinv01,rinv01);
+             rinvsq02         = _fjsp_mul_v2r8(rinv02,rinv02);
+             rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+             rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+             rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+             rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+             rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+             rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+             fjx0             = _fjsp_setzero_v2r8();
+             fjy0             = _fjsp_setzero_v2r8();
+             fjz0             = _fjsp_setzero_v2r8();
+             fjx1             = _fjsp_setzero_v2r8();
+             fjy1             = _fjsp_setzero_v2r8();
+             fjz1             = _fjsp_setzero_v2r8();
+             fjx2             = _fjsp_setzero_v2r8();
+             fjy2             = _fjsp_setzero_v2r8();
+             fjz2             = _fjsp_setzero_v2r8();
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+             /* Calculate table index by multiplying r with table scale and truncate to integer */
+             rt               = _fjsp_mul_v2r8(r00,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 8;
+             vfconv.i[1]     *= 8;
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r00,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,rinv00),_fjsp_sub_v2r8(rinvsq00,felec));
+             /* CUBIC SPLINE TABLE DISPERSION */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 2 );
+             H                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+             FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+             fvdw6            = _fjsp_mul_v2r8(c6_00,FF);
+             /* CUBIC SPLINE TABLE REPULSION */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 4 );
+             F                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 6 );
+             H                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+             FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+             fvdw12           = _fjsp_mul_v2r8(c12_00,FF);
+             fvdw             = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_add_v2r8(fvdw6,fvdw12),_fjsp_mul_v2r8(vftabscale,rinv00)));
+             fscal            = _fjsp_add_v2r8(felec,fvdw);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             
+             fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r01              = _fjsp_mul_v2r8(rsq01,rinv01);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r01,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq01,rinv01),_fjsp_sub_v2r8(rinvsq01,felec));
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx01,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy01,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz01,fscal,fiz0);
+             
+             fjx1             = _fjsp_madd_v2r8(dx01,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy01,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz01,fscal,fjz1);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r02              = _fjsp_mul_v2r8(rsq02,rinv02);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r02,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq02,rinv02),_fjsp_sub_v2r8(rinvsq02,felec));
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx02,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy02,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz02,fscal,fiz0);
+             
+             fjx2             = _fjsp_madd_v2r8(dx02,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy02,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz02,fscal,fjz2);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r10,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,rinv10),_fjsp_sub_v2r8(rinvsq10,felec));
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+             
+             fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r11              = _fjsp_mul_v2r8(rsq11,rinv11);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r11,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq11,rinv11),_fjsp_sub_v2r8(rinvsq11,felec));
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+             
+             fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r12              = _fjsp_mul_v2r8(rsq12,rinv12);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r12,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq12,rinv12),_fjsp_sub_v2r8(rinvsq12,felec));
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+             
+             fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r20,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,rinv20),_fjsp_sub_v2r8(rinvsq20,felec));
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+             
+             fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r21              = _fjsp_mul_v2r8(rsq21,rinv21);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r21,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq21,rinv21),_fjsp_sub_v2r8(rinvsq21,felec));
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+             
+             fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r22              = _fjsp_mul_v2r8(rsq22,rinv22);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r22,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq22,rinv22),_fjsp_sub_v2r8(rinvsq22,felec));
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+             
+             fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+             gmx_fjsp_decrement_3rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+             /* Inner loop uses 377 flops */
+         }
+         /* End of innermost loop */
+         gmx_fjsp_update_iforce_3atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                               f+i_coord_offset,fshift+i_shift_offset);
+         /* Increment number of inner iterations */
+         inneriter                  += j_index_end - j_index_start;
+         /* Outer loop uses 18 flops */
+     }
+     /* Increment number of outer iterations */
+     outeriter        += nri;
+     /* Update outer/inner flops */
+     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_F,outeriter*18 + inneriter*377);
+ }
index 0000000000000000000000000000000000000000,554c18d6d70bb374e82e6d047b120f3576f936e9..554c18d6d70bb374e82e6d047b120f3576f936e9
mode 000000,100644..100644
--- /dev/null
@@@ -1,0 -1,1276 +1,1276 @@@
+ /*
+  * This file is part of the GROMACS molecular simulation package.
+  *
+  * Copyright (c) 2012, by the GROMACS development team, led by
+  * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+  * others, as listed in the AUTHORS file in the top-level source
+  * directory and at http://www.gromacs.org.
+  *
+  * GROMACS is free software; you can redistribute it and/or
+  * modify it under the terms of the GNU Lesser General Public License
+  * as published by the Free Software Foundation; either version 2.1
+  * of the License, or (at your option) any later version.
+  *
+  * GROMACS is distributed in the hope that it will be useful,
+  * but WITHOUT ANY WARRANTY; without even the implied warranty of
+  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  * Lesser General Public License for more details.
+  *
+  * You should have received a copy of the GNU Lesser General Public
+  * License along with GROMACS; if not, see
+  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+  *
+  * If you want to redistribute modifications to GROMACS, please
+  * consider that scientific software is very special. Version
+  * control is crucial - bugs must be traceable. We will be happy to
+  * consider code for inclusion in the official distribution, but
+  * derived work must not be called official GROMACS. Details are found
+  * in the README & COPYING files - if they are missing, get the
+  * official version at http://www.gromacs.org.
+  *
+  * To help us fund GROMACS development, we humbly ask that you cite
+  * the research papers on the package. Check out http://www.gromacs.org.
+  */
+ /*
+  * Note: this file was generated by the GROMACS sparc64_hpc_ace_double kernel generator.
+  */
+ #ifdef HAVE_CONFIG_H
+ #include <config.h>
+ #endif
+ #include <math.h>
+ #include "../nb_kernel.h"
+ #include "types/simple.h"
+ #include "vec.h"
+ #include "nrnb.h"
+ #include "kernelutil_sparc64_hpc_ace_double.h"
+ /*
+  * Gromacs nonbonded kernel:   nb_kernel_ElecEw_VdwCSTab_GeomW4P1_VF_sparc64_hpc_ace_double
+  * Electrostatics interaction: Ewald
+  * VdW interaction:            CubicSplineTable
+  * Geometry:                   Water4-Particle
+  * Calculate force/pot:        PotentialAndForce
+  */
+ void
+ nb_kernel_ElecEw_VdwCSTab_GeomW4P1_VF_sparc64_hpc_ace_double
+                     (t_nblist * gmx_restrict                nlist,
+                      rvec * gmx_restrict                    xx,
+                      rvec * gmx_restrict                    ff,
+                      t_forcerec * gmx_restrict              fr,
+                      t_mdatoms * gmx_restrict               mdatoms,
+                      nb_kernel_data_t * gmx_restrict        kernel_data,
+                      t_nrnb * gmx_restrict                  nrnb)
+ {
+     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+      * just 0 for non-waters.
+      * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+      * jnr indices corresponding to data put in the two positions in the SIMD register.
+      */
+     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+     int              jnrA,jnrB;
+     int              j_coord_offsetA,j_coord_offsetB;
+     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+     real             rcutoff_scalar;
+     real             *shiftvec,*fshift,*x,*f;
+     _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+     int              vdwioffset0;
+     _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+     int              vdwioffset1;
+     _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+     int              vdwioffset2;
+     _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+     int              vdwioffset3;
+     _fjsp_v2r8       ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+     int              vdwjidx0A,vdwjidx0B;
+     _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+     _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+     _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+     _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+     _fjsp_v2r8       dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30;
+     _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+     real             *charge;
+     int              nvdwtype;
+     _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+     int              *vdwtype;
+     real             *vdwparam;
+     _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+     _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+     _fjsp_v2r8       rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF,twovfeps;
+     real             *vftab;
+     _fjsp_v2r8       ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+     real             *ewtab;
+     _fjsp_v2r8       itab_tmp;
+     _fjsp_v2r8       dummy_mask,cutoff_mask;
+     _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+     _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+     union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+     x                = xx[0];
+     f                = ff[0];
+     nri              = nlist->nri;
+     iinr             = nlist->iinr;
+     jindex           = nlist->jindex;
+     jjnr             = nlist->jjnr;
+     shiftidx         = nlist->shift;
+     gid              = nlist->gid;
+     shiftvec         = fr->shift_vec[0];
+     fshift           = fr->fshift[0];
+     facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+     charge           = mdatoms->chargeA;
+     nvdwtype         = fr->ntype;
+     vdwparam         = fr->nbfp;
+     vdwtype          = mdatoms->typeA;
+     vftab            = kernel_data->table_vdw->data;
+     vftabscale       = gmx_fjsp_set1_v2r8(kernel_data->table_vdw->scale);
+     sh_ewald         = gmx_fjsp_set1_v2r8(fr->ic->sh_ewald);
+     ewtab            = fr->ic->tabq_coul_FDV0;
+     ewtabscale       = gmx_fjsp_set1_v2r8(fr->ic->tabq_scale);
+     ewtabhalfspace   = gmx_fjsp_set1_v2r8(0.5/fr->ic->tabq_scale);
+     /* Setup water-specific parameters */
+     inr              = nlist->iinr[0];
+     iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+     iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+     iq3              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+3]));
+     vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+     /* Avoid stupid compiler warnings */
+     jnrA = jnrB = 0;
+     j_coord_offsetA = 0;
+     j_coord_offsetB = 0;
+     outeriter        = 0;
+     inneriter        = 0;
+     /* Start outer loop over neighborlists */
+     for(iidx=0; iidx<nri; iidx++)
+     {
+         /* Load shift vector for this list */
+         i_shift_offset   = DIM*shiftidx[iidx];
+         /* Load limits for loop over neighbors */
+         j_index_start    = jindex[iidx];
+         j_index_end      = jindex[iidx+1];
+         /* Get outer coordinate index */
+         inr              = iinr[iidx];
+         i_coord_offset   = DIM*inr;
+         /* Load i particle coords and add shift vector */
+         gmx_fjsp_load_shift_and_4rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                  &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+         fix0             = _fjsp_setzero_v2r8();
+         fiy0             = _fjsp_setzero_v2r8();
+         fiz0             = _fjsp_setzero_v2r8();
+         fix1             = _fjsp_setzero_v2r8();
+         fiy1             = _fjsp_setzero_v2r8();
+         fiz1             = _fjsp_setzero_v2r8();
+         fix2             = _fjsp_setzero_v2r8();
+         fiy2             = _fjsp_setzero_v2r8();
+         fiz2             = _fjsp_setzero_v2r8();
+         fix3             = _fjsp_setzero_v2r8();
+         fiy3             = _fjsp_setzero_v2r8();
+         fiz3             = _fjsp_setzero_v2r8();
+         /* Reset potential sums */
+         velecsum         = _fjsp_setzero_v2r8();
+         vvdwsum          = _fjsp_setzero_v2r8();
+         /* Start inner kernel loop */
+         for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+         {
+             /* Get j neighbor index, and coordinate index */
+             jnrA             = jjnr[jidx];
+             jnrB             = jjnr[jidx+1];
+             j_coord_offsetA  = DIM*jnrA;
+             j_coord_offsetB  = DIM*jnrB;
+             /* load j atom coordinates */
+             gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                               &jx0,&jy0,&jz0);
+             /* Calculate displacement vector */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             dx10             = _fjsp_sub_v2r8(ix1,jx0);
+             dy10             = _fjsp_sub_v2r8(iy1,jy0);
+             dz10             = _fjsp_sub_v2r8(iz1,jz0);
+             dx20             = _fjsp_sub_v2r8(ix2,jx0);
+             dy20             = _fjsp_sub_v2r8(iy2,jy0);
+             dz20             = _fjsp_sub_v2r8(iz2,jz0);
+             dx30             = _fjsp_sub_v2r8(ix3,jx0);
+             dy30             = _fjsp_sub_v2r8(iy3,jy0);
+             dz30             = _fjsp_sub_v2r8(iz3,jz0);
+             /* Calculate squared distance and things based on it */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+             rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+             rsq30            = gmx_fjsp_calc_rsq_v2r8(dx30,dy30,dz30);
+             rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+             rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+             rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+             rinv30           = gmx_fjsp_invsqrt_v2r8(rsq30);
+             rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+             rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+             rinvsq30         = _fjsp_mul_v2r8(rinv30,rinv30);
+             /* Load parameters for j particles */
+             jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+             vdwjidx0A        = 2*vdwtype[jnrA+0];
+             vdwjidx0B        = 2*vdwtype[jnrB+0];
+             fjx0             = _fjsp_setzero_v2r8();
+             fjy0             = _fjsp_setzero_v2r8();
+             fjz0             = _fjsp_setzero_v2r8();
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+             /* Compute parameters for interactions between i and j atoms */
+             gmx_fjsp_load_2pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,
+                                          vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
+             /* Calculate table index by multiplying r with table scale and truncate to integer */
+             rt               = _fjsp_mul_v2r8(r00,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 8;
+             vfconv.i[1]     *= 8;
+             /* CUBIC SPLINE TABLE DISPERSION */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 2 );
+             H                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 2 );
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+             VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+             vvdw6            = _fjsp_mul_v2r8(c6_00,VV);
+             FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+             fvdw6            = _fjsp_mul_v2r8(c6_00,FF);
+             /* CUBIC SPLINE TABLE REPULSION */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 4 );
+             F                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 4 );
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 6 );
+             H                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 6 );
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+             VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+             vvdw12           = _fjsp_mul_v2r8(c12_00,VV);
+             FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+             fvdw12           = _fjsp_mul_v2r8(c12_00,FF);
+             vvdw             = _fjsp_add_v2r8(vvdw12,vvdw6);
+             fvdw             = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_add_v2r8(fvdw6,fvdw12),_fjsp_mul_v2r8(vftabscale,rinv00)));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+             fscal            = fvdw;
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             
+             fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+             /* Compute parameters for interactions between i and j atoms */
+             qq10             = _fjsp_mul_v2r8(iq1,jq0);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r10,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq10,_fjsp_sub_v2r8(rinv10,velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,rinv10),_fjsp_sub_v2r8(rinvsq10,felec));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+             
+             fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+             /* Compute parameters for interactions between i and j atoms */
+             qq20             = _fjsp_mul_v2r8(iq2,jq0);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r20,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq20,_fjsp_sub_v2r8(rinv20,velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,rinv20),_fjsp_sub_v2r8(rinvsq20,felec));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+             
+             fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r30              = _fjsp_mul_v2r8(rsq30,rinv30);
+             /* Compute parameters for interactions between i and j atoms */
+             qq30             = _fjsp_mul_v2r8(iq3,jq0);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r30,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq30,_fjsp_sub_v2r8(rinv30,velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq30,rinv30),_fjsp_sub_v2r8(rinvsq30,felec));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             /* Update vectorial force */
+             fix3             = _fjsp_madd_v2r8(dx30,fscal,fix3);
+             fiy3             = _fjsp_madd_v2r8(dy30,fscal,fiy3);
+             fiz3             = _fjsp_madd_v2r8(dz30,fscal,fiz3);
+             
+             fjx0             = _fjsp_madd_v2r8(dx30,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy30,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz30,fscal,fjz0);
+             gmx_fjsp_decrement_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0);
+             /* Inner loop uses 194 flops */
+         }
+         if(jidx<j_index_end)
+         {
+             jnrA             = jjnr[jidx];
+             j_coord_offsetA  = DIM*jnrA;
+             /* load j atom coordinates */
+             gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                               &jx0,&jy0,&jz0);
+             /* Calculate displacement vector */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             dx10             = _fjsp_sub_v2r8(ix1,jx0);
+             dy10             = _fjsp_sub_v2r8(iy1,jy0);
+             dz10             = _fjsp_sub_v2r8(iz1,jz0);
+             dx20             = _fjsp_sub_v2r8(ix2,jx0);
+             dy20             = _fjsp_sub_v2r8(iy2,jy0);
+             dz20             = _fjsp_sub_v2r8(iz2,jz0);
+             dx30             = _fjsp_sub_v2r8(ix3,jx0);
+             dy30             = _fjsp_sub_v2r8(iy3,jy0);
+             dz30             = _fjsp_sub_v2r8(iz3,jz0);
+             /* Calculate squared distance and things based on it */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+             rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+             rsq30            = gmx_fjsp_calc_rsq_v2r8(dx30,dy30,dz30);
+             rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+             rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+             rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+             rinv30           = gmx_fjsp_invsqrt_v2r8(rsq30);
+             rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+             rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+             rinvsq30         = _fjsp_mul_v2r8(rinv30,rinv30);
+             /* Load parameters for j particles */
+             jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
+             vdwjidx0A        = 2*vdwtype[jnrA+0];
+             fjx0             = _fjsp_setzero_v2r8();
+             fjy0             = _fjsp_setzero_v2r8();
+             fjz0             = _fjsp_setzero_v2r8();
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+             /* Compute parameters for interactions between i and j atoms */
+             gmx_fjsp_load_1pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
+             /* Calculate table index by multiplying r with table scale and truncate to integer */
+             rt               = _fjsp_mul_v2r8(r00,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 8;
+             vfconv.i[1]     *= 8;
+             /* CUBIC SPLINE TABLE DISPERSION */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 2 );
+             H                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+             VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+             vvdw6            = _fjsp_mul_v2r8(c6_00,VV);
+             FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+             fvdw6            = _fjsp_mul_v2r8(c6_00,FF);
+             /* CUBIC SPLINE TABLE REPULSION */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 4 );
+             F                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 6 );
+             H                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+             VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+             vvdw12           = _fjsp_mul_v2r8(c12_00,VV);
+             FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+             fvdw12           = _fjsp_mul_v2r8(c12_00,FF);
+             vvdw             = _fjsp_add_v2r8(vvdw12,vvdw6);
+             fvdw             = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_add_v2r8(fvdw6,fvdw12),_fjsp_mul_v2r8(vftabscale,rinv00)));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             vvdw             = _fjsp_unpacklo_v2r8(vvdw,_fjsp_setzero_v2r8());
+             vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+             fscal            = fvdw;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             
+             fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+             /* Compute parameters for interactions between i and j atoms */
+             qq10             = _fjsp_mul_v2r8(iq1,jq0);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r10,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq10,_fjsp_sub_v2r8(rinv10,velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,rinv10),_fjsp_sub_v2r8(rinvsq10,felec));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+             
+             fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+             /* Compute parameters for interactions between i and j atoms */
+             qq20             = _fjsp_mul_v2r8(iq2,jq0);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r20,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq20,_fjsp_sub_v2r8(rinv20,velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,rinv20),_fjsp_sub_v2r8(rinvsq20,felec));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+             
+             fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r30              = _fjsp_mul_v2r8(rsq30,rinv30);
+             /* Compute parameters for interactions between i and j atoms */
+             qq30             = _fjsp_mul_v2r8(iq3,jq0);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r30,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq30,_fjsp_sub_v2r8(rinv30,velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq30,rinv30),_fjsp_sub_v2r8(rinvsq30,felec));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix3             = _fjsp_madd_v2r8(dx30,fscal,fix3);
+             fiy3             = _fjsp_madd_v2r8(dy30,fscal,fiy3);
+             fiz3             = _fjsp_madd_v2r8(dz30,fscal,fiz3);
+             
+             fjx0             = _fjsp_madd_v2r8(dx30,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy30,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz30,fscal,fjz0);
+             gmx_fjsp_decrement_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0);
+             /* Inner loop uses 194 flops */
+         }
+         /* End of innermost loop */
+         gmx_fjsp_update_iforce_4atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                               f+i_coord_offset,fshift+i_shift_offset);
+         ggid                        = gid[iidx];
+         /* Update potential energies */
+         gmx_fjsp_update_1pot_v2r8(velecsum,kernel_data->energygrp_elec+ggid);
+         gmx_fjsp_update_1pot_v2r8(vvdwsum,kernel_data->energygrp_vdw+ggid);
+         /* Increment number of inner iterations */
+         inneriter                  += j_index_end - j_index_start;
+         /* Outer loop uses 26 flops */
+     }
+     /* Increment number of outer iterations */
+     outeriter        += nri;
+     /* Update outer/inner flops */
+     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_VF,outeriter*26 + inneriter*194);
+ }
+ /*
+  * Gromacs nonbonded kernel:   nb_kernel_ElecEw_VdwCSTab_GeomW4P1_F_sparc64_hpc_ace_double
+  * Electrostatics interaction: Ewald
+  * VdW interaction:            CubicSplineTable
+  * Geometry:                   Water4-Particle
+  * Calculate force/pot:        Force
+  */
+ void
+ nb_kernel_ElecEw_VdwCSTab_GeomW4P1_F_sparc64_hpc_ace_double
+                     (t_nblist * gmx_restrict                nlist,
+                      rvec * gmx_restrict                    xx,
+                      rvec * gmx_restrict                    ff,
+                      t_forcerec * gmx_restrict              fr,
+                      t_mdatoms * gmx_restrict               mdatoms,
+                      nb_kernel_data_t * gmx_restrict        kernel_data,
+                      t_nrnb * gmx_restrict                  nrnb)
+ {
+     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+      * just 0 for non-waters.
+      * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+      * jnr indices corresponding to data put in the two positions in the SIMD register.
+      */
+     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+     int              jnrA,jnrB;
+     int              j_coord_offsetA,j_coord_offsetB;
+     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+     real             rcutoff_scalar;
+     real             *shiftvec,*fshift,*x,*f;
+     _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+     int              vdwioffset0;
+     _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+     int              vdwioffset1;
+     _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+     int              vdwioffset2;
+     _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+     int              vdwioffset3;
+     _fjsp_v2r8       ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+     int              vdwjidx0A,vdwjidx0B;
+     _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+     _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+     _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+     _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+     _fjsp_v2r8       dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30;
+     _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+     real             *charge;
+     int              nvdwtype;
+     _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+     int              *vdwtype;
+     real             *vdwparam;
+     _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+     _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+     _fjsp_v2r8       rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF,twovfeps;
+     real             *vftab;
+     _fjsp_v2r8       ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+     real             *ewtab;
+     _fjsp_v2r8       itab_tmp;
+     _fjsp_v2r8       dummy_mask,cutoff_mask;
+     _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+     _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+     union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+     x                = xx[0];
+     f                = ff[0];
+     nri              = nlist->nri;
+     iinr             = nlist->iinr;
+     jindex           = nlist->jindex;
+     jjnr             = nlist->jjnr;
+     shiftidx         = nlist->shift;
+     gid              = nlist->gid;
+     shiftvec         = fr->shift_vec[0];
+     fshift           = fr->fshift[0];
+     facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+     charge           = mdatoms->chargeA;
+     nvdwtype         = fr->ntype;
+     vdwparam         = fr->nbfp;
+     vdwtype          = mdatoms->typeA;
+     vftab            = kernel_data->table_vdw->data;
+     vftabscale       = gmx_fjsp_set1_v2r8(kernel_data->table_vdw->scale);
+     sh_ewald         = gmx_fjsp_set1_v2r8(fr->ic->sh_ewald);
+     ewtab            = fr->ic->tabq_coul_F;
+     ewtabscale       = gmx_fjsp_set1_v2r8(fr->ic->tabq_scale);
+     ewtabhalfspace   = gmx_fjsp_set1_v2r8(0.5/fr->ic->tabq_scale);
+     /* Setup water-specific parameters */
+     inr              = nlist->iinr[0];
+     iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+     iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+     iq3              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+3]));
+     vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+     /* Avoid stupid compiler warnings */
+     jnrA = jnrB = 0;
+     j_coord_offsetA = 0;
+     j_coord_offsetB = 0;
+     outeriter        = 0;
+     inneriter        = 0;
+     /* Start outer loop over neighborlists */
+     for(iidx=0; iidx<nri; iidx++)
+     {
+         /* Load shift vector for this list */
+         i_shift_offset   = DIM*shiftidx[iidx];
+         /* Load limits for loop over neighbors */
+         j_index_start    = jindex[iidx];
+         j_index_end      = jindex[iidx+1];
+         /* Get outer coordinate index */
+         inr              = iinr[iidx];
+         i_coord_offset   = DIM*inr;
+         /* Load i particle coords and add shift vector */
+         gmx_fjsp_load_shift_and_4rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                  &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+         fix0             = _fjsp_setzero_v2r8();
+         fiy0             = _fjsp_setzero_v2r8();
+         fiz0             = _fjsp_setzero_v2r8();
+         fix1             = _fjsp_setzero_v2r8();
+         fiy1             = _fjsp_setzero_v2r8();
+         fiz1             = _fjsp_setzero_v2r8();
+         fix2             = _fjsp_setzero_v2r8();
+         fiy2             = _fjsp_setzero_v2r8();
+         fiz2             = _fjsp_setzero_v2r8();
+         fix3             = _fjsp_setzero_v2r8();
+         fiy3             = _fjsp_setzero_v2r8();
+         fiz3             = _fjsp_setzero_v2r8();
+         /* Start inner kernel loop */
+         for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+         {
+             /* Get j neighbor index, and coordinate index */
+             jnrA             = jjnr[jidx];
+             jnrB             = jjnr[jidx+1];
+             j_coord_offsetA  = DIM*jnrA;
+             j_coord_offsetB  = DIM*jnrB;
+             /* load j atom coordinates */
+             gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                               &jx0,&jy0,&jz0);
+             /* Calculate displacement vector */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             dx10             = _fjsp_sub_v2r8(ix1,jx0);
+             dy10             = _fjsp_sub_v2r8(iy1,jy0);
+             dz10             = _fjsp_sub_v2r8(iz1,jz0);
+             dx20             = _fjsp_sub_v2r8(ix2,jx0);
+             dy20             = _fjsp_sub_v2r8(iy2,jy0);
+             dz20             = _fjsp_sub_v2r8(iz2,jz0);
+             dx30             = _fjsp_sub_v2r8(ix3,jx0);
+             dy30             = _fjsp_sub_v2r8(iy3,jy0);
+             dz30             = _fjsp_sub_v2r8(iz3,jz0);
+             /* Calculate squared distance and things based on it */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+             rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+             rsq30            = gmx_fjsp_calc_rsq_v2r8(dx30,dy30,dz30);
+             rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+             rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+             rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+             rinv30           = gmx_fjsp_invsqrt_v2r8(rsq30);
+             rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+             rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+             rinvsq30         = _fjsp_mul_v2r8(rinv30,rinv30);
+             /* Load parameters for j particles */
+             jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+             vdwjidx0A        = 2*vdwtype[jnrA+0];
+             vdwjidx0B        = 2*vdwtype[jnrB+0];
+             fjx0             = _fjsp_setzero_v2r8();
+             fjy0             = _fjsp_setzero_v2r8();
+             fjz0             = _fjsp_setzero_v2r8();
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+             /* Compute parameters for interactions between i and j atoms */
+             gmx_fjsp_load_2pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,
+                                          vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
+             /* Calculate table index by multiplying r with table scale and truncate to integer */
+             rt               = _fjsp_mul_v2r8(r00,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 8;
+             vfconv.i[1]     *= 8;
+             /* CUBIC SPLINE TABLE DISPERSION */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 2 );
+             H                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 2 );
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+             FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+             fvdw6            = _fjsp_mul_v2r8(c6_00,FF);
+             /* CUBIC SPLINE TABLE REPULSION */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 4 );
+             F                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 4 );
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 6 );
+             H                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 6 );
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+             FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+             fvdw12           = _fjsp_mul_v2r8(c12_00,FF);
+             fvdw             = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_add_v2r8(fvdw6,fvdw12),_fjsp_mul_v2r8(vftabscale,rinv00)));
+             fscal            = fvdw;
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             
+             fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+             /* Compute parameters for interactions between i and j atoms */
+             qq10             = _fjsp_mul_v2r8(iq1,jq0);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r10,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                          &ewtabF,&ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,rinv10),_fjsp_sub_v2r8(rinvsq10,felec));
+             fscal            = felec;
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+             
+             fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+             /* Compute parameters for interactions between i and j atoms */
+             qq20             = _fjsp_mul_v2r8(iq2,jq0);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r20,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                          &ewtabF,&ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,rinv20),_fjsp_sub_v2r8(rinvsq20,felec));
+             fscal            = felec;
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+             
+             fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r30              = _fjsp_mul_v2r8(rsq30,rinv30);
+             /* Compute parameters for interactions between i and j atoms */
+             qq30             = _fjsp_mul_v2r8(iq3,jq0);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r30,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                          &ewtabF,&ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq30,rinv30),_fjsp_sub_v2r8(rinvsq30,felec));
+             fscal            = felec;
+             /* Update vectorial force */
+             fix3             = _fjsp_madd_v2r8(dx30,fscal,fix3);
+             fiy3             = _fjsp_madd_v2r8(dy30,fscal,fiy3);
+             fiz3             = _fjsp_madd_v2r8(dz30,fscal,fiz3);
+             
+             fjx0             = _fjsp_madd_v2r8(dx30,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy30,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz30,fscal,fjz0);
+             gmx_fjsp_decrement_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0);
+             /* Inner loop uses 171 flops */
+         }
+         if(jidx<j_index_end)
+         {
+             jnrA             = jjnr[jidx];
+             j_coord_offsetA  = DIM*jnrA;
+             /* load j atom coordinates */
+             gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                               &jx0,&jy0,&jz0);
+             /* Calculate displacement vector */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             dx10             = _fjsp_sub_v2r8(ix1,jx0);
+             dy10             = _fjsp_sub_v2r8(iy1,jy0);
+             dz10             = _fjsp_sub_v2r8(iz1,jz0);
+             dx20             = _fjsp_sub_v2r8(ix2,jx0);
+             dy20             = _fjsp_sub_v2r8(iy2,jy0);
+             dz20             = _fjsp_sub_v2r8(iz2,jz0);
+             dx30             = _fjsp_sub_v2r8(ix3,jx0);
+             dy30             = _fjsp_sub_v2r8(iy3,jy0);
+             dz30             = _fjsp_sub_v2r8(iz3,jz0);
+             /* Calculate squared distance and things based on it */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+             rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+             rsq30            = gmx_fjsp_calc_rsq_v2r8(dx30,dy30,dz30);
+             rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+             rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+             rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+             rinv30           = gmx_fjsp_invsqrt_v2r8(rsq30);
+             rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+             rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+             rinvsq30         = _fjsp_mul_v2r8(rinv30,rinv30);
+             /* Load parameters for j particles */
+             jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
+             vdwjidx0A        = 2*vdwtype[jnrA+0];
+             fjx0             = _fjsp_setzero_v2r8();
+             fjy0             = _fjsp_setzero_v2r8();
+             fjz0             = _fjsp_setzero_v2r8();
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+             /* Compute parameters for interactions between i and j atoms */
+             gmx_fjsp_load_1pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
+             /* Calculate table index by multiplying r with table scale and truncate to integer */
+             rt               = _fjsp_mul_v2r8(r00,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 8;
+             vfconv.i[1]     *= 8;
+             /* CUBIC SPLINE TABLE DISPERSION */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 2 );
+             H                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+             FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+             fvdw6            = _fjsp_mul_v2r8(c6_00,FF);
+             /* CUBIC SPLINE TABLE REPULSION */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 4 );
+             F                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 6 );
+             H                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+             FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+             fvdw12           = _fjsp_mul_v2r8(c12_00,FF);
+             fvdw             = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_add_v2r8(fvdw6,fvdw12),_fjsp_mul_v2r8(vftabscale,rinv00)));
+             fscal            = fvdw;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             
+             fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+             /* Compute parameters for interactions between i and j atoms */
+             qq10             = _fjsp_mul_v2r8(iq1,jq0);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r10,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,rinv10),_fjsp_sub_v2r8(rinvsq10,felec));
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+             
+             fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+             /* Compute parameters for interactions between i and j atoms */
+             qq20             = _fjsp_mul_v2r8(iq2,jq0);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r20,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,rinv20),_fjsp_sub_v2r8(rinvsq20,felec));
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+             
+             fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r30              = _fjsp_mul_v2r8(rsq30,rinv30);
+             /* Compute parameters for interactions between i and j atoms */
+             qq30             = _fjsp_mul_v2r8(iq3,jq0);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r30,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq30,rinv30),_fjsp_sub_v2r8(rinvsq30,felec));
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix3             = _fjsp_madd_v2r8(dx30,fscal,fix3);
+             fiy3             = _fjsp_madd_v2r8(dy30,fscal,fiy3);
+             fiz3             = _fjsp_madd_v2r8(dz30,fscal,fiz3);
+             
+             fjx0             = _fjsp_madd_v2r8(dx30,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy30,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz30,fscal,fjz0);
+             gmx_fjsp_decrement_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0);
+             /* Inner loop uses 171 flops */
+         }
+         /* End of innermost loop */
+         gmx_fjsp_update_iforce_4atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                               f+i_coord_offset,fshift+i_shift_offset);
+         /* Increment number of inner iterations */
+         inneriter                  += j_index_end - j_index_start;
+         /* Outer loop uses 24 flops */
+     }
+     /* Increment number of outer iterations */
+     outeriter        += nri;
+     /* Update outer/inner flops */
+     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_F,outeriter*24 + inneriter*171);
+ }
index 0000000000000000000000000000000000000000,63d5a1837fab697816f7460d9675fdcdf4a0c68f..63d5a1837fab697816f7460d9675fdcdf4a0c68f
mode 000000,100644..100644
--- /dev/null
@@@ -1,0 -1,2300 +1,2300 @@@
+ /*
+  * This file is part of the GROMACS molecular simulation package.
+  *
+  * Copyright (c) 2012, by the GROMACS development team, led by
+  * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+  * others, as listed in the AUTHORS file in the top-level source
+  * directory and at http://www.gromacs.org.
+  *
+  * GROMACS is free software; you can redistribute it and/or
+  * modify it under the terms of the GNU Lesser General Public License
+  * as published by the Free Software Foundation; either version 2.1
+  * of the License, or (at your option) any later version.
+  *
+  * GROMACS is distributed in the hope that it will be useful,
+  * but WITHOUT ANY WARRANTY; without even the implied warranty of
+  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  * Lesser General Public License for more details.
+  *
+  * You should have received a copy of the GNU Lesser General Public
+  * License along with GROMACS; if not, see
+  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+  *
+  * If you want to redistribute modifications to GROMACS, please
+  * consider that scientific software is very special. Version
+  * control is crucial - bugs must be traceable. We will be happy to
+  * consider code for inclusion in the official distribution, but
+  * derived work must not be called official GROMACS. Details are found
+  * in the README & COPYING files - if they are missing, get the
+  * official version at http://www.gromacs.org.
+  *
+  * To help us fund GROMACS development, we humbly ask that you cite
+  * the research papers on the package. Check out http://www.gromacs.org.
+  */
+ /*
+  * Note: this file was generated by the GROMACS sparc64_hpc_ace_double kernel generator.
+  */
+ #ifdef HAVE_CONFIG_H
+ #include <config.h>
+ #endif
+ #include <math.h>
+ #include "../nb_kernel.h"
+ #include "types/simple.h"
+ #include "vec.h"
+ #include "nrnb.h"
+ #include "kernelutil_sparc64_hpc_ace_double.h"
+ /*
+  * Gromacs nonbonded kernel:   nb_kernel_ElecEw_VdwCSTab_GeomW4W4_VF_sparc64_hpc_ace_double
+  * Electrostatics interaction: Ewald
+  * VdW interaction:            CubicSplineTable
+  * Geometry:                   Water4-Water4
+  * Calculate force/pot:        PotentialAndForce
+  */
+ void
+ nb_kernel_ElecEw_VdwCSTab_GeomW4W4_VF_sparc64_hpc_ace_double
+                     (t_nblist * gmx_restrict                nlist,
+                      rvec * gmx_restrict                    xx,
+                      rvec * gmx_restrict                    ff,
+                      t_forcerec * gmx_restrict              fr,
+                      t_mdatoms * gmx_restrict               mdatoms,
+                      nb_kernel_data_t * gmx_restrict        kernel_data,
+                      t_nrnb * gmx_restrict                  nrnb)
+ {
+     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+      * just 0 for non-waters.
+      * Suffixes A,B refer to j loop unrolling done with double precision SIMD, i.e. the two different
+      * jnr indices corresponding to data put in the two positions in the SIMD register.
+      */
+     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+     int              jnrA,jnrB;
+     int              j_coord_offsetA,j_coord_offsetB;
+     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+     real             rcutoff_scalar;
+     real             *shiftvec,*fshift,*x,*f;
+     _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+     int              vdwioffset0;
+     _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+     int              vdwioffset1;
+     _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+     int              vdwioffset2;
+     _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+     int              vdwioffset3;
+     _fjsp_v2r8       ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+     int              vdwjidx0A,vdwjidx0B;
+     _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+     int              vdwjidx1A,vdwjidx1B;
+     _fjsp_v2r8       jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+     int              vdwjidx2A,vdwjidx2B;
+     _fjsp_v2r8       jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+     int              vdwjidx3A,vdwjidx3B;
+     _fjsp_v2r8       jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
+     _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+     _fjsp_v2r8       dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+     _fjsp_v2r8       dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+     _fjsp_v2r8       dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
+     _fjsp_v2r8       dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+     _fjsp_v2r8       dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+     _fjsp_v2r8       dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
+     _fjsp_v2r8       dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
+     _fjsp_v2r8       dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
+     _fjsp_v2r8       dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
+     _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+     real             *charge;
+     int              nvdwtype;
+     _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+     int              *vdwtype;
+     real             *vdwparam;
+     _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+     _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+     _fjsp_v2r8       rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF,twovfeps;
+     real             *vftab;
+     _fjsp_v2r8       ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+     real             *ewtab;
+     _fjsp_v2r8       itab_tmp;
+     _fjsp_v2r8       dummy_mask,cutoff_mask;
+     _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+     _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+     union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+     x                = xx[0];
+     f                = ff[0];
+     nri              = nlist->nri;
+     iinr             = nlist->iinr;
+     jindex           = nlist->jindex;
+     jjnr             = nlist->jjnr;
+     shiftidx         = nlist->shift;
+     gid              = nlist->gid;
+     shiftvec         = fr->shift_vec[0];
+     fshift           = fr->fshift[0];
+     facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+     charge           = mdatoms->chargeA;
+     nvdwtype         = fr->ntype;
+     vdwparam         = fr->nbfp;
+     vdwtype          = mdatoms->typeA;
+     vftab            = kernel_data->table_vdw->data;
+     vftabscale       = gmx_fjsp_set1_v2r8(kernel_data->table_vdw->scale);
+     sh_ewald         = gmx_fjsp_set1_v2r8(fr->ic->sh_ewald);
+     ewtab            = fr->ic->tabq_coul_FDV0;
+     ewtabscale       = gmx_fjsp_set1_v2r8(fr->ic->tabq_scale);
+     ewtabhalfspace   = gmx_fjsp_set1_v2r8(0.5/fr->ic->tabq_scale);
+     /* Setup water-specific parameters */
+     inr              = nlist->iinr[0];
+     iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+     iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+     iq3              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+3]));
+     vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+     jq1              = gmx_fjsp_set1_v2r8(charge[inr+1]);
+     jq2              = gmx_fjsp_set1_v2r8(charge[inr+2]);
+     jq3              = gmx_fjsp_set1_v2r8(charge[inr+3]);
+     vdwjidx0A        = 2*vdwtype[inr+0];
+     c6_00            = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset0+vdwjidx0A]);
+     c12_00           = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset0+vdwjidx0A+1]);
+     qq11             = _fjsp_mul_v2r8(iq1,jq1);
+     qq12             = _fjsp_mul_v2r8(iq1,jq2);
+     qq13             = _fjsp_mul_v2r8(iq1,jq3);
+     qq21             = _fjsp_mul_v2r8(iq2,jq1);
+     qq22             = _fjsp_mul_v2r8(iq2,jq2);
+     qq23             = _fjsp_mul_v2r8(iq2,jq3);
+     qq31             = _fjsp_mul_v2r8(iq3,jq1);
+     qq32             = _fjsp_mul_v2r8(iq3,jq2);
+     qq33             = _fjsp_mul_v2r8(iq3,jq3);
+     /* Avoid stupid compiler warnings */
+     jnrA = jnrB = 0;
+     j_coord_offsetA = 0;
+     j_coord_offsetB = 0;
+     outeriter        = 0;
+     inneriter        = 0;
+     /* Start outer loop over neighborlists */
+     for(iidx=0; iidx<nri; iidx++)
+     {
+         /* Load shift vector for this list */
+         i_shift_offset   = DIM*shiftidx[iidx];
+         /* Load limits for loop over neighbors */
+         j_index_start    = jindex[iidx];
+         j_index_end      = jindex[iidx+1];
+         /* Get outer coordinate index */
+         inr              = iinr[iidx];
+         i_coord_offset   = DIM*inr;
+         /* Load i particle coords and add shift vector */
+         gmx_fjsp_load_shift_and_4rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                  &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+         fix0             = _fjsp_setzero_v2r8();
+         fiy0             = _fjsp_setzero_v2r8();
+         fiz0             = _fjsp_setzero_v2r8();
+         fix1             = _fjsp_setzero_v2r8();
+         fiy1             = _fjsp_setzero_v2r8();
+         fiz1             = _fjsp_setzero_v2r8();
+         fix2             = _fjsp_setzero_v2r8();
+         fiy2             = _fjsp_setzero_v2r8();
+         fiz2             = _fjsp_setzero_v2r8();
+         fix3             = _fjsp_setzero_v2r8();
+         fiy3             = _fjsp_setzero_v2r8();
+         fiz3             = _fjsp_setzero_v2r8();
+         /* Reset potential sums */
+         velecsum         = _fjsp_setzero_v2r8();
+         vvdwsum          = _fjsp_setzero_v2r8();
+         /* Start inner kernel loop */
+         for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+         {
+             /* Get j neighbor index, and coordinate index */
+             jnrA             = jjnr[jidx];
+             jnrB             = jjnr[jidx+1];
+             j_coord_offsetA  = DIM*jnrA;
+             j_coord_offsetB  = DIM*jnrB;
+             /* load j atom coordinates */
+             gmx_fjsp_load_4rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
+                                               &jy2,&jz2,&jx3,&jy3,&jz3);
+             /* Calculate displacement vector */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             dx11             = _fjsp_sub_v2r8(ix1,jx1);
+             dy11             = _fjsp_sub_v2r8(iy1,jy1);
+             dz11             = _fjsp_sub_v2r8(iz1,jz1);
+             dx12             = _fjsp_sub_v2r8(ix1,jx2);
+             dy12             = _fjsp_sub_v2r8(iy1,jy2);
+             dz12             = _fjsp_sub_v2r8(iz1,jz2);
+             dx13             = _fjsp_sub_v2r8(ix1,jx3);
+             dy13             = _fjsp_sub_v2r8(iy1,jy3);
+             dz13             = _fjsp_sub_v2r8(iz1,jz3);
+             dx21             = _fjsp_sub_v2r8(ix2,jx1);
+             dy21             = _fjsp_sub_v2r8(iy2,jy1);
+             dz21             = _fjsp_sub_v2r8(iz2,jz1);
+             dx22             = _fjsp_sub_v2r8(ix2,jx2);
+             dy22             = _fjsp_sub_v2r8(iy2,jy2);
+             dz22             = _fjsp_sub_v2r8(iz2,jz2);
+             dx23             = _fjsp_sub_v2r8(ix2,jx3);
+             dy23             = _fjsp_sub_v2r8(iy2,jy3);
+             dz23             = _fjsp_sub_v2r8(iz2,jz3);
+             dx31             = _fjsp_sub_v2r8(ix3,jx1);
+             dy31             = _fjsp_sub_v2r8(iy3,jy1);
+             dz31             = _fjsp_sub_v2r8(iz3,jz1);
+             dx32             = _fjsp_sub_v2r8(ix3,jx2);
+             dy32             = _fjsp_sub_v2r8(iy3,jy2);
+             dz32             = _fjsp_sub_v2r8(iz3,jz2);
+             dx33             = _fjsp_sub_v2r8(ix3,jx3);
+             dy33             = _fjsp_sub_v2r8(iy3,jy3);
+             dz33             = _fjsp_sub_v2r8(iz3,jz3);
+             /* Calculate squared distance and things based on it */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+             rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+             rsq13            = gmx_fjsp_calc_rsq_v2r8(dx13,dy13,dz13);
+             rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+             rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+             rsq23            = gmx_fjsp_calc_rsq_v2r8(dx23,dy23,dz23);
+             rsq31            = gmx_fjsp_calc_rsq_v2r8(dx31,dy31,dz31);
+             rsq32            = gmx_fjsp_calc_rsq_v2r8(dx32,dy32,dz32);
+             rsq33            = gmx_fjsp_calc_rsq_v2r8(dx33,dy33,dz33);
+             rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+             rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+             rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+             rinv13           = gmx_fjsp_invsqrt_v2r8(rsq13);
+             rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+             rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+             rinv23           = gmx_fjsp_invsqrt_v2r8(rsq23);
+             rinv31           = gmx_fjsp_invsqrt_v2r8(rsq31);
+             rinv32           = gmx_fjsp_invsqrt_v2r8(rsq32);
+             rinv33           = gmx_fjsp_invsqrt_v2r8(rsq33);
+             rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+             rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+             rinvsq13         = _fjsp_mul_v2r8(rinv13,rinv13);
+             rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+             rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+             rinvsq23         = _fjsp_mul_v2r8(rinv23,rinv23);
+             rinvsq31         = _fjsp_mul_v2r8(rinv31,rinv31);
+             rinvsq32         = _fjsp_mul_v2r8(rinv32,rinv32);
+             rinvsq33         = _fjsp_mul_v2r8(rinv33,rinv33);
+             fjx0             = _fjsp_setzero_v2r8();
+             fjy0             = _fjsp_setzero_v2r8();
+             fjz0             = _fjsp_setzero_v2r8();
+             fjx1             = _fjsp_setzero_v2r8();
+             fjy1             = _fjsp_setzero_v2r8();
+             fjz1             = _fjsp_setzero_v2r8();
+             fjx2             = _fjsp_setzero_v2r8();
+             fjy2             = _fjsp_setzero_v2r8();
+             fjz2             = _fjsp_setzero_v2r8();
+             fjx3             = _fjsp_setzero_v2r8();
+             fjy3             = _fjsp_setzero_v2r8();
+             fjz3             = _fjsp_setzero_v2r8();
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+             /* Calculate table index by multiplying r with table scale and truncate to integer */
+             rt               = _fjsp_mul_v2r8(r00,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 8;
+             vfconv.i[1]     *= 8;
+             /* CUBIC SPLINE TABLE DISPERSION */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 2 );
+             H                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 2 );
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+             VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+             vvdw6            = _fjsp_mul_v2r8(c6_00,VV);
+             FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+             fvdw6            = _fjsp_mul_v2r8(c6_00,FF);
+             /* CUBIC SPLINE TABLE REPULSION */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 4 );
+             F                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 4 );
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 6 );
+             H                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 6 );
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+             VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+             vvdw12           = _fjsp_mul_v2r8(c12_00,VV);
+             FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+             fvdw12           = _fjsp_mul_v2r8(c12_00,FF);
+             vvdw             = _fjsp_add_v2r8(vvdw12,vvdw6);
+             fvdw             = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_add_v2r8(fvdw6,fvdw12),_fjsp_mul_v2r8(vftabscale,rinv00)));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+             fscal            = fvdw;
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             
+             fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r11              = _fjsp_mul_v2r8(rsq11,rinv11);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r11,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq11,_fjsp_sub_v2r8(rinv11,velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq11,rinv11),_fjsp_sub_v2r8(rinvsq11,felec));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+             
+             fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r12              = _fjsp_mul_v2r8(rsq12,rinv12);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r12,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq12,_fjsp_sub_v2r8(rinv12,velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq12,rinv12),_fjsp_sub_v2r8(rinvsq12,felec));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+             
+             fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r13              = _fjsp_mul_v2r8(rsq13,rinv13);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r13,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq13,_fjsp_sub_v2r8(rinv13,velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq13,rinv13),_fjsp_sub_v2r8(rinvsq13,felec));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx13,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy13,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz13,fscal,fiz1);
+             
+             fjx3             = _fjsp_madd_v2r8(dx13,fscal,fjx3);
+             fjy3             = _fjsp_madd_v2r8(dy13,fscal,fjy3);
+             fjz3             = _fjsp_madd_v2r8(dz13,fscal,fjz3);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r21              = _fjsp_mul_v2r8(rsq21,rinv21);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r21,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq21,_fjsp_sub_v2r8(rinv21,velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq21,rinv21),_fjsp_sub_v2r8(rinvsq21,felec));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+             
+             fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r22              = _fjsp_mul_v2r8(rsq22,rinv22);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r22,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq22,_fjsp_sub_v2r8(rinv22,velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq22,rinv22),_fjsp_sub_v2r8(rinvsq22,felec));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+             
+             fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r23              = _fjsp_mul_v2r8(rsq23,rinv23);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r23,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq23,_fjsp_sub_v2r8(rinv23,velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq23,rinv23),_fjsp_sub_v2r8(rinvsq23,felec));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx23,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy23,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz23,fscal,fiz2);
+             
+             fjx3             = _fjsp_madd_v2r8(dx23,fscal,fjx3);
+             fjy3             = _fjsp_madd_v2r8(dy23,fscal,fjy3);
+             fjz3             = _fjsp_madd_v2r8(dz23,fscal,fjz3);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r31              = _fjsp_mul_v2r8(rsq31,rinv31);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r31,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq31,_fjsp_sub_v2r8(rinv31,velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq31,rinv31),_fjsp_sub_v2r8(rinvsq31,felec));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             /* Update vectorial force */
+             fix3             = _fjsp_madd_v2r8(dx31,fscal,fix3);
+             fiy3             = _fjsp_madd_v2r8(dy31,fscal,fiy3);
+             fiz3             = _fjsp_madd_v2r8(dz31,fscal,fiz3);
+             
+             fjx1             = _fjsp_madd_v2r8(dx31,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy31,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz31,fscal,fjz1);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r32              = _fjsp_mul_v2r8(rsq32,rinv32);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r32,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq32,_fjsp_sub_v2r8(rinv32,velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq32,rinv32),_fjsp_sub_v2r8(rinvsq32,felec));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             /* Update vectorial force */
+             fix3             = _fjsp_madd_v2r8(dx32,fscal,fix3);
+             fiy3             = _fjsp_madd_v2r8(dy32,fscal,fiy3);
+             fiz3             = _fjsp_madd_v2r8(dz32,fscal,fiz3);
+             
+             fjx2             = _fjsp_madd_v2r8(dx32,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy32,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz32,fscal,fjz2);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r33              = _fjsp_mul_v2r8(rsq33,rinv33);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r33,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq33,_fjsp_sub_v2r8(rinv33,velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq33,rinv33),_fjsp_sub_v2r8(rinvsq33,felec));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             /* Update vectorial force */
+             fix3             = _fjsp_madd_v2r8(dx33,fscal,fix3);
+             fiy3             = _fjsp_madd_v2r8(dy33,fscal,fiy3);
+             fiz3             = _fjsp_madd_v2r8(dz33,fscal,fiz3);
+             
+             fjx3             = _fjsp_madd_v2r8(dx33,fscal,fjx3);
+             fjy3             = _fjsp_madd_v2r8(dy33,fscal,fjy3);
+             fjz3             = _fjsp_madd_v2r8(dz33,fscal,fjz3);
+             gmx_fjsp_decrement_4rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+             /* Inner loop uses 458 flops */
+         }
+         if(jidx<j_index_end)
+         {
+             jnrA             = jjnr[jidx];
+             j_coord_offsetA  = DIM*jnrA;
+             /* load j atom coordinates */
+             gmx_fjsp_load_4rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
+                                               &jy2,&jz2,&jx3,&jy3,&jz3);
+             /* Calculate displacement vector */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             dx11             = _fjsp_sub_v2r8(ix1,jx1);
+             dy11             = _fjsp_sub_v2r8(iy1,jy1);
+             dz11             = _fjsp_sub_v2r8(iz1,jz1);
+             dx12             = _fjsp_sub_v2r8(ix1,jx2);
+             dy12             = _fjsp_sub_v2r8(iy1,jy2);
+             dz12             = _fjsp_sub_v2r8(iz1,jz2);
+             dx13             = _fjsp_sub_v2r8(ix1,jx3);
+             dy13             = _fjsp_sub_v2r8(iy1,jy3);
+             dz13             = _fjsp_sub_v2r8(iz1,jz3);
+             dx21             = _fjsp_sub_v2r8(ix2,jx1);
+             dy21             = _fjsp_sub_v2r8(iy2,jy1);
+             dz21             = _fjsp_sub_v2r8(iz2,jz1);
+             dx22             = _fjsp_sub_v2r8(ix2,jx2);
+             dy22             = _fjsp_sub_v2r8(iy2,jy2);
+             dz22             = _fjsp_sub_v2r8(iz2,jz2);
+             dx23             = _fjsp_sub_v2r8(ix2,jx3);
+             dy23             = _fjsp_sub_v2r8(iy2,jy3);
+             dz23             = _fjsp_sub_v2r8(iz2,jz3);
+             dx31             = _fjsp_sub_v2r8(ix3,jx1);
+             dy31             = _fjsp_sub_v2r8(iy3,jy1);
+             dz31             = _fjsp_sub_v2r8(iz3,jz1);
+             dx32             = _fjsp_sub_v2r8(ix3,jx2);
+             dy32             = _fjsp_sub_v2r8(iy3,jy2);
+             dz32             = _fjsp_sub_v2r8(iz3,jz2);
+             dx33             = _fjsp_sub_v2r8(ix3,jx3);
+             dy33             = _fjsp_sub_v2r8(iy3,jy3);
+             dz33             = _fjsp_sub_v2r8(iz3,jz3);
+             /* Calculate squared distance and things based on it */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+             rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+             rsq13            = gmx_fjsp_calc_rsq_v2r8(dx13,dy13,dz13);
+             rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+             rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+             rsq23            = gmx_fjsp_calc_rsq_v2r8(dx23,dy23,dz23);
+             rsq31            = gmx_fjsp_calc_rsq_v2r8(dx31,dy31,dz31);
+             rsq32            = gmx_fjsp_calc_rsq_v2r8(dx32,dy32,dz32);
+             rsq33            = gmx_fjsp_calc_rsq_v2r8(dx33,dy33,dz33);
+             rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+             rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+             rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+             rinv13           = gmx_fjsp_invsqrt_v2r8(rsq13);
+             rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+             rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+             rinv23           = gmx_fjsp_invsqrt_v2r8(rsq23);
+             rinv31           = gmx_fjsp_invsqrt_v2r8(rsq31);
+             rinv32           = gmx_fjsp_invsqrt_v2r8(rsq32);
+             rinv33           = gmx_fjsp_invsqrt_v2r8(rsq33);
+             rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+             rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+             rinvsq13         = _fjsp_mul_v2r8(rinv13,rinv13);
+             rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+             rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+             rinvsq23         = _fjsp_mul_v2r8(rinv23,rinv23);
+             rinvsq31         = _fjsp_mul_v2r8(rinv31,rinv31);
+             rinvsq32         = _fjsp_mul_v2r8(rinv32,rinv32);
+             rinvsq33         = _fjsp_mul_v2r8(rinv33,rinv33);
+             fjx0             = _fjsp_setzero_v2r8();
+             fjy0             = _fjsp_setzero_v2r8();
+             fjz0             = _fjsp_setzero_v2r8();
+             fjx1             = _fjsp_setzero_v2r8();
+             fjy1             = _fjsp_setzero_v2r8();
+             fjz1             = _fjsp_setzero_v2r8();
+             fjx2             = _fjsp_setzero_v2r8();
+             fjy2             = _fjsp_setzero_v2r8();
+             fjz2             = _fjsp_setzero_v2r8();
+             fjx3             = _fjsp_setzero_v2r8();
+             fjy3             = _fjsp_setzero_v2r8();
+             fjz3             = _fjsp_setzero_v2r8();
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+             /* Calculate table index by multiplying r with table scale and truncate to integer */
+             rt               = _fjsp_mul_v2r8(r00,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 8;
+             vfconv.i[1]     *= 8;
+             /* CUBIC SPLINE TABLE DISPERSION */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 2 );
+             H                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+             VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+             vvdw6            = _fjsp_mul_v2r8(c6_00,VV);
+             FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+             fvdw6            = _fjsp_mul_v2r8(c6_00,FF);
+             /* CUBIC SPLINE TABLE REPULSION */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 4 );
+             F                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 6 );
+             H                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+             VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+             vvdw12           = _fjsp_mul_v2r8(c12_00,VV);
+             FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+             fvdw12           = _fjsp_mul_v2r8(c12_00,FF);
+             vvdw             = _fjsp_add_v2r8(vvdw12,vvdw6);
+             fvdw             = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_add_v2r8(fvdw6,fvdw12),_fjsp_mul_v2r8(vftabscale,rinv00)));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             vvdw             = _fjsp_unpacklo_v2r8(vvdw,_fjsp_setzero_v2r8());
+             vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+             fscal            = fvdw;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             
+             fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r11              = _fjsp_mul_v2r8(rsq11,rinv11);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r11,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq11,_fjsp_sub_v2r8(rinv11,velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq11,rinv11),_fjsp_sub_v2r8(rinvsq11,felec));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+             
+             fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r12              = _fjsp_mul_v2r8(rsq12,rinv12);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r12,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq12,_fjsp_sub_v2r8(rinv12,velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq12,rinv12),_fjsp_sub_v2r8(rinvsq12,felec));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+             
+             fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r13              = _fjsp_mul_v2r8(rsq13,rinv13);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r13,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq13,_fjsp_sub_v2r8(rinv13,velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq13,rinv13),_fjsp_sub_v2r8(rinvsq13,felec));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx13,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy13,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz13,fscal,fiz1);
+             
+             fjx3             = _fjsp_madd_v2r8(dx13,fscal,fjx3);
+             fjy3             = _fjsp_madd_v2r8(dy13,fscal,fjy3);
+             fjz3             = _fjsp_madd_v2r8(dz13,fscal,fjz3);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r21              = _fjsp_mul_v2r8(rsq21,rinv21);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r21,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq21,_fjsp_sub_v2r8(rinv21,velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq21,rinv21),_fjsp_sub_v2r8(rinvsq21,felec));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+             
+             fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r22              = _fjsp_mul_v2r8(rsq22,rinv22);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r22,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq22,_fjsp_sub_v2r8(rinv22,velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq22,rinv22),_fjsp_sub_v2r8(rinvsq22,felec));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+             
+             fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r23              = _fjsp_mul_v2r8(rsq23,rinv23);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r23,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq23,_fjsp_sub_v2r8(rinv23,velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq23,rinv23),_fjsp_sub_v2r8(rinvsq23,felec));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx23,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy23,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz23,fscal,fiz2);
+             
+             fjx3             = _fjsp_madd_v2r8(dx23,fscal,fjx3);
+             fjy3             = _fjsp_madd_v2r8(dy23,fscal,fjy3);
+             fjz3             = _fjsp_madd_v2r8(dz23,fscal,fjz3);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r31              = _fjsp_mul_v2r8(rsq31,rinv31);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r31,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq31,_fjsp_sub_v2r8(rinv31,velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq31,rinv31),_fjsp_sub_v2r8(rinvsq31,felec));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix3             = _fjsp_madd_v2r8(dx31,fscal,fix3);
+             fiy3             = _fjsp_madd_v2r8(dy31,fscal,fiy3);
+             fiz3             = _fjsp_madd_v2r8(dz31,fscal,fiz3);
+             
+             fjx1             = _fjsp_madd_v2r8(dx31,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy31,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz31,fscal,fjz1);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r32              = _fjsp_mul_v2r8(rsq32,rinv32);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r32,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq32,_fjsp_sub_v2r8(rinv32,velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq32,rinv32),_fjsp_sub_v2r8(rinvsq32,felec));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix3             = _fjsp_madd_v2r8(dx32,fscal,fix3);
+             fiy3             = _fjsp_madd_v2r8(dy32,fscal,fiy3);
+             fiz3             = _fjsp_madd_v2r8(dz32,fscal,fiz3);
+             
+             fjx2             = _fjsp_madd_v2r8(dx32,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy32,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz32,fscal,fjz2);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r33              = _fjsp_mul_v2r8(rsq33,rinv33);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r33,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq33,_fjsp_sub_v2r8(rinv33,velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq33,rinv33),_fjsp_sub_v2r8(rinvsq33,felec));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix3             = _fjsp_madd_v2r8(dx33,fscal,fix3);
+             fiy3             = _fjsp_madd_v2r8(dy33,fscal,fiy3);
+             fiz3             = _fjsp_madd_v2r8(dz33,fscal,fiz3);
+             
+             fjx3             = _fjsp_madd_v2r8(dx33,fscal,fjx3);
+             fjy3             = _fjsp_madd_v2r8(dy33,fscal,fjy3);
+             fjz3             = _fjsp_madd_v2r8(dz33,fscal,fjz3);
+             gmx_fjsp_decrement_4rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+             /* Inner loop uses 458 flops */
+         }
+         /* End of innermost loop */
+         gmx_fjsp_update_iforce_4atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                               f+i_coord_offset,fshift+i_shift_offset);
+         ggid                        = gid[iidx];
+         /* Update potential energies */
+         gmx_fjsp_update_1pot_v2r8(velecsum,kernel_data->energygrp_elec+ggid);
+         gmx_fjsp_update_1pot_v2r8(vvdwsum,kernel_data->energygrp_vdw+ggid);
+         /* Increment number of inner iterations */
+         inneriter                  += j_index_end - j_index_start;
+         /* Outer loop uses 26 flops */
+     }
+     /* Increment number of outer iterations */
+     outeriter        += nri;
+     /* Update outer/inner flops */
+     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_VF,outeriter*26 + inneriter*458);
+ }
+ /*
+  * Gromacs nonbonded kernel:   nb_kernel_ElecEw_VdwCSTab_GeomW4W4_F_sparc64_hpc_ace_double
+  * Electrostatics interaction: Ewald
+  * VdW interaction:            CubicSplineTable
+  * Geometry:                   Water4-Water4
+  * Calculate force/pot:        Force
+  */
+ void
+ nb_kernel_ElecEw_VdwCSTab_GeomW4W4_F_sparc64_hpc_ace_double
+                     (t_nblist * gmx_restrict                nlist,
+                      rvec * gmx_restrict                    xx,
+                      rvec * gmx_restrict                    ff,
+                      t_forcerec * gmx_restrict              fr,
+                      t_mdatoms * gmx_restrict               mdatoms,
+                      nb_kernel_data_t * gmx_restrict        kernel_data,
+                      t_nrnb * gmx_restrict                  nrnb)
+ {
+     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+      * just 0 for non-waters.
+      * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+      * jnr indices corresponding to data put in the two positions in the SIMD register.
+      */
+     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+     int              jnrA,jnrB;
+     int              j_coord_offsetA,j_coord_offsetB;
+     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+     real             rcutoff_scalar;
+     real             *shiftvec,*fshift,*x,*f;
+     _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+     int              vdwioffset0;
+     _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+     int              vdwioffset1;
+     _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+     int              vdwioffset2;
+     _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+     int              vdwioffset3;
+     _fjsp_v2r8       ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+     int              vdwjidx0A,vdwjidx0B;
+     _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+     int              vdwjidx1A,vdwjidx1B;
+     _fjsp_v2r8       jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+     int              vdwjidx2A,vdwjidx2B;
+     _fjsp_v2r8       jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+     int              vdwjidx3A,vdwjidx3B;
+     _fjsp_v2r8       jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
+     _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+     _fjsp_v2r8       dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+     _fjsp_v2r8       dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+     _fjsp_v2r8       dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
+     _fjsp_v2r8       dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+     _fjsp_v2r8       dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+     _fjsp_v2r8       dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
+     _fjsp_v2r8       dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
+     _fjsp_v2r8       dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
+     _fjsp_v2r8       dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
+     _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+     real             *charge;
+     int              nvdwtype;
+     _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+     int              *vdwtype;
+     real             *vdwparam;
+     _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+     _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+     _fjsp_v2r8       rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF,twovfeps;
+     real             *vftab;
+     _fjsp_v2r8       ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+     real             *ewtab;
+     _fjsp_v2r8       itab_tmp;
+     _fjsp_v2r8       dummy_mask,cutoff_mask;
+     _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+     _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+     union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+     x                = xx[0];
+     f                = ff[0];
+     nri              = nlist->nri;
+     iinr             = nlist->iinr;
+     jindex           = nlist->jindex;
+     jjnr             = nlist->jjnr;
+     shiftidx         = nlist->shift;
+     gid              = nlist->gid;
+     shiftvec         = fr->shift_vec[0];
+     fshift           = fr->fshift[0];
+     facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+     charge           = mdatoms->chargeA;
+     nvdwtype         = fr->ntype;
+     vdwparam         = fr->nbfp;
+     vdwtype          = mdatoms->typeA;
+     vftab            = kernel_data->table_vdw->data;
+     vftabscale       = gmx_fjsp_set1_v2r8(kernel_data->table_vdw->scale);
+     sh_ewald         = gmx_fjsp_set1_v2r8(fr->ic->sh_ewald);
+     ewtab            = fr->ic->tabq_coul_F;
+     ewtabscale       = gmx_fjsp_set1_v2r8(fr->ic->tabq_scale);
+     ewtabhalfspace   = gmx_fjsp_set1_v2r8(0.5/fr->ic->tabq_scale);
+     /* Setup water-specific parameters */
+     inr              = nlist->iinr[0];
+     iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+     iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+     iq3              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+3]));
+     vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+     jq1              = gmx_fjsp_set1_v2r8(charge[inr+1]);
+     jq2              = gmx_fjsp_set1_v2r8(charge[inr+2]);
+     jq3              = gmx_fjsp_set1_v2r8(charge[inr+3]);
+     vdwjidx0A        = 2*vdwtype[inr+0];
+     c6_00            = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset0+vdwjidx0A]);
+     c12_00           = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset0+vdwjidx0A+1]);
+     qq11             = _fjsp_mul_v2r8(iq1,jq1);
+     qq12             = _fjsp_mul_v2r8(iq1,jq2);
+     qq13             = _fjsp_mul_v2r8(iq1,jq3);
+     qq21             = _fjsp_mul_v2r8(iq2,jq1);
+     qq22             = _fjsp_mul_v2r8(iq2,jq2);
+     qq23             = _fjsp_mul_v2r8(iq2,jq3);
+     qq31             = _fjsp_mul_v2r8(iq3,jq1);
+     qq32             = _fjsp_mul_v2r8(iq3,jq2);
+     qq33             = _fjsp_mul_v2r8(iq3,jq3);
+     /* Avoid stupid compiler warnings */
+     jnrA = jnrB = 0;
+     j_coord_offsetA = 0;
+     j_coord_offsetB = 0;
+     outeriter        = 0;
+     inneriter        = 0;
+     /* Start outer loop over neighborlists */
+     for(iidx=0; iidx<nri; iidx++)
+     {
+         /* Load shift vector for this list */
+         i_shift_offset   = DIM*shiftidx[iidx];
+         /* Load limits for loop over neighbors */
+         j_index_start    = jindex[iidx];
+         j_index_end      = jindex[iidx+1];
+         /* Get outer coordinate index */
+         inr              = iinr[iidx];
+         i_coord_offset   = DIM*inr;
+         /* Load i particle coords and add shift vector */
+         gmx_fjsp_load_shift_and_4rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                  &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+         fix0             = _fjsp_setzero_v2r8();
+         fiy0             = _fjsp_setzero_v2r8();
+         fiz0             = _fjsp_setzero_v2r8();
+         fix1             = _fjsp_setzero_v2r8();
+         fiy1             = _fjsp_setzero_v2r8();
+         fiz1             = _fjsp_setzero_v2r8();
+         fix2             = _fjsp_setzero_v2r8();
+         fiy2             = _fjsp_setzero_v2r8();
+         fiz2             = _fjsp_setzero_v2r8();
+         fix3             = _fjsp_setzero_v2r8();
+         fiy3             = _fjsp_setzero_v2r8();
+         fiz3             = _fjsp_setzero_v2r8();
+         /* Start inner kernel loop */
+         for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+         {
+             /* Get j neighbor index, and coordinate index */
+             jnrA             = jjnr[jidx];
+             jnrB             = jjnr[jidx+1];
+             j_coord_offsetA  = DIM*jnrA;
+             j_coord_offsetB  = DIM*jnrB;
+             /* load j atom coordinates */
+             gmx_fjsp_load_4rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
+                                               &jy2,&jz2,&jx3,&jy3,&jz3);
+             /* Calculate displacement vector */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             dx11             = _fjsp_sub_v2r8(ix1,jx1);
+             dy11             = _fjsp_sub_v2r8(iy1,jy1);
+             dz11             = _fjsp_sub_v2r8(iz1,jz1);
+             dx12             = _fjsp_sub_v2r8(ix1,jx2);
+             dy12             = _fjsp_sub_v2r8(iy1,jy2);
+             dz12             = _fjsp_sub_v2r8(iz1,jz2);
+             dx13             = _fjsp_sub_v2r8(ix1,jx3);
+             dy13             = _fjsp_sub_v2r8(iy1,jy3);
+             dz13             = _fjsp_sub_v2r8(iz1,jz3);
+             dx21             = _fjsp_sub_v2r8(ix2,jx1);
+             dy21             = _fjsp_sub_v2r8(iy2,jy1);
+             dz21             = _fjsp_sub_v2r8(iz2,jz1);
+             dx22             = _fjsp_sub_v2r8(ix2,jx2);
+             dy22             = _fjsp_sub_v2r8(iy2,jy2);
+             dz22             = _fjsp_sub_v2r8(iz2,jz2);
+             dx23             = _fjsp_sub_v2r8(ix2,jx3);
+             dy23             = _fjsp_sub_v2r8(iy2,jy3);
+             dz23             = _fjsp_sub_v2r8(iz2,jz3);
+             dx31             = _fjsp_sub_v2r8(ix3,jx1);
+             dy31             = _fjsp_sub_v2r8(iy3,jy1);
+             dz31             = _fjsp_sub_v2r8(iz3,jz1);
+             dx32             = _fjsp_sub_v2r8(ix3,jx2);
+             dy32             = _fjsp_sub_v2r8(iy3,jy2);
+             dz32             = _fjsp_sub_v2r8(iz3,jz2);
+             dx33             = _fjsp_sub_v2r8(ix3,jx3);
+             dy33             = _fjsp_sub_v2r8(iy3,jy3);
+             dz33             = _fjsp_sub_v2r8(iz3,jz3);
+             /* Calculate squared distance and things based on it */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+             rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+             rsq13            = gmx_fjsp_calc_rsq_v2r8(dx13,dy13,dz13);
+             rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+             rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+             rsq23            = gmx_fjsp_calc_rsq_v2r8(dx23,dy23,dz23);
+             rsq31            = gmx_fjsp_calc_rsq_v2r8(dx31,dy31,dz31);
+             rsq32            = gmx_fjsp_calc_rsq_v2r8(dx32,dy32,dz32);
+             rsq33            = gmx_fjsp_calc_rsq_v2r8(dx33,dy33,dz33);
+             rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+             rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+             rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+             rinv13           = gmx_fjsp_invsqrt_v2r8(rsq13);
+             rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+             rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+             rinv23           = gmx_fjsp_invsqrt_v2r8(rsq23);
+             rinv31           = gmx_fjsp_invsqrt_v2r8(rsq31);
+             rinv32           = gmx_fjsp_invsqrt_v2r8(rsq32);
+             rinv33           = gmx_fjsp_invsqrt_v2r8(rsq33);
+             rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+             rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+             rinvsq13         = _fjsp_mul_v2r8(rinv13,rinv13);
+             rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+             rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+             rinvsq23         = _fjsp_mul_v2r8(rinv23,rinv23);
+             rinvsq31         = _fjsp_mul_v2r8(rinv31,rinv31);
+             rinvsq32         = _fjsp_mul_v2r8(rinv32,rinv32);
+             rinvsq33         = _fjsp_mul_v2r8(rinv33,rinv33);
+             fjx0             = _fjsp_setzero_v2r8();
+             fjy0             = _fjsp_setzero_v2r8();
+             fjz0             = _fjsp_setzero_v2r8();
+             fjx1             = _fjsp_setzero_v2r8();
+             fjy1             = _fjsp_setzero_v2r8();
+             fjz1             = _fjsp_setzero_v2r8();
+             fjx2             = _fjsp_setzero_v2r8();
+             fjy2             = _fjsp_setzero_v2r8();
+             fjz2             = _fjsp_setzero_v2r8();
+             fjx3             = _fjsp_setzero_v2r8();
+             fjy3             = _fjsp_setzero_v2r8();
+             fjz3             = _fjsp_setzero_v2r8();
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+             /* Calculate table index by multiplying r with table scale and truncate to integer */
+             rt               = _fjsp_mul_v2r8(r00,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 8;
+             vfconv.i[1]     *= 8;
+             /* CUBIC SPLINE TABLE DISPERSION */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 2 );
+             H                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 2 );
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+             FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+             fvdw6            = _fjsp_mul_v2r8(c6_00,FF);
+             /* CUBIC SPLINE TABLE REPULSION */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 4 );
+             F                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 4 );
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 6 );
+             H                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 6 );
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+             FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+             fvdw12           = _fjsp_mul_v2r8(c12_00,FF);
+             fvdw             = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_add_v2r8(fvdw6,fvdw12),_fjsp_mul_v2r8(vftabscale,rinv00)));
+             fscal            = fvdw;
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             
+             fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r11              = _fjsp_mul_v2r8(rsq11,rinv11);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r11,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                          &ewtabF,&ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq11,rinv11),_fjsp_sub_v2r8(rinvsq11,felec));
+             fscal            = felec;
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+             
+             fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r12              = _fjsp_mul_v2r8(rsq12,rinv12);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r12,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                          &ewtabF,&ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq12,rinv12),_fjsp_sub_v2r8(rinvsq12,felec));
+             fscal            = felec;
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+             
+             fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r13              = _fjsp_mul_v2r8(rsq13,rinv13);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r13,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                          &ewtabF,&ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq13,rinv13),_fjsp_sub_v2r8(rinvsq13,felec));
+             fscal            = felec;
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx13,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy13,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz13,fscal,fiz1);
+             
+             fjx3             = _fjsp_madd_v2r8(dx13,fscal,fjx3);
+             fjy3             = _fjsp_madd_v2r8(dy13,fscal,fjy3);
+             fjz3             = _fjsp_madd_v2r8(dz13,fscal,fjz3);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r21              = _fjsp_mul_v2r8(rsq21,rinv21);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r21,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                          &ewtabF,&ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq21,rinv21),_fjsp_sub_v2r8(rinvsq21,felec));
+             fscal            = felec;
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+             
+             fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r22              = _fjsp_mul_v2r8(rsq22,rinv22);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r22,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                          &ewtabF,&ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq22,rinv22),_fjsp_sub_v2r8(rinvsq22,felec));
+             fscal            = felec;
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+             
+             fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r23              = _fjsp_mul_v2r8(rsq23,rinv23);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r23,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                          &ewtabF,&ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq23,rinv23),_fjsp_sub_v2r8(rinvsq23,felec));
+             fscal            = felec;
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx23,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy23,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz23,fscal,fiz2);
+             
+             fjx3             = _fjsp_madd_v2r8(dx23,fscal,fjx3);
+             fjy3             = _fjsp_madd_v2r8(dy23,fscal,fjy3);
+             fjz3             = _fjsp_madd_v2r8(dz23,fscal,fjz3);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r31              = _fjsp_mul_v2r8(rsq31,rinv31);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r31,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                          &ewtabF,&ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq31,rinv31),_fjsp_sub_v2r8(rinvsq31,felec));
+             fscal            = felec;
+             /* Update vectorial force */
+             fix3             = _fjsp_madd_v2r8(dx31,fscal,fix3);
+             fiy3             = _fjsp_madd_v2r8(dy31,fscal,fiy3);
+             fiz3             = _fjsp_madd_v2r8(dz31,fscal,fiz3);
+             
+             fjx1             = _fjsp_madd_v2r8(dx31,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy31,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz31,fscal,fjz1);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r32              = _fjsp_mul_v2r8(rsq32,rinv32);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r32,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                          &ewtabF,&ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq32,rinv32),_fjsp_sub_v2r8(rinvsq32,felec));
+             fscal            = felec;
+             /* Update vectorial force */
+             fix3             = _fjsp_madd_v2r8(dx32,fscal,fix3);
+             fiy3             = _fjsp_madd_v2r8(dy32,fscal,fiy3);
+             fiz3             = _fjsp_madd_v2r8(dz32,fscal,fiz3);
+             
+             fjx2             = _fjsp_madd_v2r8(dx32,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy32,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz32,fscal,fjz2);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r33              = _fjsp_mul_v2r8(rsq33,rinv33);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r33,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                          &ewtabF,&ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq33,rinv33),_fjsp_sub_v2r8(rinvsq33,felec));
+             fscal            = felec;
+             /* Update vectorial force */
+             fix3             = _fjsp_madd_v2r8(dx33,fscal,fix3);
+             fiy3             = _fjsp_madd_v2r8(dy33,fscal,fiy3);
+             fiz3             = _fjsp_madd_v2r8(dz33,fscal,fiz3);
+             
+             fjx3             = _fjsp_madd_v2r8(dx33,fscal,fjx3);
+             fjy3             = _fjsp_madd_v2r8(dy33,fscal,fjy3);
+             fjz3             = _fjsp_madd_v2r8(dz33,fscal,fjz3);
+             gmx_fjsp_decrement_4rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+             /* Inner loop uses 405 flops */
+         }
+         if(jidx<j_index_end)
+         {
+             jnrA             = jjnr[jidx];
+             j_coord_offsetA  = DIM*jnrA;
+             /* load j atom coordinates */
+             gmx_fjsp_load_4rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
+                                               &jy2,&jz2,&jx3,&jy3,&jz3);
+             /* Calculate displacement vector */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             dx11             = _fjsp_sub_v2r8(ix1,jx1);
+             dy11             = _fjsp_sub_v2r8(iy1,jy1);
+             dz11             = _fjsp_sub_v2r8(iz1,jz1);
+             dx12             = _fjsp_sub_v2r8(ix1,jx2);
+             dy12             = _fjsp_sub_v2r8(iy1,jy2);
+             dz12             = _fjsp_sub_v2r8(iz1,jz2);
+             dx13             = _fjsp_sub_v2r8(ix1,jx3);
+             dy13             = _fjsp_sub_v2r8(iy1,jy3);
+             dz13             = _fjsp_sub_v2r8(iz1,jz3);
+             dx21             = _fjsp_sub_v2r8(ix2,jx1);
+             dy21             = _fjsp_sub_v2r8(iy2,jy1);
+             dz21             = _fjsp_sub_v2r8(iz2,jz1);
+             dx22             = _fjsp_sub_v2r8(ix2,jx2);
+             dy22             = _fjsp_sub_v2r8(iy2,jy2);
+             dz22             = _fjsp_sub_v2r8(iz2,jz2);
+             dx23             = _fjsp_sub_v2r8(ix2,jx3);
+             dy23             = _fjsp_sub_v2r8(iy2,jy3);
+             dz23             = _fjsp_sub_v2r8(iz2,jz3);
+             dx31             = _fjsp_sub_v2r8(ix3,jx1);
+             dy31             = _fjsp_sub_v2r8(iy3,jy1);
+             dz31             = _fjsp_sub_v2r8(iz3,jz1);
+             dx32             = _fjsp_sub_v2r8(ix3,jx2);
+             dy32             = _fjsp_sub_v2r8(iy3,jy2);
+             dz32             = _fjsp_sub_v2r8(iz3,jz2);
+             dx33             = _fjsp_sub_v2r8(ix3,jx3);
+             dy33             = _fjsp_sub_v2r8(iy3,jy3);
+             dz33             = _fjsp_sub_v2r8(iz3,jz3);
+             /* Calculate squared distance and things based on it */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+             rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+             rsq13            = gmx_fjsp_calc_rsq_v2r8(dx13,dy13,dz13);
+             rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+             rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+             rsq23            = gmx_fjsp_calc_rsq_v2r8(dx23,dy23,dz23);
+             rsq31            = gmx_fjsp_calc_rsq_v2r8(dx31,dy31,dz31);
+             rsq32            = gmx_fjsp_calc_rsq_v2r8(dx32,dy32,dz32);
+             rsq33            = gmx_fjsp_calc_rsq_v2r8(dx33,dy33,dz33);
+             rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+             rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+             rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+             rinv13           = gmx_fjsp_invsqrt_v2r8(rsq13);
+             rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+             rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+             rinv23           = gmx_fjsp_invsqrt_v2r8(rsq23);
+             rinv31           = gmx_fjsp_invsqrt_v2r8(rsq31);
+             rinv32           = gmx_fjsp_invsqrt_v2r8(rsq32);
+             rinv33           = gmx_fjsp_invsqrt_v2r8(rsq33);
+             rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+             rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+             rinvsq13         = _fjsp_mul_v2r8(rinv13,rinv13);
+             rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+             rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+             rinvsq23         = _fjsp_mul_v2r8(rinv23,rinv23);
+             rinvsq31         = _fjsp_mul_v2r8(rinv31,rinv31);
+             rinvsq32         = _fjsp_mul_v2r8(rinv32,rinv32);
+             rinvsq33         = _fjsp_mul_v2r8(rinv33,rinv33);
+             fjx0             = _fjsp_setzero_v2r8();
+             fjy0             = _fjsp_setzero_v2r8();
+             fjz0             = _fjsp_setzero_v2r8();
+             fjx1             = _fjsp_setzero_v2r8();
+             fjy1             = _fjsp_setzero_v2r8();
+             fjz1             = _fjsp_setzero_v2r8();
+             fjx2             = _fjsp_setzero_v2r8();
+             fjy2             = _fjsp_setzero_v2r8();
+             fjz2             = _fjsp_setzero_v2r8();
+             fjx3             = _fjsp_setzero_v2r8();
+             fjy3             = _fjsp_setzero_v2r8();
+             fjz3             = _fjsp_setzero_v2r8();
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+             /* Calculate table index by multiplying r with table scale and truncate to integer */
+             rt               = _fjsp_mul_v2r8(r00,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 8;
+             vfconv.i[1]     *= 8;
+             /* CUBIC SPLINE TABLE DISPERSION */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 2 );
+             H                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+             FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+             fvdw6            = _fjsp_mul_v2r8(c6_00,FF);
+             /* CUBIC SPLINE TABLE REPULSION */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 4 );
+             F                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 6 );
+             H                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+             FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+             fvdw12           = _fjsp_mul_v2r8(c12_00,FF);
+             fvdw             = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_add_v2r8(fvdw6,fvdw12),_fjsp_mul_v2r8(vftabscale,rinv00)));
+             fscal            = fvdw;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             
+             fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r11              = _fjsp_mul_v2r8(rsq11,rinv11);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r11,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq11,rinv11),_fjsp_sub_v2r8(rinvsq11,felec));
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+             
+             fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r12              = _fjsp_mul_v2r8(rsq12,rinv12);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r12,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq12,rinv12),_fjsp_sub_v2r8(rinvsq12,felec));
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+             
+             fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r13              = _fjsp_mul_v2r8(rsq13,rinv13);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r13,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq13,rinv13),_fjsp_sub_v2r8(rinvsq13,felec));
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx13,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy13,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz13,fscal,fiz1);
+             
+             fjx3             = _fjsp_madd_v2r8(dx13,fscal,fjx3);
+             fjy3             = _fjsp_madd_v2r8(dy13,fscal,fjy3);
+             fjz3             = _fjsp_madd_v2r8(dz13,fscal,fjz3);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r21              = _fjsp_mul_v2r8(rsq21,rinv21);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r21,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq21,rinv21),_fjsp_sub_v2r8(rinvsq21,felec));
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+             
+             fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r22              = _fjsp_mul_v2r8(rsq22,rinv22);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r22,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq22,rinv22),_fjsp_sub_v2r8(rinvsq22,felec));
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+             
+             fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r23              = _fjsp_mul_v2r8(rsq23,rinv23);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r23,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq23,rinv23),_fjsp_sub_v2r8(rinvsq23,felec));
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx23,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy23,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz23,fscal,fiz2);
+             
+             fjx3             = _fjsp_madd_v2r8(dx23,fscal,fjx3);
+             fjy3             = _fjsp_madd_v2r8(dy23,fscal,fjy3);
+             fjz3             = _fjsp_madd_v2r8(dz23,fscal,fjz3);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r31              = _fjsp_mul_v2r8(rsq31,rinv31);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r31,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq31,rinv31),_fjsp_sub_v2r8(rinvsq31,felec));
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix3             = _fjsp_madd_v2r8(dx31,fscal,fix3);
+             fiy3             = _fjsp_madd_v2r8(dy31,fscal,fiy3);
+             fiz3             = _fjsp_madd_v2r8(dz31,fscal,fiz3);
+             
+             fjx1             = _fjsp_madd_v2r8(dx31,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy31,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz31,fscal,fjz1);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r32              = _fjsp_mul_v2r8(rsq32,rinv32);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r32,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq32,rinv32),_fjsp_sub_v2r8(rinvsq32,felec));
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix3             = _fjsp_madd_v2r8(dx32,fscal,fix3);
+             fiy3             = _fjsp_madd_v2r8(dy32,fscal,fiy3);
+             fiz3             = _fjsp_madd_v2r8(dz32,fscal,fiz3);
+             
+             fjx2             = _fjsp_madd_v2r8(dx32,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy32,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz32,fscal,fjz2);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r33              = _fjsp_mul_v2r8(rsq33,rinv33);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r33,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq33,rinv33),_fjsp_sub_v2r8(rinvsq33,felec));
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix3             = _fjsp_madd_v2r8(dx33,fscal,fix3);
+             fiy3             = _fjsp_madd_v2r8(dy33,fscal,fiy3);
+             fiz3             = _fjsp_madd_v2r8(dz33,fscal,fiz3);
+             
+             fjx3             = _fjsp_madd_v2r8(dx33,fscal,fjx3);
+             fjy3             = _fjsp_madd_v2r8(dy33,fscal,fjy3);
+             fjz3             = _fjsp_madd_v2r8(dz33,fscal,fjz3);
+             gmx_fjsp_decrement_4rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+             /* Inner loop uses 405 flops */
+         }
+         /* End of innermost loop */
+         gmx_fjsp_update_iforce_4atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                               f+i_coord_offset,fshift+i_shift_offset);
+         /* Increment number of inner iterations */
+         inneriter                  += j_index_end - j_index_start;
+         /* Outer loop uses 24 flops */
+     }
+     /* Increment number of outer iterations */
+     outeriter        += nri;
+     /* Update outer/inner flops */
+     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_F,outeriter*24 + inneriter*405);
+ }
index 0000000000000000000000000000000000000000,eb3a7a1be8f173d85da7f2766b6e0d15dab4eb89..eb3a7a1be8f173d85da7f2766b6e0d15dab4eb89
mode 000000,100644..100644
--- /dev/null
@@@ -1,0 -1,614 +1,614 @@@
+ /*
+  * This file is part of the GROMACS molecular simulation package.
+  *
+  * Copyright (c) 2012, by the GROMACS development team, led by
+  * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+  * others, as listed in the AUTHORS file in the top-level source
+  * directory and at http://www.gromacs.org.
+  *
+  * GROMACS is free software; you can redistribute it and/or
+  * modify it under the terms of the GNU Lesser General Public License
+  * as published by the Free Software Foundation; either version 2.1
+  * of the License, or (at your option) any later version.
+  *
+  * GROMACS is distributed in the hope that it will be useful,
+  * but WITHOUT ANY WARRANTY; without even the implied warranty of
+  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  * Lesser General Public License for more details.
+  *
+  * You should have received a copy of the GNU Lesser General Public
+  * License along with GROMACS; if not, see
+  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+  *
+  * If you want to redistribute modifications to GROMACS, please
+  * consider that scientific software is very special. Version
+  * control is crucial - bugs must be traceable. We will be happy to
+  * consider code for inclusion in the official distribution, but
+  * derived work must not be called official GROMACS. Details are found
+  * in the README & COPYING files - if they are missing, get the
+  * official version at http://www.gromacs.org.
+  *
+  * To help us fund GROMACS development, we humbly ask that you cite
+  * the research papers on the package. Check out http://www.gromacs.org.
+  */
+ /*
+  * Note: this file was generated by the GROMACS sparc64_hpc_ace_double kernel generator.
+  */
+ #ifdef HAVE_CONFIG_H
+ #include <config.h>
+ #endif
+ #include <math.h>
+ #include "../nb_kernel.h"
+ #include "types/simple.h"
+ #include "vec.h"
+ #include "nrnb.h"
+ #include "kernelutil_sparc64_hpc_ace_double.h"
+ /*
+  * Gromacs nonbonded kernel:   nb_kernel_ElecEw_VdwLJ_GeomP1P1_VF_sparc64_hpc_ace_double
+  * Electrostatics interaction: Ewald
+  * VdW interaction:            LennardJones
+  * Geometry:                   Particle-Particle
+  * Calculate force/pot:        PotentialAndForce
+  */
+ void
+ nb_kernel_ElecEw_VdwLJ_GeomP1P1_VF_sparc64_hpc_ace_double
+                     (t_nblist * gmx_restrict                nlist,
+                      rvec * gmx_restrict                    xx,
+                      rvec * gmx_restrict                    ff,
+                      t_forcerec * gmx_restrict              fr,
+                      t_mdatoms * gmx_restrict               mdatoms,
+                      nb_kernel_data_t * gmx_restrict        kernel_data,
+                      t_nrnb * gmx_restrict                  nrnb)
+ {
+     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+      * just 0 for non-waters.
+      * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+      * jnr indices corresponding to data put in the two positions in the SIMD register.
+      */
+     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+     int              jnrA,jnrB;
+     int              j_coord_offsetA,j_coord_offsetB;
+     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+     real             rcutoff_scalar;
+     real             *shiftvec,*fshift,*x,*f;
+     _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+     int              vdwioffset0;
+     _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+     int              vdwjidx0A,vdwjidx0B;
+     _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+     _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+     _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+     real             *charge;
+     int              nvdwtype;
+     _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+     int              *vdwtype;
+     real             *vdwparam;
+     _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+     _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+     _fjsp_v2r8       ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+     real             *ewtab;
+     _fjsp_v2r8       itab_tmp;
+     _fjsp_v2r8       dummy_mask,cutoff_mask;
+     _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+     _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+     union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+     x                = xx[0];
+     f                = ff[0];
+     nri              = nlist->nri;
+     iinr             = nlist->iinr;
+     jindex           = nlist->jindex;
+     jjnr             = nlist->jjnr;
+     shiftidx         = nlist->shift;
+     gid              = nlist->gid;
+     shiftvec         = fr->shift_vec[0];
+     fshift           = fr->fshift[0];
+     facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+     charge           = mdatoms->chargeA;
+     nvdwtype         = fr->ntype;
+     vdwparam         = fr->nbfp;
+     vdwtype          = mdatoms->typeA;
+     sh_ewald         = gmx_fjsp_set1_v2r8(fr->ic->sh_ewald);
+     ewtab            = fr->ic->tabq_coul_FDV0;
+     ewtabscale       = gmx_fjsp_set1_v2r8(fr->ic->tabq_scale);
+     ewtabhalfspace   = gmx_fjsp_set1_v2r8(0.5/fr->ic->tabq_scale);
+     /* Avoid stupid compiler warnings */
+     jnrA = jnrB = 0;
+     j_coord_offsetA = 0;
+     j_coord_offsetB = 0;
+     outeriter        = 0;
+     inneriter        = 0;
+     /* Start outer loop over neighborlists */
+     for(iidx=0; iidx<nri; iidx++)
+     {
+         /* Load shift vector for this list */
+         i_shift_offset   = DIM*shiftidx[iidx];
+         /* Load limits for loop over neighbors */
+         j_index_start    = jindex[iidx];
+         j_index_end      = jindex[iidx+1];
+         /* Get outer coordinate index */
+         inr              = iinr[iidx];
+         i_coord_offset   = DIM*inr;
+         /* Load i particle coords and add shift vector */
+         gmx_fjsp_load_shift_and_1rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
+         fix0             = _fjsp_setzero_v2r8();
+         fiy0             = _fjsp_setzero_v2r8();
+         fiz0             = _fjsp_setzero_v2r8();
+         /* Load parameters for i particles */
+         iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_load1_v2r8(charge+inr+0));
+         vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+         /* Reset potential sums */
+         velecsum         = _fjsp_setzero_v2r8();
+         vvdwsum          = _fjsp_setzero_v2r8();
+         /* Start inner kernel loop */
+         for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+         {
+             /* Get j neighbor index, and coordinate index */
+             jnrA             = jjnr[jidx];
+             jnrB             = jjnr[jidx+1];
+             j_coord_offsetA  = DIM*jnrA;
+             j_coord_offsetB  = DIM*jnrB;
+             /* load j atom coordinates */
+             gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                               &jx0,&jy0,&jz0);
+             /* Calculate displacement vector */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             /* Calculate squared distance and things based on it */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+             rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+             /* Load parameters for j particles */
+             jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+             vdwjidx0A        = 2*vdwtype[jnrA+0];
+             vdwjidx0B        = 2*vdwtype[jnrB+0];
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+             /* Compute parameters for interactions between i and j atoms */
+             qq00             = _fjsp_mul_v2r8(iq0,jq0);
+             gmx_fjsp_load_2pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,
+                                          vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r00,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq00,_fjsp_sub_v2r8(rinv00,velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,rinv00),_fjsp_sub_v2r8(rinvsq00,felec));
+             /* LENNARD-JONES DISPERSION/REPULSION */
+             rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+             vvdw6            = _fjsp_mul_v2r8(c6_00,rinvsix);
+             vvdw12           = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
+             vvdw             = _fjsp_msub_v2r8( vvdw12,one_twelfth, _fjsp_mul_v2r8(vvdw6,one_sixth) );
+             fvdw             = _fjsp_mul_v2r8(_fjsp_sub_v2r8(vvdw12,vvdw6),rinvsq00);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+             fscal            = _fjsp_add_v2r8(felec,fvdw);
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             
+             gmx_fjsp_decrement_fma_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fscal,dx00,dy00,dz00);
+             /* Inner loop uses 56 flops */
+         }
+         if(jidx<j_index_end)
+         {
+             jnrA             = jjnr[jidx];
+             j_coord_offsetA  = DIM*jnrA;
+             /* load j atom coordinates */
+             gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                               &jx0,&jy0,&jz0);
+             /* Calculate displacement vector */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             /* Calculate squared distance and things based on it */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+             rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+             /* Load parameters for j particles */
+             jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
+             vdwjidx0A        = 2*vdwtype[jnrA+0];
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+             /* Compute parameters for interactions between i and j atoms */
+             qq00             = _fjsp_mul_v2r8(iq0,jq0);
+             gmx_fjsp_load_1pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r00,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq00,_fjsp_sub_v2r8(rinv00,velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,rinv00),_fjsp_sub_v2r8(rinvsq00,felec));
+             /* LENNARD-JONES DISPERSION/REPULSION */
+             rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+             vvdw6            = _fjsp_mul_v2r8(c6_00,rinvsix);
+             vvdw12           = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
+             vvdw             = _fjsp_msub_v2r8( vvdw12,one_twelfth, _fjsp_mul_v2r8(vvdw6,one_sixth) );
+             fvdw             = _fjsp_mul_v2r8(_fjsp_sub_v2r8(vvdw12,vvdw6),rinvsq00);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             vvdw             = _fjsp_unpacklo_v2r8(vvdw,_fjsp_setzero_v2r8());
+             vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+             fscal            = _fjsp_add_v2r8(felec,fvdw);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             
+             gmx_fjsp_decrement_fma_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fscal,dx00,dy00,dz00);
+             /* Inner loop uses 56 flops */
+         }
+         /* End of innermost loop */
+         gmx_fjsp_update_iforce_1atom_swizzle_v2r8(fix0,fiy0,fiz0,
+                                               f+i_coord_offset,fshift+i_shift_offset);
+         ggid                        = gid[iidx];
+         /* Update potential energies */
+         gmx_fjsp_update_1pot_v2r8(velecsum,kernel_data->energygrp_elec+ggid);
+         gmx_fjsp_update_1pot_v2r8(vvdwsum,kernel_data->energygrp_vdw+ggid);
+         /* Increment number of inner iterations */
+         inneriter                  += j_index_end - j_index_start;
+         /* Outer loop uses 9 flops */
+     }
+     /* Increment number of outer iterations */
+     outeriter        += nri;
+     /* Update outer/inner flops */
+     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_VF,outeriter*9 + inneriter*56);
+ }
+ /*
+  * Gromacs nonbonded kernel:   nb_kernel_ElecEw_VdwLJ_GeomP1P1_F_sparc64_hpc_ace_double
+  * Electrostatics interaction: Ewald
+  * VdW interaction:            LennardJones
+  * Geometry:                   Particle-Particle
+  * Calculate force/pot:        Force
+  *
+  * Loops over the nlist->nri entries of the neighbor list. For each i particle
+  * the j neighbors are processed two at a time (suffixes A and B), followed by
+  * a single-neighbor remainder step when the neighbor count is odd.
+  * Per-pair force contributions are handed to the
+  * gmx_fjsp_decrement_fma_1rvec_*_swizzle_v2r8() helpers together with the j
+  * force addresses inside ff, and the summed i force is handed to
+  * gmx_fjsp_update_iforce_1atom_swizzle_v2r8() together with the i force
+  * address inside ff and the shift-force address inside fr->fshift.
+  * This force-only variant does not reference kernel_data and accumulates no
+  * potential energies. The flop estimate is reported through inc_nrnb().
+  */
+ void
+ nb_kernel_ElecEw_VdwLJ_GeomP1P1_F_sparc64_hpc_ace_double
+                     (t_nblist * gmx_restrict                nlist,
+                      rvec * gmx_restrict                    xx,
+                      rvec * gmx_restrict                    ff,
+                      t_forcerec * gmx_restrict              fr,
+                      t_mdatoms * gmx_restrict               mdatoms,
+                      nb_kernel_data_t * gmx_restrict        kernel_data,
+                      t_nrnb * gmx_restrict                  nrnb)
+ {
+     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+      * just 0 for non-waters.
+      * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+      * jnr indices corresponding to data put in the two positions in the SIMD register
+      * (the register is viewed as long long int i[2] through the union declared below).
+      *
+      * Several locals declared below (e.g. rcutoff_scalar, tx/ty/tz, crf/krf/krf2, velecsum,
+      * vvdwsum, dummy_mask, ggid) are never referenced in this force-only kernel, and gid,
+      * sh_ewald and ewtabhalfspace are assigned but never read here.
+      * NOTE(review): presumably left over from the shared kernel generator template - confirm.
+      */
+     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+     int              jnrA,jnrB;
+     int              j_coord_offsetA,j_coord_offsetB;
+     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+     real             rcutoff_scalar;
+     real             *shiftvec,*fshift,*x,*f;
+     _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+     int              vdwioffset0;
+     _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+     int              vdwjidx0A,vdwjidx0B;
+     _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+     _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+     _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+     real             *charge;
+     int              nvdwtype;
+     _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+     int              *vdwtype;
+     real             *vdwparam;
+     _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+     _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+     _fjsp_v2r8       ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+     real             *ewtab;
+     _fjsp_v2r8       itab_tmp;
+     _fjsp_v2r8       dummy_mask,cutoff_mask;
+     _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+     _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+     union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+     x                = xx[0];
+     f                = ff[0];
+     nri              = nlist->nri;
+     iinr             = nlist->iinr;
+     jindex           = nlist->jindex;
+     jjnr             = nlist->jjnr;
+     shiftidx         = nlist->shift;
+     gid              = nlist->gid;
+     shiftvec         = fr->shift_vec[0];
+     fshift           = fr->fshift[0];
+     facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+     charge           = mdatoms->chargeA;
+     nvdwtype         = fr->ntype;
+     vdwparam         = fr->nbfp;
+     vdwtype          = mdatoms->typeA;
+     sh_ewald         = gmx_fjsp_set1_v2r8(fr->ic->sh_ewald);
+     ewtab            = fr->ic->tabq_coul_F;
+     ewtabscale       = gmx_fjsp_set1_v2r8(fr->ic->tabq_scale);
+     ewtabhalfspace   = gmx_fjsp_set1_v2r8(0.5/fr->ic->tabq_scale);
+     /* Give the j indices/offsets and the iteration counters defined start values,
+      * which also avoids compiler warnings about possibly uninitialized variables */
+     jnrA = jnrB = 0;
+     j_coord_offsetA = 0;
+     j_coord_offsetB = 0;
+     outeriter        = 0;
+     inneriter        = 0;
+     /* Start outer loop over neighborlists: one iteration per i particle entry */
+     for(iidx=0; iidx<nri; iidx++)
+     {
+         /* Load shift vector for this list (offset in reals, DIM per shift index) */
+         i_shift_offset   = DIM*shiftidx[iidx];
+         /* Load limits for loop over neighbors: j entries are jjnr[j_index_start .. j_index_end-1] */
+         j_index_start    = jindex[iidx];
+         j_index_end      = jindex[iidx+1];
+         /* Get outer coordinate index */
+         inr              = iinr[iidx];
+         i_coord_offset   = DIM*inr;
+         /* Load i particle coords and add shift vector */
+         gmx_fjsp_load_shift_and_1rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
+         fix0             = _fjsp_setzero_v2r8();
+         fiy0             = _fjsp_setzero_v2r8();
+         fiz0             = _fjsp_setzero_v2r8();
+         /* Load parameters for i particles: the i charge is pre-multiplied by facel (fr->epsfac),
+          * and vdwioffset0 is the i-type part of the offset into vdwparam used for the c6/c12 lookup */
+         iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_load1_v2r8(charge+inr+0));
+         vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+         /* Start inner kernel loop: j neighbors are consumed in pairs (A,B); the loop bound
+          * j_index_end-1 leaves a possible last unpaired neighbor to the block after the loop */
+         for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+         {
+             /* Get j neighbor index, and coordinate index */
+             jnrA             = jjnr[jidx];
+             jnrB             = jjnr[jidx+1];
+             j_coord_offsetA  = DIM*jnrA;
+             j_coord_offsetB  = DIM*jnrB;
+             /* load j atom coordinates */
+             gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                               &jx0,&jy0,&jz0);
+             /* Calculate displacement vector */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             /* Calculate squared distance and things based on it */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+             rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+             /* Load parameters for j particles */
+             jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+             vdwjidx0A        = 2*vdwtype[jnrA+0];
+             vdwjidx0B        = 2*vdwtype[jnrB+0];
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+             /* Compute parameters for interactions between i and j atoms */
+             qq00             = _fjsp_mul_v2r8(iq0,jq0);
+             gmx_fjsp_load_2pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,
+                                          vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
+             /* EWALD ELECTROSTATICS
+              * Uses the force-only table fr->ic->tabq_coul_F: for each element the pair of
+              * consecutive entries starting at the integer table index is loaded into
+              * ewtabF/ewtabFn and combined with the fractional part eweps.
+              * NOTE(review): presumably a linear interpolation (1-eweps)*ewtabF + eweps*ewtabFn,
+              * assuming _fjsp_nmsub_v2r8(a,b,c) = c - a*b - confirm against the intrinsic docs. */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r00,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                          &ewtabF,&ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,rinv00),_fjsp_sub_v2r8(rinvsq00,felec));
+             /* LENNARD-JONES DISPERSION/REPULSION (force only; no potential is formed here) */
+             rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+             fvdw             = _fjsp_mul_v2r8(_fjsp_msub_v2r8(c12_00,rinvsix,c6_00),_fjsp_mul_v2r8(rinvsix,rinvsq00));
+             fscal            = _fjsp_add_v2r8(felec,fvdw);
+             /* Update vectorial force: accumulate fscal times the displacement into the i force sums */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             
+             gmx_fjsp_decrement_fma_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fscal,dx00,dy00,dz00);
+             /* Inner loop uses 46 flops */
+         }
+         if(jidx<j_index_end)
+         {
+             jnrA             = jjnr[jidx];
+             j_coord_offsetA  = DIM*jnrA;
+             /* Remainder step for an odd neighbor count: only neighbor A exists, so the
+              * single-pointer helper variants are used. Load j atom coordinates */
+             gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                               &jx0,&jy0,&jz0);
+             /* Calculate displacement vector */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             /* Calculate squared distance and things based on it */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+             rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+             /* Load parameters for the single j particle; the charge is loaded with
+              * _fjsp_loadl_v2r8 on top of a zero vector.
+              * NOTE(review): presumably this leaves the second element at zero - confirm. */
+             jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
+             vdwjidx0A        = 2*vdwtype[jnrA+0];
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+             /* Compute parameters for interactions between i and j atoms */
+             qq00             = _fjsp_mul_v2r8(iq0,jq0);
+             gmx_fjsp_load_1pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
+             /* EWALD ELECTROSTATICS (same table lookup as in the paired loop, but only
+              * the table index in ewconv.i[0] is used) */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r00,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,rinv00),_fjsp_sub_v2r8(rinvsq00,felec));
+             /* LENNARD-JONES DISPERSION/REPULSION (force only) */
+             rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+             fvdw             = _fjsp_mul_v2r8(_fjsp_msub_v2r8(c12_00,rinvsix,c6_00),_fjsp_mul_v2r8(rinvsix,rinvsq00));
+             fscal            = _fjsp_add_v2r8(felec,fvdw);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force. fscal was just combined with a zero vector through
+              * _fjsp_unpacklo_v2r8.
+              * NOTE(review): presumably so that the unused second element contributes no force - confirm. */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             
+             gmx_fjsp_decrement_fma_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fscal,dx00,dy00,dz00);
+             /* Inner loop uses 46 flops */
+         }
+         /* End of innermost loop: pass the accumulated i force sums to the helper along with
+          * the i force address in f and the shift-force address in fshift */
+         gmx_fjsp_update_iforce_1atom_swizzle_v2r8(fix0,fiy0,fiz0,
+                                               f+i_coord_offset,fshift+i_shift_offset);
+         /* Increment number of inner iterations (counts individual j neighbors, not pairs) */
+         inneriter                  += j_index_end - j_index_start;
+         /* Outer loop uses 7 flops */
+     }
+     /* Increment number of outer iterations */
+     outeriter        += nri;
+     /* Update outer/inner flops: 7 per outer iteration and 46 per j neighbor,
+      * booked under eNR_NBKERNEL_ELEC_VDW_F */
+     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_F,outeriter*7 + inneriter*46);
+ }
index 0000000000000000000000000000000000000000,b82d07d30be7adcdf3b34339a9703d1ce3d70d1c..b82d07d30be7adcdf3b34339a9703d1ce3d70d1c
mode 000000,100644..100644
--- /dev/null
@@@ -1,0 -1,1034 +1,1034 @@@
+ /*
+  * This file is part of the GROMACS molecular simulation package.
+  *
+  * Copyright (c) 2012, by the GROMACS development team, led by
+  * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+  * others, as listed in the AUTHORS file in the top-level source
+  * directory and at http://www.gromacs.org.
+  *
+  * GROMACS is free software; you can redistribute it and/or
+  * modify it under the terms of the GNU Lesser General Public License
+  * as published by the Free Software Foundation; either version 2.1
+  * of the License, or (at your option) any later version.
+  *
+  * GROMACS is distributed in the hope that it will be useful,
+  * but WITHOUT ANY WARRANTY; without even the implied warranty of
+  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  * Lesser General Public License for more details.
+  *
+  * You should have received a copy of the GNU Lesser General Public
+  * License along with GROMACS; if not, see
+  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+  *
+  * If you want to redistribute modifications to GROMACS, please
+  * consider that scientific software is very special. Version
+  * control is crucial - bugs must be traceable. We will be happy to
+  * consider code for inclusion in the official distribution, but
+  * derived work must not be called official GROMACS. Details are found
+  * in the README & COPYING files - if they are missing, get the
+  * official version at http://www.gromacs.org.
+  *
+  * To help us fund GROMACS development, we humbly ask that you cite
+  * the research papers on the package. Check out http://www.gromacs.org.
+  */
+ /*
+  * Note: this file was generated by the GROMACS sparc64_hpc_ace_double kernel generator.
+  */
+ #ifdef HAVE_CONFIG_H
+ #include <config.h>
+ #endif
+ #include <math.h>
+ #include "../nb_kernel.h"
+ #include "types/simple.h"
+ #include "vec.h"
+ #include "nrnb.h"
+ #include "kernelutil_sparc64_hpc_ace_double.h"
+ /*
+  * Gromacs nonbonded kernel:   nb_kernel_ElecEw_VdwLJ_GeomW3P1_VF_sparc64_hpc_ace_double
+  * Electrostatics interaction: Ewald
+  * VdW interaction:            LennardJones
+  * Geometry:                   Water3-Particle
+  * Calculate force/pot:        PotentialAndForce
+  */
+ void
+ nb_kernel_ElecEw_VdwLJ_GeomW3P1_VF_sparc64_hpc_ace_double
+                     (t_nblist * gmx_restrict                nlist,
+                      rvec * gmx_restrict                    xx,
+                      rvec * gmx_restrict                    ff,
+                      t_forcerec * gmx_restrict              fr,
+                      t_mdatoms * gmx_restrict               mdatoms,
+                      nb_kernel_data_t * gmx_restrict        kernel_data,
+                      t_nrnb * gmx_restrict                  nrnb)
+ {
+     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+      * just 0 for non-waters.
+      * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+      * jnr indices corresponding to data put in the four positions in the SIMD register.
+      */
+     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+     int              jnrA,jnrB;
+     int              j_coord_offsetA,j_coord_offsetB;
+     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+     real             rcutoff_scalar;
+     real             *shiftvec,*fshift,*x,*f;
+     _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+     int              vdwioffset0;
+     _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+     int              vdwioffset1;
+     _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+     int              vdwioffset2;
+     _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+     int              vdwjidx0A,vdwjidx0B;
+     _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+     _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+     _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+     _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+     _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+     real             *charge;
+     int              nvdwtype;
+     _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+     int              *vdwtype;
+     real             *vdwparam;
+     _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+     _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+     _fjsp_v2r8       ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+     real             *ewtab;
+     _fjsp_v2r8       itab_tmp;
+     _fjsp_v2r8       dummy_mask,cutoff_mask;
+     _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+     _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+     union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+     x                = xx[0];
+     f                = ff[0];
+     nri              = nlist->nri;
+     iinr             = nlist->iinr;
+     jindex           = nlist->jindex;
+     jjnr             = nlist->jjnr;
+     shiftidx         = nlist->shift;
+     gid              = nlist->gid;
+     shiftvec         = fr->shift_vec[0];
+     fshift           = fr->fshift[0];
+     facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+     charge           = mdatoms->chargeA;
+     nvdwtype         = fr->ntype;
+     vdwparam         = fr->nbfp;
+     vdwtype          = mdatoms->typeA;
+     sh_ewald         = gmx_fjsp_set1_v2r8(fr->ic->sh_ewald);
+     ewtab            = fr->ic->tabq_coul_FDV0;
+     ewtabscale       = gmx_fjsp_set1_v2r8(fr->ic->tabq_scale);
+     ewtabhalfspace   = gmx_fjsp_set1_v2r8(0.5/fr->ic->tabq_scale);
+     /* Setup water-specific parameters */
+     inr              = nlist->iinr[0];
+     iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+0]));
+     iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+     iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+     vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+     /* Avoid stupid compiler warnings */
+     jnrA = jnrB = 0;
+     j_coord_offsetA = 0;
+     j_coord_offsetB = 0;
+     outeriter        = 0;
+     inneriter        = 0;
+     /* Start outer loop over neighborlists */
+     for(iidx=0; iidx<nri; iidx++)
+     {
+         /* Load shift vector for this list */
+         i_shift_offset   = DIM*shiftidx[iidx];
+         /* Load limits for loop over neighbors */
+         j_index_start    = jindex[iidx];
+         j_index_end      = jindex[iidx+1];
+         /* Get outer coordinate index */
+         inr              = iinr[iidx];
+         i_coord_offset   = DIM*inr;
+         /* Load i particle coords and add shift vector */
+         gmx_fjsp_load_shift_and_3rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                  &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+         fix0             = _fjsp_setzero_v2r8();
+         fiy0             = _fjsp_setzero_v2r8();
+         fiz0             = _fjsp_setzero_v2r8();
+         fix1             = _fjsp_setzero_v2r8();
+         fiy1             = _fjsp_setzero_v2r8();
+         fiz1             = _fjsp_setzero_v2r8();
+         fix2             = _fjsp_setzero_v2r8();
+         fiy2             = _fjsp_setzero_v2r8();
+         fiz2             = _fjsp_setzero_v2r8();
+         /* Reset potential sums */
+         velecsum         = _fjsp_setzero_v2r8();
+         vvdwsum          = _fjsp_setzero_v2r8();
+         /* Start inner kernel loop */
+         for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+         {
+             /* Get j neighbor index, and coordinate index */
+             jnrA             = jjnr[jidx];
+             jnrB             = jjnr[jidx+1];
+             j_coord_offsetA  = DIM*jnrA;
+             j_coord_offsetB  = DIM*jnrB;
+             /* load j atom coordinates */
+             gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                               &jx0,&jy0,&jz0);
+             /* Calculate displacement vector */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             dx10             = _fjsp_sub_v2r8(ix1,jx0);
+             dy10             = _fjsp_sub_v2r8(iy1,jy0);
+             dz10             = _fjsp_sub_v2r8(iz1,jz0);
+             dx20             = _fjsp_sub_v2r8(ix2,jx0);
+             dy20             = _fjsp_sub_v2r8(iy2,jy0);
+             dz20             = _fjsp_sub_v2r8(iz2,jz0);
+             /* Calculate squared distance and things based on it */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+             rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+             rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+             rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+             rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+             rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+             rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+             rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+             /* Load parameters for j particles */
+             jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+             vdwjidx0A        = 2*vdwtype[jnrA+0];
+             vdwjidx0B        = 2*vdwtype[jnrB+0];
+             fjx0             = _fjsp_setzero_v2r8();
+             fjy0             = _fjsp_setzero_v2r8();
+             fjz0             = _fjsp_setzero_v2r8();
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+             /* Compute parameters for interactions between i and j atoms */
+             qq00             = _fjsp_mul_v2r8(iq0,jq0);
+             gmx_fjsp_load_2pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,
+                                          vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r00,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq00,_fjsp_sub_v2r8(rinv00,velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,rinv00),_fjsp_sub_v2r8(rinvsq00,felec));
+             /* LENNARD-JONES DISPERSION/REPULSION */
+             rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+             vvdw6            = _fjsp_mul_v2r8(c6_00,rinvsix);
+             vvdw12           = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
+             vvdw             = _fjsp_msub_v2r8( vvdw12,one_twelfth, _fjsp_mul_v2r8(vvdw6,one_sixth) );
+             fvdw             = _fjsp_mul_v2r8(_fjsp_sub_v2r8(vvdw12,vvdw6),rinvsq00);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+             fscal            = _fjsp_add_v2r8(felec,fvdw);
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             
+             fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+             /* Compute parameters for interactions between i and j atoms */
+             qq10             = _fjsp_mul_v2r8(iq1,jq0);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r10,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq10,_fjsp_sub_v2r8(rinv10,velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,rinv10),_fjsp_sub_v2r8(rinvsq10,felec));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+             
+             fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+             /* Compute parameters for interactions between i and j atoms */
+             qq20             = _fjsp_mul_v2r8(iq2,jq0);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r20,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq20,_fjsp_sub_v2r8(rinv20,velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,rinv20),_fjsp_sub_v2r8(rinvsq20,felec));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+             
+             fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+             gmx_fjsp_decrement_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0);
+             /* Inner loop uses 147 flops */
+         }
+         if(jidx<j_index_end)
+         {
+             jnrA             = jjnr[jidx];
+             j_coord_offsetA  = DIM*jnrA;
+             /* load j atom coordinates */
+             gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                               &jx0,&jy0,&jz0);
+             /* Calculate displacement vector */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             dx10             = _fjsp_sub_v2r8(ix1,jx0);
+             dy10             = _fjsp_sub_v2r8(iy1,jy0);
+             dz10             = _fjsp_sub_v2r8(iz1,jz0);
+             dx20             = _fjsp_sub_v2r8(ix2,jx0);
+             dy20             = _fjsp_sub_v2r8(iy2,jy0);
+             dz20             = _fjsp_sub_v2r8(iz2,jz0);
+             /* Calculate squared distance and things based on it */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+             rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+             rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+             rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+             rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+             rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+             rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+             rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+             /* Load parameters for j particles */
+             jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
+             vdwjidx0A        = 2*vdwtype[jnrA+0];
+             fjx0             = _fjsp_setzero_v2r8();
+             fjy0             = _fjsp_setzero_v2r8();
+             fjz0             = _fjsp_setzero_v2r8();
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+             /* Compute parameters for interactions between i and j atoms */
+             qq00             = _fjsp_mul_v2r8(iq0,jq0);
+             gmx_fjsp_load_1pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r00,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq00,_fjsp_sub_v2r8(rinv00,velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,rinv00),_fjsp_sub_v2r8(rinvsq00,felec));
+             /* LENNARD-JONES DISPERSION/REPULSION */
+             rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+             vvdw6            = _fjsp_mul_v2r8(c6_00,rinvsix);
+             vvdw12           = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
+             vvdw             = _fjsp_msub_v2r8( vvdw12,one_twelfth, _fjsp_mul_v2r8(vvdw6,one_sixth) );
+             fvdw             = _fjsp_mul_v2r8(_fjsp_sub_v2r8(vvdw12,vvdw6),rinvsq00);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             vvdw             = _fjsp_unpacklo_v2r8(vvdw,_fjsp_setzero_v2r8());
+             vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+             fscal            = _fjsp_add_v2r8(felec,fvdw);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             
+             fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+             /* Compute parameters for interactions between i and j atoms */
+             qq10             = _fjsp_mul_v2r8(iq1,jq0);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r10,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq10,_fjsp_sub_v2r8(rinv10,velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,rinv10),_fjsp_sub_v2r8(rinvsq10,felec));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+             
+             fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+             /* Compute parameters for interactions between i and j atoms */
+             qq20             = _fjsp_mul_v2r8(iq2,jq0);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r20,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq20,_fjsp_sub_v2r8(rinv20,velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,rinv20),_fjsp_sub_v2r8(rinvsq20,felec));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+             
+             fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+             gmx_fjsp_decrement_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0);
+             /* Inner loop uses 147 flops */
+         }
+         /* End of innermost loop */
+         gmx_fjsp_update_iforce_3atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                               f+i_coord_offset,fshift+i_shift_offset);
+         ggid                        = gid[iidx];
+         /* Update potential energies */
+         gmx_fjsp_update_1pot_v2r8(velecsum,kernel_data->energygrp_elec+ggid);
+         gmx_fjsp_update_1pot_v2r8(vvdwsum,kernel_data->energygrp_vdw+ggid);
+         /* Increment number of inner iterations */
+         inneriter                  += j_index_end - j_index_start;
+         /* Outer loop uses 20 flops */
+     }
+     /* Increment number of outer iterations */
+     outeriter        += nri;
+     /* Update outer/inner flops */
+     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3_VF,outeriter*20 + inneriter*147);
+ }
+ /*
+  * Gromacs nonbonded kernel:   nb_kernel_ElecEw_VdwLJ_GeomW3P1_F_sparc64_hpc_ace_double
+  * Electrostatics interaction: Ewald
+  * VdW interaction:            LennardJones
+  * Geometry:                   Water3-Particle
+  * Calculate force/pot:        Force
+  */
+ void
+ nb_kernel_ElecEw_VdwLJ_GeomW3P1_F_sparc64_hpc_ace_double
+                     (t_nblist * gmx_restrict                nlist,
+                      rvec * gmx_restrict                    xx,
+                      rvec * gmx_restrict                    ff,
+                      t_forcerec * gmx_restrict              fr,
+                      t_mdatoms * gmx_restrict               mdatoms,
+                      nb_kernel_data_t * gmx_restrict        kernel_data,
+                      t_nrnb * gmx_restrict                  nrnb)
+ {
+     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+      * just 0 for non-waters.
+      * Suffixes A,B refer to j loop unrolling done with double precision SIMD, i.e. the two different
+      * jnr indices whose data are placed in the two positions of the SIMD register.
+      */
+     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+     int              jnrA,jnrB;
+     int              j_coord_offsetA,j_coord_offsetB;
+     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+     real             rcutoff_scalar;
+     real             *shiftvec,*fshift,*x,*f;
+     _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+     int              vdwioffset0;
+     _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+     int              vdwioffset1;
+     _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+     int              vdwioffset2;
+     _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+     int              vdwjidx0A,vdwjidx0B;
+     _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+     _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+     _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+     _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+     _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+     real             *charge;
+     int              nvdwtype;
+     _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+     int              *vdwtype;
+     real             *vdwparam;
+     _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+     _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+     _fjsp_v2r8       ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+     real             *ewtab;
+     _fjsp_v2r8       itab_tmp;
+     _fjsp_v2r8       dummy_mask,cutoff_mask;
+     _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+     _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+     union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+     x                = xx[0];
+     f                = ff[0];
+     nri              = nlist->nri;
+     iinr             = nlist->iinr;
+     jindex           = nlist->jindex;
+     jjnr             = nlist->jjnr;
+     shiftidx         = nlist->shift;
+     gid              = nlist->gid;
+     shiftvec         = fr->shift_vec[0];
+     fshift           = fr->fshift[0];
+     facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+     charge           = mdatoms->chargeA;
+     nvdwtype         = fr->ntype;
+     vdwparam         = fr->nbfp;
+     vdwtype          = mdatoms->typeA;
+     sh_ewald         = gmx_fjsp_set1_v2r8(fr->ic->sh_ewald);
+     ewtab            = fr->ic->tabq_coul_F;
+     ewtabscale       = gmx_fjsp_set1_v2r8(fr->ic->tabq_scale);
+     ewtabhalfspace   = gmx_fjsp_set1_v2r8(0.5/fr->ic->tabq_scale);
+     /* Setup water-specific parameters */
+     inr              = nlist->iinr[0];
+     iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+0]));
+     iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+     iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+     vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+     /* Avoid stupid compiler warnings */
+     jnrA = jnrB = 0;
+     j_coord_offsetA = 0;
+     j_coord_offsetB = 0;
+     outeriter        = 0;
+     inneriter        = 0;
+     /* Start outer loop over neighborlists */
+     for(iidx=0; iidx<nri; iidx++)
+     {
+         /* Load shift vector for this list */
+         i_shift_offset   = DIM*shiftidx[iidx];
+         /* Load limits for loop over neighbors */
+         j_index_start    = jindex[iidx];
+         j_index_end      = jindex[iidx+1];
+         /* Get outer coordinate index */
+         inr              = iinr[iidx];
+         i_coord_offset   = DIM*inr;
+         /* Load i particle coords and add shift vector */
+         gmx_fjsp_load_shift_and_3rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                  &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+         fix0             = _fjsp_setzero_v2r8();
+         fiy0             = _fjsp_setzero_v2r8();
+         fiz0             = _fjsp_setzero_v2r8();
+         fix1             = _fjsp_setzero_v2r8();
+         fiy1             = _fjsp_setzero_v2r8();
+         fiz1             = _fjsp_setzero_v2r8();
+         fix2             = _fjsp_setzero_v2r8();
+         fiy2             = _fjsp_setzero_v2r8();
+         fiz2             = _fjsp_setzero_v2r8();
+         /* Start inner kernel loop */
+         for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+         {
+             /* Get j neighbor index, and coordinate index */
+             jnrA             = jjnr[jidx];
+             jnrB             = jjnr[jidx+1];
+             j_coord_offsetA  = DIM*jnrA;
+             j_coord_offsetB  = DIM*jnrB;
+             /* load j atom coordinates */
+             gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                               &jx0,&jy0,&jz0);
+             /* Calculate displacement vector */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             dx10             = _fjsp_sub_v2r8(ix1,jx0);
+             dy10             = _fjsp_sub_v2r8(iy1,jy0);
+             dz10             = _fjsp_sub_v2r8(iz1,jz0);
+             dx20             = _fjsp_sub_v2r8(ix2,jx0);
+             dy20             = _fjsp_sub_v2r8(iy2,jy0);
+             dz20             = _fjsp_sub_v2r8(iz2,jz0);
+             /* Calculate squared distance and things based on it */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+             rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+             rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+             rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+             rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+             rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+             rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+             rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+             /* Load parameters for j particles */
+             jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+             vdwjidx0A        = 2*vdwtype[jnrA+0];
+             vdwjidx0B        = 2*vdwtype[jnrB+0];
+             fjx0             = _fjsp_setzero_v2r8();
+             fjy0             = _fjsp_setzero_v2r8();
+             fjz0             = _fjsp_setzero_v2r8();
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+             /* Compute parameters for interactions between i and j atoms */
+             qq00             = _fjsp_mul_v2r8(iq0,jq0);
+             gmx_fjsp_load_2pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,
+                                          vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r00,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                          &ewtabF,&ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,rinv00),_fjsp_sub_v2r8(rinvsq00,felec));
+             /* LENNARD-JONES DISPERSION/REPULSION */
+             rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+             fvdw             = _fjsp_mul_v2r8(_fjsp_msub_v2r8(c12_00,rinvsix,c6_00),_fjsp_mul_v2r8(rinvsix,rinvsq00));
+             fscal            = _fjsp_add_v2r8(felec,fvdw);
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             
+             fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+             /* Compute parameters for interactions between i and j atoms */
+             qq10             = _fjsp_mul_v2r8(iq1,jq0);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r10,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                          &ewtabF,&ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,rinv10),_fjsp_sub_v2r8(rinvsq10,felec));
+             fscal            = felec;
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+             
+             fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+             /* Compute parameters for interactions between i and j atoms */
+             qq20             = _fjsp_mul_v2r8(iq2,jq0);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r20,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                          &ewtabF,&ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,rinv20),_fjsp_sub_v2r8(rinvsq20,felec));
+             fscal            = felec;
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+             
+             fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+             gmx_fjsp_decrement_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0);
+             /* Inner loop uses 127 flops */
+         }
+         if(jidx<j_index_end)
+         {
+             jnrA             = jjnr[jidx];
+             j_coord_offsetA  = DIM*jnrA;
+             /* load j atom coordinates */
+             gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                               &jx0,&jy0,&jz0);
+             /* Calculate displacement vector */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             dx10             = _fjsp_sub_v2r8(ix1,jx0);
+             dy10             = _fjsp_sub_v2r8(iy1,jy0);
+             dz10             = _fjsp_sub_v2r8(iz1,jz0);
+             dx20             = _fjsp_sub_v2r8(ix2,jx0);
+             dy20             = _fjsp_sub_v2r8(iy2,jy0);
+             dz20             = _fjsp_sub_v2r8(iz2,jz0);
+             /* Calculate squared distance and things based on it */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+             rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+             rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+             rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+             rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+             rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+             rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+             rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+             /* Load parameters for j particles */
+             jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
+             vdwjidx0A        = 2*vdwtype[jnrA+0];
+             fjx0             = _fjsp_setzero_v2r8();
+             fjy0             = _fjsp_setzero_v2r8();
+             fjz0             = _fjsp_setzero_v2r8();
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+             /* Compute parameters for interactions between i and j atoms */
+             qq00             = _fjsp_mul_v2r8(iq0,jq0);
+             gmx_fjsp_load_1pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r00,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,rinv00),_fjsp_sub_v2r8(rinvsq00,felec));
+             /* LENNARD-JONES DISPERSION/REPULSION */
+             rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+             fvdw             = _fjsp_mul_v2r8(_fjsp_msub_v2r8(c12_00,rinvsix,c6_00),_fjsp_mul_v2r8(rinvsix,rinvsq00));
+             fscal            = _fjsp_add_v2r8(felec,fvdw);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             
+             fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+             /* Compute parameters for interactions between i and j atoms */
+             qq10             = _fjsp_mul_v2r8(iq1,jq0);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r10,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,rinv10),_fjsp_sub_v2r8(rinvsq10,felec));
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+             
+             fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+             /* Compute parameters for interactions between i and j atoms */
+             qq20             = _fjsp_mul_v2r8(iq2,jq0);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r20,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,rinv20),_fjsp_sub_v2r8(rinvsq20,felec));
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+             
+             fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+             gmx_fjsp_decrement_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0);
+             /* Inner loop uses 127 flops */
+         }
+         /* End of innermost loop */
+         gmx_fjsp_update_iforce_3atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                               f+i_coord_offset,fshift+i_shift_offset);
+         /* Increment number of inner iterations */
+         inneriter                  += j_index_end - j_index_start;
+         /* Outer loop uses 18 flops */
+     }
+     /* Increment number of outer iterations */
+     outeriter        += nri;
+     /* Update outer/inner flops */
+     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3_F,outeriter*18 + inneriter*127);
+ }
index 0000000000000000000000000000000000000000,a4d4afe206d61f9577cb0f8f27745a110375db3a..a4d4afe206d61f9577cb0f8f27745a110375db3a
mode 000000,100644..100644
--- /dev/null
@@@ -1,0 -1,2046 +1,2046 @@@
+ /*
+  * This file is part of the GROMACS molecular simulation package.
+  *
+  * Copyright (c) 2012, by the GROMACS development team, led by
+  * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+  * others, as listed in the AUTHORS file in the top-level source
+  * directory and at http://www.gromacs.org.
+  *
+  * GROMACS is free software; you can redistribute it and/or
+  * modify it under the terms of the GNU Lesser General Public License
+  * as published by the Free Software Foundation; either version 2.1
+  * of the License, or (at your option) any later version.
+  *
+  * GROMACS is distributed in the hope that it will be useful,
+  * but WITHOUT ANY WARRANTY; without even the implied warranty of
+  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  * Lesser General Public License for more details.
+  *
+  * You should have received a copy of the GNU Lesser General Public
+  * License along with GROMACS; if not, see
+  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+  *
+  * If you want to redistribute modifications to GROMACS, please
+  * consider that scientific software is very special. Version
+  * control is crucial - bugs must be traceable. We will be happy to
+  * consider code for inclusion in the official distribution, but
+  * derived work must not be called official GROMACS. Details are found
+  * in the README & COPYING files - if they are missing, get the
+  * official version at http://www.gromacs.org.
+  *
+  * To help us fund GROMACS development, we humbly ask that you cite
+  * the research papers on the package. Check out http://www.gromacs.org.
+  */
+ /*
+  * Note: this file was generated by the GROMACS sparc64_hpc_ace_double kernel generator.
+  */
+ #ifdef HAVE_CONFIG_H
+ #include <config.h>
+ #endif
+ #include <math.h>
+ #include "../nb_kernel.h"
+ #include "types/simple.h"
+ #include "vec.h"
+ #include "nrnb.h"
+ #include "kernelutil_sparc64_hpc_ace_double.h"
+ /*
+  * Gromacs nonbonded kernel:   nb_kernel_ElecEw_VdwLJ_GeomW3W3_VF_sparc64_hpc_ace_double
+  * Electrostatics interaction: Ewald
+  * VdW interaction:            LennardJones
+  * Geometry:                   Water3-Water3
+  * Calculate force/pot:        PotentialAndForce
+  */
+ void
+ nb_kernel_ElecEw_VdwLJ_GeomW3W3_VF_sparc64_hpc_ace_double
+                     (t_nblist * gmx_restrict                nlist,
+                      rvec * gmx_restrict                    xx,
+                      rvec * gmx_restrict                    ff,
+                      t_forcerec * gmx_restrict              fr,
+                      t_mdatoms * gmx_restrict               mdatoms,
+                      nb_kernel_data_t * gmx_restrict        kernel_data,
+                      t_nrnb * gmx_restrict                  nrnb)
+ {
+     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+      * just 0 for non-waters.
+      * Suffixes A,B refer to the j loop unrolling done with double precision SIMD, i.e. the two different
+      * jnr indices corresponding to the data put in the two positions of the SIMD register.
+      */
+     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+     int              jnrA,jnrB;
+     int              j_coord_offsetA,j_coord_offsetB;
+     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+     real             rcutoff_scalar;
+     real             *shiftvec,*fshift,*x,*f;
+     _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+     int              vdwioffset0;
+     _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+     int              vdwioffset1;
+     _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+     int              vdwioffset2;
+     _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+     int              vdwjidx0A,vdwjidx0B;
+     _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+     int              vdwjidx1A,vdwjidx1B;
+     _fjsp_v2r8       jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+     int              vdwjidx2A,vdwjidx2B;
+     _fjsp_v2r8       jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+     _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+     _fjsp_v2r8       dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
+     _fjsp_v2r8       dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
+     _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+     _fjsp_v2r8       dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+     _fjsp_v2r8       dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+     _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+     _fjsp_v2r8       dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+     _fjsp_v2r8       dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+     _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+     real             *charge;
+     int              nvdwtype;
+     _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+     int              *vdwtype;
+     real             *vdwparam;
+     _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+     _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+     _fjsp_v2r8       ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+     real             *ewtab;
+     _fjsp_v2r8       itab_tmp;
+     _fjsp_v2r8       dummy_mask,cutoff_mask;
+     _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+     _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+     union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+     x                = xx[0];
+     f                = ff[0];
+     nri              = nlist->nri;
+     iinr             = nlist->iinr;
+     jindex           = nlist->jindex;
+     jjnr             = nlist->jjnr;
+     shiftidx         = nlist->shift;
+     gid              = nlist->gid;
+     shiftvec         = fr->shift_vec[0];
+     fshift           = fr->fshift[0];
+     facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+     charge           = mdatoms->chargeA;
+     nvdwtype         = fr->ntype;
+     vdwparam         = fr->nbfp;
+     vdwtype          = mdatoms->typeA;
+     sh_ewald         = gmx_fjsp_set1_v2r8(fr->ic->sh_ewald);
+     ewtab            = fr->ic->tabq_coul_FDV0;
+     ewtabscale       = gmx_fjsp_set1_v2r8(fr->ic->tabq_scale);
+     ewtabhalfspace   = gmx_fjsp_set1_v2r8(0.5/fr->ic->tabq_scale);
+     /* Setup water-specific parameters */
+     inr              = nlist->iinr[0];
+     iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+0]));
+     iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+     iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+     vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+     jq0              = gmx_fjsp_set1_v2r8(charge[inr+0]);
+     jq1              = gmx_fjsp_set1_v2r8(charge[inr+1]);
+     jq2              = gmx_fjsp_set1_v2r8(charge[inr+2]);
+     vdwjidx0A        = 2*vdwtype[inr+0];
+     qq00             = _fjsp_mul_v2r8(iq0,jq0);
+     c6_00            = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset0+vdwjidx0A]);
+     c12_00           = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset0+vdwjidx0A+1]);
+     qq01             = _fjsp_mul_v2r8(iq0,jq1);
+     qq02             = _fjsp_mul_v2r8(iq0,jq2);
+     qq10             = _fjsp_mul_v2r8(iq1,jq0);
+     qq11             = _fjsp_mul_v2r8(iq1,jq1);
+     qq12             = _fjsp_mul_v2r8(iq1,jq2);
+     qq20             = _fjsp_mul_v2r8(iq2,jq0);
+     qq21             = _fjsp_mul_v2r8(iq2,jq1);
+     qq22             = _fjsp_mul_v2r8(iq2,jq2);
+     /* Avoid stupid compiler warnings */
+     jnrA = jnrB = 0;
+     j_coord_offsetA = 0;
+     j_coord_offsetB = 0;
+     outeriter        = 0;
+     inneriter        = 0;
+     /* Start outer loop over neighborlists */
+     for(iidx=0; iidx<nri; iidx++)
+     {
+         /* Load shift vector for this list */
+         i_shift_offset   = DIM*shiftidx[iidx];
+         /* Load limits for loop over neighbors */
+         j_index_start    = jindex[iidx];
+         j_index_end      = jindex[iidx+1];
+         /* Get outer coordinate index */
+         inr              = iinr[iidx];
+         i_coord_offset   = DIM*inr;
+         /* Load i particle coords and add shift vector */
+         gmx_fjsp_load_shift_and_3rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                  &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+         fix0             = _fjsp_setzero_v2r8();
+         fiy0             = _fjsp_setzero_v2r8();
+         fiz0             = _fjsp_setzero_v2r8();
+         fix1             = _fjsp_setzero_v2r8();
+         fiy1             = _fjsp_setzero_v2r8();
+         fiz1             = _fjsp_setzero_v2r8();
+         fix2             = _fjsp_setzero_v2r8();
+         fiy2             = _fjsp_setzero_v2r8();
+         fiz2             = _fjsp_setzero_v2r8();
+         /* Reset potential sums */
+         velecsum         = _fjsp_setzero_v2r8();
+         vvdwsum          = _fjsp_setzero_v2r8();
+         /* Start inner kernel loop */
+         for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+         {
+             /* Get j neighbor index, and coordinate index */
+             jnrA             = jjnr[jidx];
+             jnrB             = jjnr[jidx+1];
+             j_coord_offsetA  = DIM*jnrA;
+             j_coord_offsetB  = DIM*jnrB;
+             /* load j atom coordinates */
+             gmx_fjsp_load_3rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+             /* Calculate displacement vector */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             dx01             = _fjsp_sub_v2r8(ix0,jx1);
+             dy01             = _fjsp_sub_v2r8(iy0,jy1);
+             dz01             = _fjsp_sub_v2r8(iz0,jz1);
+             dx02             = _fjsp_sub_v2r8(ix0,jx2);
+             dy02             = _fjsp_sub_v2r8(iy0,jy2);
+             dz02             = _fjsp_sub_v2r8(iz0,jz2);
+             dx10             = _fjsp_sub_v2r8(ix1,jx0);
+             dy10             = _fjsp_sub_v2r8(iy1,jy0);
+             dz10             = _fjsp_sub_v2r8(iz1,jz0);
+             dx11             = _fjsp_sub_v2r8(ix1,jx1);
+             dy11             = _fjsp_sub_v2r8(iy1,jy1);
+             dz11             = _fjsp_sub_v2r8(iz1,jz1);
+             dx12             = _fjsp_sub_v2r8(ix1,jx2);
+             dy12             = _fjsp_sub_v2r8(iy1,jy2);
+             dz12             = _fjsp_sub_v2r8(iz1,jz2);
+             dx20             = _fjsp_sub_v2r8(ix2,jx0);
+             dy20             = _fjsp_sub_v2r8(iy2,jy0);
+             dz20             = _fjsp_sub_v2r8(iz2,jz0);
+             dx21             = _fjsp_sub_v2r8(ix2,jx1);
+             dy21             = _fjsp_sub_v2r8(iy2,jy1);
+             dz21             = _fjsp_sub_v2r8(iz2,jz1);
+             dx22             = _fjsp_sub_v2r8(ix2,jx2);
+             dy22             = _fjsp_sub_v2r8(iy2,jy2);
+             dz22             = _fjsp_sub_v2r8(iz2,jz2);
+             /* Calculate squared distance and things based on it */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rsq01            = gmx_fjsp_calc_rsq_v2r8(dx01,dy01,dz01);
+             rsq02            = gmx_fjsp_calc_rsq_v2r8(dx02,dy02,dz02);
+             rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+             rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+             rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+             rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+             rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+             rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+             rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+             rinv01           = gmx_fjsp_invsqrt_v2r8(rsq01);
+             rinv02           = gmx_fjsp_invsqrt_v2r8(rsq02);
+             rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+             rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+             rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+             rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+             rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+             rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+             rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+             rinvsq01         = _fjsp_mul_v2r8(rinv01,rinv01);
+             rinvsq02         = _fjsp_mul_v2r8(rinv02,rinv02);
+             rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+             rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+             rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+             rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+             rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+             rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+             fjx0             = _fjsp_setzero_v2r8();
+             fjy0             = _fjsp_setzero_v2r8();
+             fjz0             = _fjsp_setzero_v2r8();
+             fjx1             = _fjsp_setzero_v2r8();
+             fjy1             = _fjsp_setzero_v2r8();
+             fjz1             = _fjsp_setzero_v2r8();
+             fjx2             = _fjsp_setzero_v2r8();
+             fjy2             = _fjsp_setzero_v2r8();
+             fjz2             = _fjsp_setzero_v2r8();
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r00,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq00,_fjsp_sub_v2r8(rinv00,velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,rinv00),_fjsp_sub_v2r8(rinvsq00,felec));
+             /* LENNARD-JONES DISPERSION/REPULSION */
+             rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+             vvdw6            = _fjsp_mul_v2r8(c6_00,rinvsix);
+             vvdw12           = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
+             vvdw             = _fjsp_msub_v2r8( vvdw12,one_twelfth, _fjsp_mul_v2r8(vvdw6,one_sixth) );
+             fvdw             = _fjsp_mul_v2r8(_fjsp_sub_v2r8(vvdw12,vvdw6),rinvsq00);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+             fscal            = _fjsp_add_v2r8(felec,fvdw);
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             
+             fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r01              = _fjsp_mul_v2r8(rsq01,rinv01);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r01,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq01,_fjsp_sub_v2r8(rinv01,velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq01,rinv01),_fjsp_sub_v2r8(rinvsq01,felec));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx01,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy01,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz01,fscal,fiz0);
+             
+             fjx1             = _fjsp_madd_v2r8(dx01,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy01,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz01,fscal,fjz1);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r02              = _fjsp_mul_v2r8(rsq02,rinv02);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r02,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq02,_fjsp_sub_v2r8(rinv02,velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq02,rinv02),_fjsp_sub_v2r8(rinvsq02,felec));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx02,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy02,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz02,fscal,fiz0);
+             
+             fjx2             = _fjsp_madd_v2r8(dx02,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy02,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz02,fscal,fjz2);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r10,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq10,_fjsp_sub_v2r8(rinv10,velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,rinv10),_fjsp_sub_v2r8(rinvsq10,felec));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+             
+             fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r11              = _fjsp_mul_v2r8(rsq11,rinv11);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r11,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq11,_fjsp_sub_v2r8(rinv11,velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq11,rinv11),_fjsp_sub_v2r8(rinvsq11,felec));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+             
+             fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r12              = _fjsp_mul_v2r8(rsq12,rinv12);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r12,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq12,_fjsp_sub_v2r8(rinv12,velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq12,rinv12),_fjsp_sub_v2r8(rinvsq12,felec));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+             
+             fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r20,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq20,_fjsp_sub_v2r8(rinv20,velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,rinv20),_fjsp_sub_v2r8(rinvsq20,felec));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+             
+             fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r21              = _fjsp_mul_v2r8(rsq21,rinv21);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r21,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq21,_fjsp_sub_v2r8(rinv21,velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq21,rinv21),_fjsp_sub_v2r8(rinvsq21,felec));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+             
+             fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r22              = _fjsp_mul_v2r8(rsq22,rinv22);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r22,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq22,_fjsp_sub_v2r8(rinv22,velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq22,rinv22),_fjsp_sub_v2r8(rinvsq22,felec));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+             
+             fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+             gmx_fjsp_decrement_3rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+             /* Inner loop uses 408 flops */
+         }
+         if(jidx<j_index_end)
+         {
+             jnrA             = jjnr[jidx];
+             j_coord_offsetA  = DIM*jnrA;
+             /* load j atom coordinates */
+             gmx_fjsp_load_3rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+             /* Calculate displacement vector */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             dx01             = _fjsp_sub_v2r8(ix0,jx1);
+             dy01             = _fjsp_sub_v2r8(iy0,jy1);
+             dz01             = _fjsp_sub_v2r8(iz0,jz1);
+             dx02             = _fjsp_sub_v2r8(ix0,jx2);
+             dy02             = _fjsp_sub_v2r8(iy0,jy2);
+             dz02             = _fjsp_sub_v2r8(iz0,jz2);
+             dx10             = _fjsp_sub_v2r8(ix1,jx0);
+             dy10             = _fjsp_sub_v2r8(iy1,jy0);
+             dz10             = _fjsp_sub_v2r8(iz1,jz0);
+             dx11             = _fjsp_sub_v2r8(ix1,jx1);
+             dy11             = _fjsp_sub_v2r8(iy1,jy1);
+             dz11             = _fjsp_sub_v2r8(iz1,jz1);
+             dx12             = _fjsp_sub_v2r8(ix1,jx2);
+             dy12             = _fjsp_sub_v2r8(iy1,jy2);
+             dz12             = _fjsp_sub_v2r8(iz1,jz2);
+             dx20             = _fjsp_sub_v2r8(ix2,jx0);
+             dy20             = _fjsp_sub_v2r8(iy2,jy0);
+             dz20             = _fjsp_sub_v2r8(iz2,jz0);
+             dx21             = _fjsp_sub_v2r8(ix2,jx1);
+             dy21             = _fjsp_sub_v2r8(iy2,jy1);
+             dz21             = _fjsp_sub_v2r8(iz2,jz1);
+             dx22             = _fjsp_sub_v2r8(ix2,jx2);
+             dy22             = _fjsp_sub_v2r8(iy2,jy2);
+             dz22             = _fjsp_sub_v2r8(iz2,jz2);
+             /* Calculate squared distance and things based on it */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rsq01            = gmx_fjsp_calc_rsq_v2r8(dx01,dy01,dz01);
+             rsq02            = gmx_fjsp_calc_rsq_v2r8(dx02,dy02,dz02);
+             rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+             rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+             rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+             rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+             rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+             rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+             rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+             rinv01           = gmx_fjsp_invsqrt_v2r8(rsq01);
+             rinv02           = gmx_fjsp_invsqrt_v2r8(rsq02);
+             rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+             rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+             rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+             rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+             rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+             rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+             rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+             rinvsq01         = _fjsp_mul_v2r8(rinv01,rinv01);
+             rinvsq02         = _fjsp_mul_v2r8(rinv02,rinv02);
+             rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+             rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+             rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+             rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+             rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+             rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+             fjx0             = _fjsp_setzero_v2r8();
+             fjy0             = _fjsp_setzero_v2r8();
+             fjz0             = _fjsp_setzero_v2r8();
+             fjx1             = _fjsp_setzero_v2r8();
+             fjy1             = _fjsp_setzero_v2r8();
+             fjz1             = _fjsp_setzero_v2r8();
+             fjx2             = _fjsp_setzero_v2r8();
+             fjy2             = _fjsp_setzero_v2r8();
+             fjz2             = _fjsp_setzero_v2r8();
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r00,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq00,_fjsp_sub_v2r8(rinv00,velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,rinv00),_fjsp_sub_v2r8(rinvsq00,felec));
+             /* LENNARD-JONES DISPERSION/REPULSION */
+             rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+             vvdw6            = _fjsp_mul_v2r8(c6_00,rinvsix);
+             vvdw12           = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
+             vvdw             = _fjsp_msub_v2r8( vvdw12,one_twelfth, _fjsp_mul_v2r8(vvdw6,one_sixth) );
+             fvdw             = _fjsp_mul_v2r8(_fjsp_sub_v2r8(vvdw12,vvdw6),rinvsq00);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             vvdw             = _fjsp_unpacklo_v2r8(vvdw,_fjsp_setzero_v2r8());
+             vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+             fscal            = _fjsp_add_v2r8(felec,fvdw);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             
+             fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r01              = _fjsp_mul_v2r8(rsq01,rinv01);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r01,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq01,_fjsp_sub_v2r8(rinv01,velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq01,rinv01),_fjsp_sub_v2r8(rinvsq01,felec));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx01,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy01,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz01,fscal,fiz0);
+             
+             fjx1             = _fjsp_madd_v2r8(dx01,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy01,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz01,fscal,fjz1);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r02              = _fjsp_mul_v2r8(rsq02,rinv02);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r02,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq02,_fjsp_sub_v2r8(rinv02,velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq02,rinv02),_fjsp_sub_v2r8(rinvsq02,felec));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx02,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy02,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz02,fscal,fiz0);
+             
+             fjx2             = _fjsp_madd_v2r8(dx02,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy02,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz02,fscal,fjz2);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r10,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq10,_fjsp_sub_v2r8(rinv10,velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,rinv10),_fjsp_sub_v2r8(rinvsq10,felec));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+             
+             fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r11              = _fjsp_mul_v2r8(rsq11,rinv11);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r11,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq11,_fjsp_sub_v2r8(rinv11,velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq11,rinv11),_fjsp_sub_v2r8(rinvsq11,felec));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+             
+             fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r12              = _fjsp_mul_v2r8(rsq12,rinv12);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r12,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq12,_fjsp_sub_v2r8(rinv12,velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq12,rinv12),_fjsp_sub_v2r8(rinvsq12,felec));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+             
+             fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r20,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq20,_fjsp_sub_v2r8(rinv20,velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,rinv20),_fjsp_sub_v2r8(rinvsq20,felec));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+             
+             fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r21              = _fjsp_mul_v2r8(rsq21,rinv21);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r21,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq21,_fjsp_sub_v2r8(rinv21,velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq21,rinv21),_fjsp_sub_v2r8(rinvsq21,felec));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+             
+             fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r22              = _fjsp_mul_v2r8(rsq22,rinv22);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r22,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq22,_fjsp_sub_v2r8(rinv22,velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq22,rinv22),_fjsp_sub_v2r8(rinvsq22,felec));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+             
+             fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+             gmx_fjsp_decrement_3rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+             /* Inner loop uses 408 flops */
+         }
+         /* End of innermost loop */
+         gmx_fjsp_update_iforce_3atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                               f+i_coord_offset,fshift+i_shift_offset);
+         ggid                        = gid[iidx];
+         /* Update potential energies */
+         gmx_fjsp_update_1pot_v2r8(velecsum,kernel_data->energygrp_elec+ggid);
+         gmx_fjsp_update_1pot_v2r8(vvdwsum,kernel_data->energygrp_vdw+ggid);
+         /* Increment number of inner iterations */
+         inneriter                  += j_index_end - j_index_start;
+         /* Outer loop uses 20 flops */
+     }
+     /* Increment number of outer iterations */
+     outeriter        += nri;
+     /* Update outer/inner flops */
+     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_VF,outeriter*20 + inneriter*408);
+ }
+ /*
+  * Gromacs nonbonded kernel:   nb_kernel_ElecEw_VdwLJ_GeomW3W3_F_sparc64_hpc_ace_double
+  * Electrostatics interaction: Ewald
+  * VdW interaction:            LennardJones
+  * Geometry:                   Water3-Water3
+  * Calculate force/pot:        Force
+  */
+ void
+ nb_kernel_ElecEw_VdwLJ_GeomW3W3_F_sparc64_hpc_ace_double
+                     (t_nblist * gmx_restrict                nlist,
+                      rvec * gmx_restrict                    xx,
+                      rvec * gmx_restrict                    ff,
+                      t_forcerec * gmx_restrict              fr,
+                      t_mdatoms * gmx_restrict               mdatoms,
+                      nb_kernel_data_t * gmx_restrict        kernel_data,
+                      t_nrnb * gmx_restrict                  nrnb)
+ {
+     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+      * just 0 for non-waters.
+      * Suffixes A,B refer to the two-way j loop unrolling done with double precision SIMD, i.e. the two
+      * different jnr indices whose data are put in the two positions of the SIMD register.
+      */
+     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+     int              jnrA,jnrB;
+     int              j_coord_offsetA,j_coord_offsetB;
+     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+     real             rcutoff_scalar;
+     real             *shiftvec,*fshift,*x,*f;
+     _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+     int              vdwioffset0;
+     _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+     int              vdwioffset1;
+     _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+     int              vdwioffset2;
+     _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+     int              vdwjidx0A,vdwjidx0B;
+     _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+     int              vdwjidx1A,vdwjidx1B;
+     _fjsp_v2r8       jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+     int              vdwjidx2A,vdwjidx2B;
+     _fjsp_v2r8       jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+     _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+     _fjsp_v2r8       dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
+     _fjsp_v2r8       dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
+     _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+     _fjsp_v2r8       dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+     _fjsp_v2r8       dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+     _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+     _fjsp_v2r8       dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+     _fjsp_v2r8       dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+     _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+     real             *charge;
+     int              nvdwtype;
+     _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+     int              *vdwtype;
+     real             *vdwparam;
+     _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+     _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+     _fjsp_v2r8       ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+     real             *ewtab;
+     _fjsp_v2r8       itab_tmp;
+     _fjsp_v2r8       dummy_mask,cutoff_mask;
+     _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+     _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+     union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+     x                = xx[0];
+     f                = ff[0];
+     nri              = nlist->nri;
+     iinr             = nlist->iinr;
+     jindex           = nlist->jindex;
+     jjnr             = nlist->jjnr;
+     shiftidx         = nlist->shift;
+     gid              = nlist->gid;
+     shiftvec         = fr->shift_vec[0];
+     fshift           = fr->fshift[0];
+     facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+     charge           = mdatoms->chargeA;
+     nvdwtype         = fr->ntype;
+     vdwparam         = fr->nbfp;
+     vdwtype          = mdatoms->typeA;
+     sh_ewald         = gmx_fjsp_set1_v2r8(fr->ic->sh_ewald);
+     ewtab            = fr->ic->tabq_coul_F;
+     ewtabscale       = gmx_fjsp_set1_v2r8(fr->ic->tabq_scale);
+     ewtabhalfspace   = gmx_fjsp_set1_v2r8(0.5/fr->ic->tabq_scale);
+     /* Setup water-specific parameters */
+     inr              = nlist->iinr[0];
+     iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+0]));
+     iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+     iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+     vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+     jq0              = gmx_fjsp_set1_v2r8(charge[inr+0]);
+     jq1              = gmx_fjsp_set1_v2r8(charge[inr+1]);
+     jq2              = gmx_fjsp_set1_v2r8(charge[inr+2]);
+     vdwjidx0A        = 2*vdwtype[inr+0];
+     qq00             = _fjsp_mul_v2r8(iq0,jq0);
+     c6_00            = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset0+vdwjidx0A]);
+     c12_00           = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset0+vdwjidx0A+1]);
+     qq01             = _fjsp_mul_v2r8(iq0,jq1);
+     qq02             = _fjsp_mul_v2r8(iq0,jq2);
+     qq10             = _fjsp_mul_v2r8(iq1,jq0);
+     qq11             = _fjsp_mul_v2r8(iq1,jq1);
+     qq12             = _fjsp_mul_v2r8(iq1,jq2);
+     qq20             = _fjsp_mul_v2r8(iq2,jq0);
+     qq21             = _fjsp_mul_v2r8(iq2,jq1);
+     qq22             = _fjsp_mul_v2r8(iq2,jq2);
+     /* Avoid stupid compiler warnings */
+     jnrA = jnrB = 0;
+     j_coord_offsetA = 0;
+     j_coord_offsetB = 0;
+     outeriter        = 0;
+     inneriter        = 0;
+     /* Start outer loop over neighborlists */
+     for(iidx=0; iidx<nri; iidx++)
+     {
+         /* Load shift vector for this list */
+         i_shift_offset   = DIM*shiftidx[iidx];
+         /* Load limits for loop over neighbors */
+         j_index_start    = jindex[iidx];
+         j_index_end      = jindex[iidx+1];
+         /* Get outer coordinate index */
+         inr              = iinr[iidx];
+         i_coord_offset   = DIM*inr;
+         /* Load i particle coords and add shift vector */
+         gmx_fjsp_load_shift_and_3rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                  &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+         fix0             = _fjsp_setzero_v2r8();
+         fiy0             = _fjsp_setzero_v2r8();
+         fiz0             = _fjsp_setzero_v2r8();
+         fix1             = _fjsp_setzero_v2r8();
+         fiy1             = _fjsp_setzero_v2r8();
+         fiz1             = _fjsp_setzero_v2r8();
+         fix2             = _fjsp_setzero_v2r8();
+         fiy2             = _fjsp_setzero_v2r8();
+         fiz2             = _fjsp_setzero_v2r8();
+         /* Start inner kernel loop */
+         for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+         {
+             /* Get j neighbor index, and coordinate index */
+             jnrA             = jjnr[jidx];
+             jnrB             = jjnr[jidx+1];
+             j_coord_offsetA  = DIM*jnrA;
+             j_coord_offsetB  = DIM*jnrB;
+             /* load j atom coordinates */
+             gmx_fjsp_load_3rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+             /* Calculate displacement vector */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             dx01             = _fjsp_sub_v2r8(ix0,jx1);
+             dy01             = _fjsp_sub_v2r8(iy0,jy1);
+             dz01             = _fjsp_sub_v2r8(iz0,jz1);
+             dx02             = _fjsp_sub_v2r8(ix0,jx2);
+             dy02             = _fjsp_sub_v2r8(iy0,jy2);
+             dz02             = _fjsp_sub_v2r8(iz0,jz2);
+             dx10             = _fjsp_sub_v2r8(ix1,jx0);
+             dy10             = _fjsp_sub_v2r8(iy1,jy0);
+             dz10             = _fjsp_sub_v2r8(iz1,jz0);
+             dx11             = _fjsp_sub_v2r8(ix1,jx1);
+             dy11             = _fjsp_sub_v2r8(iy1,jy1);
+             dz11             = _fjsp_sub_v2r8(iz1,jz1);
+             dx12             = _fjsp_sub_v2r8(ix1,jx2);
+             dy12             = _fjsp_sub_v2r8(iy1,jy2);
+             dz12             = _fjsp_sub_v2r8(iz1,jz2);
+             dx20             = _fjsp_sub_v2r8(ix2,jx0);
+             dy20             = _fjsp_sub_v2r8(iy2,jy0);
+             dz20             = _fjsp_sub_v2r8(iz2,jz0);
+             dx21             = _fjsp_sub_v2r8(ix2,jx1);
+             dy21             = _fjsp_sub_v2r8(iy2,jy1);
+             dz21             = _fjsp_sub_v2r8(iz2,jz1);
+             dx22             = _fjsp_sub_v2r8(ix2,jx2);
+             dy22             = _fjsp_sub_v2r8(iy2,jy2);
+             dz22             = _fjsp_sub_v2r8(iz2,jz2);
+             /* Calculate squared distance and things based on it */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rsq01            = gmx_fjsp_calc_rsq_v2r8(dx01,dy01,dz01);
+             rsq02            = gmx_fjsp_calc_rsq_v2r8(dx02,dy02,dz02);
+             rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+             rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+             rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+             rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+             rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+             rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+             rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+             rinv01           = gmx_fjsp_invsqrt_v2r8(rsq01);
+             rinv02           = gmx_fjsp_invsqrt_v2r8(rsq02);
+             rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+             rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+             rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+             rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+             rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+             rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+             rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+             rinvsq01         = _fjsp_mul_v2r8(rinv01,rinv01);
+             rinvsq02         = _fjsp_mul_v2r8(rinv02,rinv02);
+             rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+             rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+             rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+             rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+             rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+             rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+             fjx0             = _fjsp_setzero_v2r8();
+             fjy0             = _fjsp_setzero_v2r8();
+             fjz0             = _fjsp_setzero_v2r8();
+             fjx1             = _fjsp_setzero_v2r8();
+             fjy1             = _fjsp_setzero_v2r8();
+             fjz1             = _fjsp_setzero_v2r8();
+             fjx2             = _fjsp_setzero_v2r8();
+             fjy2             = _fjsp_setzero_v2r8();
+             fjz2             = _fjsp_setzero_v2r8();
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r00,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                          &ewtabF,&ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,rinv00),_fjsp_sub_v2r8(rinvsq00,felec));
+             /* LENNARD-JONES DISPERSION/REPULSION */
+             rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+             fvdw             = _fjsp_mul_v2r8(_fjsp_msub_v2r8(c12_00,rinvsix,c6_00),_fjsp_mul_v2r8(rinvsix,rinvsq00));
+             fscal            = _fjsp_add_v2r8(felec,fvdw);
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             
+             fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r01              = _fjsp_mul_v2r8(rsq01,rinv01);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r01,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                          &ewtabF,&ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq01,rinv01),_fjsp_sub_v2r8(rinvsq01,felec));
+             fscal            = felec;
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx01,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy01,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz01,fscal,fiz0);
+             
+             fjx1             = _fjsp_madd_v2r8(dx01,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy01,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz01,fscal,fjz1);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r02              = _fjsp_mul_v2r8(rsq02,rinv02);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r02,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                          &ewtabF,&ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq02,rinv02),_fjsp_sub_v2r8(rinvsq02,felec));
+             fscal            = felec;
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx02,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy02,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz02,fscal,fiz0);
+             
+             fjx2             = _fjsp_madd_v2r8(dx02,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy02,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz02,fscal,fjz2);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r10,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                          &ewtabF,&ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,rinv10),_fjsp_sub_v2r8(rinvsq10,felec));
+             fscal            = felec;
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+             
+             fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r11              = _fjsp_mul_v2r8(rsq11,rinv11);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r11,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                          &ewtabF,&ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq11,rinv11),_fjsp_sub_v2r8(rinvsq11,felec));
+             fscal            = felec;
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+             
+             fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r12              = _fjsp_mul_v2r8(rsq12,rinv12);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r12,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                          &ewtabF,&ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq12,rinv12),_fjsp_sub_v2r8(rinvsq12,felec));
+             fscal            = felec;
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+             
+             fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r20,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                          &ewtabF,&ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,rinv20),_fjsp_sub_v2r8(rinvsq20,felec));
+             fscal            = felec;
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+             
+             fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r21              = _fjsp_mul_v2r8(rsq21,rinv21);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r21,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                          &ewtabF,&ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq21,rinv21),_fjsp_sub_v2r8(rinvsq21,felec));
+             fscal            = felec;
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+             
+             fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r22              = _fjsp_mul_v2r8(rsq22,rinv22);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r22,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                          &ewtabF,&ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq22,rinv22),_fjsp_sub_v2r8(rinvsq22,felec));
+             fscal            = felec;
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+             
+             fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+             gmx_fjsp_decrement_3rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+             /* Inner loop uses 358 flops */
+         }
+         if(jidx<j_index_end)
+         {
+             jnrA             = jjnr[jidx];
+             j_coord_offsetA  = DIM*jnrA;
+             /* load j atom coordinates */
+             gmx_fjsp_load_3rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+             /* Calculate displacement vector */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             dx01             = _fjsp_sub_v2r8(ix0,jx1);
+             dy01             = _fjsp_sub_v2r8(iy0,jy1);
+             dz01             = _fjsp_sub_v2r8(iz0,jz1);
+             dx02             = _fjsp_sub_v2r8(ix0,jx2);
+             dy02             = _fjsp_sub_v2r8(iy0,jy2);
+             dz02             = _fjsp_sub_v2r8(iz0,jz2);
+             dx10             = _fjsp_sub_v2r8(ix1,jx0);
+             dy10             = _fjsp_sub_v2r8(iy1,jy0);
+             dz10             = _fjsp_sub_v2r8(iz1,jz0);
+             dx11             = _fjsp_sub_v2r8(ix1,jx1);
+             dy11             = _fjsp_sub_v2r8(iy1,jy1);
+             dz11             = _fjsp_sub_v2r8(iz1,jz1);
+             dx12             = _fjsp_sub_v2r8(ix1,jx2);
+             dy12             = _fjsp_sub_v2r8(iy1,jy2);
+             dz12             = _fjsp_sub_v2r8(iz1,jz2);
+             dx20             = _fjsp_sub_v2r8(ix2,jx0);
+             dy20             = _fjsp_sub_v2r8(iy2,jy0);
+             dz20             = _fjsp_sub_v2r8(iz2,jz0);
+             dx21             = _fjsp_sub_v2r8(ix2,jx1);
+             dy21             = _fjsp_sub_v2r8(iy2,jy1);
+             dz21             = _fjsp_sub_v2r8(iz2,jz1);
+             dx22             = _fjsp_sub_v2r8(ix2,jx2);
+             dy22             = _fjsp_sub_v2r8(iy2,jy2);
+             dz22             = _fjsp_sub_v2r8(iz2,jz2);
+             /* Calculate squared distance and things based on it */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rsq01            = gmx_fjsp_calc_rsq_v2r8(dx01,dy01,dz01);
+             rsq02            = gmx_fjsp_calc_rsq_v2r8(dx02,dy02,dz02);
+             rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+             rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+             rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+             rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+             rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+             rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+             rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+             rinv01           = gmx_fjsp_invsqrt_v2r8(rsq01);
+             rinv02           = gmx_fjsp_invsqrt_v2r8(rsq02);
+             rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+             rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+             rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+             rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+             rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+             rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+             rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+             rinvsq01         = _fjsp_mul_v2r8(rinv01,rinv01);
+             rinvsq02         = _fjsp_mul_v2r8(rinv02,rinv02);
+             rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+             rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+             rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+             rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+             rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+             rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+             fjx0             = _fjsp_setzero_v2r8();
+             fjy0             = _fjsp_setzero_v2r8();
+             fjz0             = _fjsp_setzero_v2r8();
+             fjx1             = _fjsp_setzero_v2r8();
+             fjy1             = _fjsp_setzero_v2r8();
+             fjz1             = _fjsp_setzero_v2r8();
+             fjx2             = _fjsp_setzero_v2r8();
+             fjy2             = _fjsp_setzero_v2r8();
+             fjz2             = _fjsp_setzero_v2r8();
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r00,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,rinv00),_fjsp_sub_v2r8(rinvsq00,felec));
+             /* LENNARD-JONES DISPERSION/REPULSION */
+             rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+             fvdw             = _fjsp_mul_v2r8(_fjsp_msub_v2r8(c12_00,rinvsix,c6_00),_fjsp_mul_v2r8(rinvsix,rinvsq00));
+             fscal            = _fjsp_add_v2r8(felec,fvdw);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             
+             fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r01              = _fjsp_mul_v2r8(rsq01,rinv01);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r01,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq01,rinv01),_fjsp_sub_v2r8(rinvsq01,felec));
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx01,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy01,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz01,fscal,fiz0);
+             
+             fjx1             = _fjsp_madd_v2r8(dx01,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy01,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz01,fscal,fjz1);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r02              = _fjsp_mul_v2r8(rsq02,rinv02);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r02,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq02,rinv02),_fjsp_sub_v2r8(rinvsq02,felec));
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx02,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy02,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz02,fscal,fiz0);
+             
+             fjx2             = _fjsp_madd_v2r8(dx02,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy02,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz02,fscal,fjz2);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r10,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,rinv10),_fjsp_sub_v2r8(rinvsq10,felec));
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+             
+             fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r11              = _fjsp_mul_v2r8(rsq11,rinv11);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r11,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq11,rinv11),_fjsp_sub_v2r8(rinvsq11,felec));
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+             
+             fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r12              = _fjsp_mul_v2r8(rsq12,rinv12);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r12,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq12,rinv12),_fjsp_sub_v2r8(rinvsq12,felec));
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+             
+             fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r20,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,rinv20),_fjsp_sub_v2r8(rinvsq20,felec));
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+             
+             fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r21              = _fjsp_mul_v2r8(rsq21,rinv21);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r21,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq21,rinv21),_fjsp_sub_v2r8(rinvsq21,felec));
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+             
+             fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r22              = _fjsp_mul_v2r8(rsq22,rinv22);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r22,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq22,rinv22),_fjsp_sub_v2r8(rinvsq22,felec));
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+             
+             fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+             gmx_fjsp_decrement_3rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+             /* Inner loop uses 358 flops */
+         }
+         /* End of innermost loop */
+         gmx_fjsp_update_iforce_3atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                               f+i_coord_offset,fshift+i_shift_offset);
+         /* Increment number of inner iterations */
+         inneriter                  += j_index_end - j_index_start;
+         /* Outer loop uses 18 flops */
+     }
+     /* Increment number of outer iterations */
+     outeriter        += nri;
+     /* Update outer/inner flops */
+     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_F,outeriter*18 + inneriter*358);
+ }
index 0000000000000000000000000000000000000000,c21ef40e2d4722cc96809281f154f8e63f8fdd18..c21ef40e2d4722cc96809281f154f8e63f8fdd18
mode 000000,100644..100644
--- /dev/null
@@@ -1,0 -1,1142 +1,1142 @@@
+ /*
+  * This file is part of the GROMACS molecular simulation package.
+  *
+  * Copyright (c) 2012, by the GROMACS development team, led by
+  * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+  * others, as listed in the AUTHORS file in the top-level source
+  * directory and at http://www.gromacs.org.
+  *
+  * GROMACS is free software; you can redistribute it and/or
+  * modify it under the terms of the GNU Lesser General Public License
+  * as published by the Free Software Foundation; either version 2.1
+  * of the License, or (at your option) any later version.
+  *
+  * GROMACS is distributed in the hope that it will be useful,
+  * but WITHOUT ANY WARRANTY; without even the implied warranty of
+  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  * Lesser General Public License for more details.
+  *
+  * You should have received a copy of the GNU Lesser General Public
+  * License along with GROMACS; if not, see
+  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+  *
+  * If you want to redistribute modifications to GROMACS, please
+  * consider that scientific software is very special. Version
+  * control is crucial - bugs must be traceable. We will be happy to
+  * consider code for inclusion in the official distribution, but
+  * derived work must not be called official GROMACS. Details are found
+  * in the README & COPYING files - if they are missing, get the
+  * official version at http://www.gromacs.org.
+  *
+  * To help us fund GROMACS development, we humbly ask that you cite
+  * the research papers on the package. Check out http://www.gromacs.org.
+  */
+ /*
+  * Note: this file was generated by the GROMACS sparc64_hpc_ace_double kernel generator.
+  */
+ #ifdef HAVE_CONFIG_H
+ #include <config.h>
+ #endif
+ #include <math.h>
+ #include "../nb_kernel.h"
+ #include "types/simple.h"
+ #include "vec.h"
+ #include "nrnb.h"
+ #include "kernelutil_sparc64_hpc_ace_double.h"
+ /*
+  * Gromacs nonbonded kernel:   nb_kernel_ElecEw_VdwLJ_GeomW4P1_VF_sparc64_hpc_ace_double
+  * Electrostatics interaction: Ewald
+  * VdW interaction:            LennardJones
+  * Geometry:                   Water4-Particle
+  * Calculate force/pot:        PotentialAndForce
+  */
+ void
+ nb_kernel_ElecEw_VdwLJ_GeomW4P1_VF_sparc64_hpc_ace_double
+                     (t_nblist * gmx_restrict                nlist,
+                      rvec * gmx_restrict                    xx,
+                      rvec * gmx_restrict                    ff,
+                      t_forcerec * gmx_restrict              fr,
+                      t_mdatoms * gmx_restrict               mdatoms,
+                      nb_kernel_data_t * gmx_restrict        kernel_data,
+                      t_nrnb * gmx_restrict                  nrnb)
+ {
+     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+      * just 0 for non-waters.
+      * Suffixes A,B refer to j loop unrolling done with double precision SIMD, i.e. the two different
+      * jnr indices corresponding to data put in the two positions in the SIMD register.
+      */
+     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+     int              jnrA,jnrB;
+     int              j_coord_offsetA,j_coord_offsetB;
+     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+     real             rcutoff_scalar;
+     real             *shiftvec,*fshift,*x,*f;
+     _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+     int              vdwioffset0;
+     _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+     int              vdwioffset1;
+     _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+     int              vdwioffset2;
+     _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+     int              vdwioffset3;
+     _fjsp_v2r8       ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+     int              vdwjidx0A,vdwjidx0B;
+     _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+     _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+     _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+     _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+     _fjsp_v2r8       dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30;
+     _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+     real             *charge;
+     int              nvdwtype;
+     _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+     int              *vdwtype;
+     real             *vdwparam;
+     _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+     _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+     _fjsp_v2r8       ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+     real             *ewtab;
+     _fjsp_v2r8       itab_tmp;
+     _fjsp_v2r8       dummy_mask,cutoff_mask;
+     _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+     _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+     union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+     x                = xx[0];
+     f                = ff[0];
+     nri              = nlist->nri;
+     iinr             = nlist->iinr;
+     jindex           = nlist->jindex;
+     jjnr             = nlist->jjnr;
+     shiftidx         = nlist->shift;
+     gid              = nlist->gid;
+     shiftvec         = fr->shift_vec[0];
+     fshift           = fr->fshift[0];
+     facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+     charge           = mdatoms->chargeA;
+     nvdwtype         = fr->ntype;
+     vdwparam         = fr->nbfp;
+     vdwtype          = mdatoms->typeA;
+     sh_ewald         = gmx_fjsp_set1_v2r8(fr->ic->sh_ewald);
+     ewtab            = fr->ic->tabq_coul_FDV0;
+     ewtabscale       = gmx_fjsp_set1_v2r8(fr->ic->tabq_scale);
+     ewtabhalfspace   = gmx_fjsp_set1_v2r8(0.5/fr->ic->tabq_scale);
+     /* Setup water-specific parameters */
+     inr              = nlist->iinr[0];
+     iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+     iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+     iq3              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+3]));
+     vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+     /* Avoid stupid compiler warnings */
+     jnrA = jnrB = 0;
+     j_coord_offsetA = 0;
+     j_coord_offsetB = 0;
+     outeriter        = 0;
+     inneriter        = 0;
+     /* Start outer loop over neighborlists */
+     for(iidx=0; iidx<nri; iidx++)
+     {
+         /* Load shift vector for this list */
+         i_shift_offset   = DIM*shiftidx[iidx];
+         /* Load limits for loop over neighbors */
+         j_index_start    = jindex[iidx];
+         j_index_end      = jindex[iidx+1];
+         /* Get outer coordinate index */
+         inr              = iinr[iidx];
+         i_coord_offset   = DIM*inr;
+         /* Load i particle coords and add shift vector */
+         gmx_fjsp_load_shift_and_4rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                  &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+         fix0             = _fjsp_setzero_v2r8();
+         fiy0             = _fjsp_setzero_v2r8();
+         fiz0             = _fjsp_setzero_v2r8();
+         fix1             = _fjsp_setzero_v2r8();
+         fiy1             = _fjsp_setzero_v2r8();
+         fiz1             = _fjsp_setzero_v2r8();
+         fix2             = _fjsp_setzero_v2r8();
+         fiy2             = _fjsp_setzero_v2r8();
+         fiz2             = _fjsp_setzero_v2r8();
+         fix3             = _fjsp_setzero_v2r8();
+         fiy3             = _fjsp_setzero_v2r8();
+         fiz3             = _fjsp_setzero_v2r8();
+         /* Reset potential sums */
+         velecsum         = _fjsp_setzero_v2r8();
+         vvdwsum          = _fjsp_setzero_v2r8();
+         /* Start inner kernel loop */
+         for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+         {
+             /* Get j neighbor index, and coordinate index */
+             jnrA             = jjnr[jidx];
+             jnrB             = jjnr[jidx+1];
+             j_coord_offsetA  = DIM*jnrA;
+             j_coord_offsetB  = DIM*jnrB;
+             /* load j atom coordinates */
+             gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                               &jx0,&jy0,&jz0);
+             /* Calculate displacement vector */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             dx10             = _fjsp_sub_v2r8(ix1,jx0);
+             dy10             = _fjsp_sub_v2r8(iy1,jy0);
+             dz10             = _fjsp_sub_v2r8(iz1,jz0);
+             dx20             = _fjsp_sub_v2r8(ix2,jx0);
+             dy20             = _fjsp_sub_v2r8(iy2,jy0);
+             dz20             = _fjsp_sub_v2r8(iz2,jz0);
+             dx30             = _fjsp_sub_v2r8(ix3,jx0);
+             dy30             = _fjsp_sub_v2r8(iy3,jy0);
+             dz30             = _fjsp_sub_v2r8(iz3,jz0);
+             /* Calculate squared distance and things based on it */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+             rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+             rsq30            = gmx_fjsp_calc_rsq_v2r8(dx30,dy30,dz30);
+             rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+             rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+             rinv30           = gmx_fjsp_invsqrt_v2r8(rsq30);
+             rinvsq00         = gmx_fjsp_inv_v2r8(rsq00);
+             rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+             rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+             rinvsq30         = _fjsp_mul_v2r8(rinv30,rinv30);
+             /* Load parameters for j particles */
+             jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+             vdwjidx0A        = 2*vdwtype[jnrA+0];
+             vdwjidx0B        = 2*vdwtype[jnrB+0];
+             fjx0             = _fjsp_setzero_v2r8();
+             fjy0             = _fjsp_setzero_v2r8();
+             fjz0             = _fjsp_setzero_v2r8();
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* Compute parameters for interactions between i and j atoms */
+             gmx_fjsp_load_2pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,
+                                          vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
+             /* LENNARD-JONES DISPERSION/REPULSION */
+             rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+             vvdw6            = _fjsp_mul_v2r8(c6_00,rinvsix);
+             vvdw12           = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
+             vvdw             = _fjsp_msub_v2r8( vvdw12,one_twelfth, _fjsp_mul_v2r8(vvdw6,one_sixth) );
+             fvdw             = _fjsp_mul_v2r8(_fjsp_sub_v2r8(vvdw12,vvdw6),rinvsq00);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+             fscal            = fvdw;
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             
+             fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+             /* Compute parameters for interactions between i and j atoms */
+             qq10             = _fjsp_mul_v2r8(iq1,jq0);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r10,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq10,_fjsp_sub_v2r8(rinv10,velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,rinv10),_fjsp_sub_v2r8(rinvsq10,felec));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+             
+             fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+             /* Compute parameters for interactions between i and j atoms */
+             qq20             = _fjsp_mul_v2r8(iq2,jq0);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r20,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq20,_fjsp_sub_v2r8(rinv20,velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,rinv20),_fjsp_sub_v2r8(rinvsq20,felec));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+             
+             fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r30              = _fjsp_mul_v2r8(rsq30,rinv30);
+             /* Compute parameters for interactions between i and j atoms */
+             qq30             = _fjsp_mul_v2r8(iq3,jq0);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r30,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq30,_fjsp_sub_v2r8(rinv30,velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq30,rinv30),_fjsp_sub_v2r8(rinvsq30,felec));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             /* Update vectorial force */
+             fix3             = _fjsp_madd_v2r8(dx30,fscal,fix3);
+             fiy3             = _fjsp_madd_v2r8(dy30,fscal,fiy3);
+             fiz3             = _fjsp_madd_v2r8(dz30,fscal,fiz3);
+             
+             fjx0             = _fjsp_madd_v2r8(dx30,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy30,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz30,fscal,fjz0);
+             gmx_fjsp_decrement_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0);
+             /* Inner loop uses 170 flops */
+         }
+         if(jidx<j_index_end)
+         {
+             jnrA             = jjnr[jidx];
+             j_coord_offsetA  = DIM*jnrA;
+             /* load j atom coordinates */
+             gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                               &jx0,&jy0,&jz0);
+             /* Calculate displacement vector */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             dx10             = _fjsp_sub_v2r8(ix1,jx0);
+             dy10             = _fjsp_sub_v2r8(iy1,jy0);
+             dz10             = _fjsp_sub_v2r8(iz1,jz0);
+             dx20             = _fjsp_sub_v2r8(ix2,jx0);
+             dy20             = _fjsp_sub_v2r8(iy2,jy0);
+             dz20             = _fjsp_sub_v2r8(iz2,jz0);
+             dx30             = _fjsp_sub_v2r8(ix3,jx0);
+             dy30             = _fjsp_sub_v2r8(iy3,jy0);
+             dz30             = _fjsp_sub_v2r8(iz3,jz0);
+             /* Calculate squared distance and things based on it */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+             rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+             rsq30            = gmx_fjsp_calc_rsq_v2r8(dx30,dy30,dz30);
+             rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+             rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+             rinv30           = gmx_fjsp_invsqrt_v2r8(rsq30);
+             rinvsq00         = gmx_fjsp_inv_v2r8(rsq00);
+             rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+             rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+             rinvsq30         = _fjsp_mul_v2r8(rinv30,rinv30);
+             /* Load parameters for j particles */
+             jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
+             vdwjidx0A        = 2*vdwtype[jnrA+0];
+             fjx0             = _fjsp_setzero_v2r8();
+             fjy0             = _fjsp_setzero_v2r8();
+             fjz0             = _fjsp_setzero_v2r8();
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* Compute parameters for interactions between i and j atoms */
+             gmx_fjsp_load_1pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
+             /* LENNARD-JONES DISPERSION/REPULSION */
+             rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+             vvdw6            = _fjsp_mul_v2r8(c6_00,rinvsix);
+             vvdw12           = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
+             vvdw             = _fjsp_msub_v2r8( vvdw12,one_twelfth, _fjsp_mul_v2r8(vvdw6,one_sixth) );
+             fvdw             = _fjsp_mul_v2r8(_fjsp_sub_v2r8(vvdw12,vvdw6),rinvsq00);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             vvdw             = _fjsp_unpacklo_v2r8(vvdw,_fjsp_setzero_v2r8());
+             vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+             fscal            = fvdw;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             
+             fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+             /* Compute parameters for interactions between i and j atoms */
+             qq10             = _fjsp_mul_v2r8(iq1,jq0);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r10,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq10,_fjsp_sub_v2r8(rinv10,velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,rinv10),_fjsp_sub_v2r8(rinvsq10,felec));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+             
+             fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+             /* Compute parameters for interactions between i and j atoms */
+             qq20             = _fjsp_mul_v2r8(iq2,jq0);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r20,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq20,_fjsp_sub_v2r8(rinv20,velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,rinv20),_fjsp_sub_v2r8(rinvsq20,felec));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+             
+             fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r30              = _fjsp_mul_v2r8(rsq30,rinv30);
+             /* Compute parameters for interactions between i and j atoms */
+             qq30             = _fjsp_mul_v2r8(iq3,jq0);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r30,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq30,_fjsp_sub_v2r8(rinv30,velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq30,rinv30),_fjsp_sub_v2r8(rinvsq30,felec));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix3             = _fjsp_madd_v2r8(dx30,fscal,fix3);
+             fiy3             = _fjsp_madd_v2r8(dy30,fscal,fiy3);
+             fiz3             = _fjsp_madd_v2r8(dz30,fscal,fiz3);
+             
+             fjx0             = _fjsp_madd_v2r8(dx30,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy30,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz30,fscal,fjz0);
+             gmx_fjsp_decrement_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0);
+             /* Inner loop uses 170 flops */
+         }
+         /* End of innermost loop */
+         gmx_fjsp_update_iforce_4atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                               f+i_coord_offset,fshift+i_shift_offset);
+         ggid                        = gid[iidx];
+         /* Update potential energies */
+         gmx_fjsp_update_1pot_v2r8(velecsum,kernel_data->energygrp_elec+ggid);
+         gmx_fjsp_update_1pot_v2r8(vvdwsum,kernel_data->energygrp_vdw+ggid);
+         /* Increment number of inner iterations */
+         inneriter                  += j_index_end - j_index_start;
+         /* Outer loop uses 26 flops */
+     }
+     /* Increment number of outer iterations */
+     outeriter        += nri;
+     /* Update outer/inner flops */
+     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_VF,outeriter*26 + inneriter*170);
+ }
+ /*
+  * Gromacs nonbonded kernel:   nb_kernel_ElecEw_VdwLJ_GeomW4P1_F_sparc64_hpc_ace_double
+  * Electrostatics interaction: Ewald
+  * VdW interaction:            LennardJones
+  * Geometry:                   Water4-Particle
+  * Calculate force/pot:        Force
+  */
+ void
+ nb_kernel_ElecEw_VdwLJ_GeomW4P1_F_sparc64_hpc_ace_double
+                     (t_nblist * gmx_restrict                nlist,
+                      rvec * gmx_restrict                    xx,
+                      rvec * gmx_restrict                    ff,
+                      t_forcerec * gmx_restrict              fr,
+                      t_mdatoms * gmx_restrict               mdatoms,
+                      nb_kernel_data_t * gmx_restrict        kernel_data,
+                      t_nrnb * gmx_restrict                  nrnb)
+ {
+     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+      * just 0 for non-waters.
+      * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+      * jnr indices corresponding to data put in the two positions in the SIMD register.
+      */
+     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+     int              jnrA,jnrB;
+     int              j_coord_offsetA,j_coord_offsetB;
+     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+     real             rcutoff_scalar;
+     real             *shiftvec,*fshift,*x,*f;
+     _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+     int              vdwioffset0;
+     _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+     int              vdwioffset1;
+     _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+     int              vdwioffset2;
+     _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+     int              vdwioffset3;
+     _fjsp_v2r8       ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+     int              vdwjidx0A,vdwjidx0B;
+     _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+     _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+     _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+     _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+     _fjsp_v2r8       dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30;
+     _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+     real             *charge;
+     int              nvdwtype;
+     _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+     int              *vdwtype;
+     real             *vdwparam;
+     _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+     _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+     _fjsp_v2r8       ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+     real             *ewtab;
+     _fjsp_v2r8       itab_tmp;
+     _fjsp_v2r8       dummy_mask,cutoff_mask;
+     _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+     _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+     union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+     x                = xx[0];
+     f                = ff[0];
+     nri              = nlist->nri;
+     iinr             = nlist->iinr;
+     jindex           = nlist->jindex;
+     jjnr             = nlist->jjnr;
+     shiftidx         = nlist->shift;
+     gid              = nlist->gid;
+     shiftvec         = fr->shift_vec[0];
+     fshift           = fr->fshift[0];
+     facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+     charge           = mdatoms->chargeA;
+     nvdwtype         = fr->ntype;
+     vdwparam         = fr->nbfp;
+     vdwtype          = mdatoms->typeA;
+     sh_ewald         = gmx_fjsp_set1_v2r8(fr->ic->sh_ewald);
+     ewtab            = fr->ic->tabq_coul_F;
+     ewtabscale       = gmx_fjsp_set1_v2r8(fr->ic->tabq_scale);
+     ewtabhalfspace   = gmx_fjsp_set1_v2r8(0.5/fr->ic->tabq_scale);
+     /* Setup water-specific parameters */
+     inr              = nlist->iinr[0];
+     iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+     iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+     iq3              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+3]));
+     vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+     /* Avoid stupid compiler warnings */
+     jnrA = jnrB = 0;
+     j_coord_offsetA = 0;
+     j_coord_offsetB = 0;
+     outeriter        = 0;
+     inneriter        = 0;
+     /* Start outer loop over neighborlists */
+     for(iidx=0; iidx<nri; iidx++)
+     {
+         /* Load shift vector for this list */
+         i_shift_offset   = DIM*shiftidx[iidx];
+         /* Load limits for loop over neighbors */
+         j_index_start    = jindex[iidx];
+         j_index_end      = jindex[iidx+1];
+         /* Get outer coordinate index */
+         inr              = iinr[iidx];
+         i_coord_offset   = DIM*inr;
+         /* Load i particle coords and add shift vector */
+         gmx_fjsp_load_shift_and_4rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                  &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+         fix0             = _fjsp_setzero_v2r8();
+         fiy0             = _fjsp_setzero_v2r8();
+         fiz0             = _fjsp_setzero_v2r8();
+         fix1             = _fjsp_setzero_v2r8();
+         fiy1             = _fjsp_setzero_v2r8();
+         fiz1             = _fjsp_setzero_v2r8();
+         fix2             = _fjsp_setzero_v2r8();
+         fiy2             = _fjsp_setzero_v2r8();
+         fiz2             = _fjsp_setzero_v2r8();
+         fix3             = _fjsp_setzero_v2r8();
+         fiy3             = _fjsp_setzero_v2r8();
+         fiz3             = _fjsp_setzero_v2r8();
+         /* Start inner kernel loop */
+         for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+         {
+             /* Get j neighbor index, and coordinate index */
+             jnrA             = jjnr[jidx];
+             jnrB             = jjnr[jidx+1];
+             j_coord_offsetA  = DIM*jnrA;
+             j_coord_offsetB  = DIM*jnrB;
+             /* load j atom coordinates */
+             gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                               &jx0,&jy0,&jz0);
+             /* Calculate displacement vector */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             dx10             = _fjsp_sub_v2r8(ix1,jx0);
+             dy10             = _fjsp_sub_v2r8(iy1,jy0);
+             dz10             = _fjsp_sub_v2r8(iz1,jz0);
+             dx20             = _fjsp_sub_v2r8(ix2,jx0);
+             dy20             = _fjsp_sub_v2r8(iy2,jy0);
+             dz20             = _fjsp_sub_v2r8(iz2,jz0);
+             dx30             = _fjsp_sub_v2r8(ix3,jx0);
+             dy30             = _fjsp_sub_v2r8(iy3,jy0);
+             dz30             = _fjsp_sub_v2r8(iz3,jz0);
+             /* Calculate squared distance and things based on it */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+             rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+             rsq30            = gmx_fjsp_calc_rsq_v2r8(dx30,dy30,dz30);
+             rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+             rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+             rinv30           = gmx_fjsp_invsqrt_v2r8(rsq30);
+             rinvsq00         = gmx_fjsp_inv_v2r8(rsq00);
+             rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+             rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+             rinvsq30         = _fjsp_mul_v2r8(rinv30,rinv30);
+             /* Load parameters for j particles */
+             jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+             vdwjidx0A        = 2*vdwtype[jnrA+0];
+             vdwjidx0B        = 2*vdwtype[jnrB+0];
+             fjx0             = _fjsp_setzero_v2r8();
+             fjy0             = _fjsp_setzero_v2r8();
+             fjz0             = _fjsp_setzero_v2r8();
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* Compute parameters for interactions between i and j atoms */
+             gmx_fjsp_load_2pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,
+                                          vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
+             /* LENNARD-JONES DISPERSION/REPULSION */
+             rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+             fvdw             = _fjsp_mul_v2r8(_fjsp_msub_v2r8(c12_00,rinvsix,c6_00),_fjsp_mul_v2r8(rinvsix,rinvsq00));
+             fscal            = fvdw;
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             
+             fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+             /* Compute parameters for interactions between i and j atoms */
+             qq10             = _fjsp_mul_v2r8(iq1,jq0);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r10,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                          &ewtabF,&ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,rinv10),_fjsp_sub_v2r8(rinvsq10,felec));
+             fscal            = felec;
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+             
+             fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+             /* Compute parameters for interactions between i and j atoms */
+             qq20             = _fjsp_mul_v2r8(iq2,jq0);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r20,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                          &ewtabF,&ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,rinv20),_fjsp_sub_v2r8(rinvsq20,felec));
+             fscal            = felec;
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+             
+             fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r30              = _fjsp_mul_v2r8(rsq30,rinv30);
+             /* Compute parameters for interactions between i and j atoms */
+             qq30             = _fjsp_mul_v2r8(iq3,jq0);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r30,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                          &ewtabF,&ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq30,rinv30),_fjsp_sub_v2r8(rinvsq30,felec));
+             fscal            = felec;
+             /* Update vectorial force */
+             fix3             = _fjsp_madd_v2r8(dx30,fscal,fix3);
+             fiy3             = _fjsp_madd_v2r8(dy30,fscal,fiy3);
+             fiz3             = _fjsp_madd_v2r8(dz30,fscal,fiz3);
+             
+             fjx0             = _fjsp_madd_v2r8(dx30,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy30,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz30,fscal,fjz0);
+             gmx_fjsp_decrement_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0);
+             /* Inner loop uses 150 flops */
+         }
+         if(jidx<j_index_end)
+         {
+             jnrA             = jjnr[jidx];
+             j_coord_offsetA  = DIM*jnrA;
+             /* load j atom coordinates */
+             gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                               &jx0,&jy0,&jz0);
+             /* Calculate displacement vector */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             dx10             = _fjsp_sub_v2r8(ix1,jx0);
+             dy10             = _fjsp_sub_v2r8(iy1,jy0);
+             dz10             = _fjsp_sub_v2r8(iz1,jz0);
+             dx20             = _fjsp_sub_v2r8(ix2,jx0);
+             dy20             = _fjsp_sub_v2r8(iy2,jy0);
+             dz20             = _fjsp_sub_v2r8(iz2,jz0);
+             dx30             = _fjsp_sub_v2r8(ix3,jx0);
+             dy30             = _fjsp_sub_v2r8(iy3,jy0);
+             dz30             = _fjsp_sub_v2r8(iz3,jz0);
+             /* Calculate squared distance and things based on it */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+             rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+             rsq30            = gmx_fjsp_calc_rsq_v2r8(dx30,dy30,dz30);
+             rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+             rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+             rinv30           = gmx_fjsp_invsqrt_v2r8(rsq30);
+             rinvsq00         = gmx_fjsp_inv_v2r8(rsq00);
+             rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+             rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+             rinvsq30         = _fjsp_mul_v2r8(rinv30,rinv30);
+             /* Load parameters for j particles */
+             jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
+             vdwjidx0A        = 2*vdwtype[jnrA+0];
+             fjx0             = _fjsp_setzero_v2r8();
+             fjy0             = _fjsp_setzero_v2r8();
+             fjz0             = _fjsp_setzero_v2r8();
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* Compute parameters for interactions between i and j atoms */
+             gmx_fjsp_load_1pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
+             /* LENNARD-JONES DISPERSION/REPULSION */
+             rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+             fvdw             = _fjsp_mul_v2r8(_fjsp_msub_v2r8(c12_00,rinvsix,c6_00),_fjsp_mul_v2r8(rinvsix,rinvsq00));
+             fscal            = fvdw;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             
+             fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+             /* Compute parameters for interactions between i and j atoms */
+             qq10             = _fjsp_mul_v2r8(iq1,jq0);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r10,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,rinv10),_fjsp_sub_v2r8(rinvsq10,felec));
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+             
+             fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+             /* Compute parameters for interactions between i and j atoms */
+             qq20             = _fjsp_mul_v2r8(iq2,jq0);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r20,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,rinv20),_fjsp_sub_v2r8(rinvsq20,felec));
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+             
+             fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r30              = _fjsp_mul_v2r8(rsq30,rinv30);
+             /* Compute parameters for interactions between i and j atoms */
+             qq30             = _fjsp_mul_v2r8(iq3,jq0);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r30,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq30,rinv30),_fjsp_sub_v2r8(rinvsq30,felec));
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix3             = _fjsp_madd_v2r8(dx30,fscal,fix3);
+             fiy3             = _fjsp_madd_v2r8(dy30,fscal,fiy3);
+             fiz3             = _fjsp_madd_v2r8(dz30,fscal,fiz3);
+             
+             fjx0             = _fjsp_madd_v2r8(dx30,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy30,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz30,fscal,fjz0);
+             gmx_fjsp_decrement_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0);
+             /* Inner loop uses 150 flops */
+         }
+         /* End of innermost loop */
+         gmx_fjsp_update_iforce_4atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                               f+i_coord_offset,fshift+i_shift_offset);
+         /* Increment number of inner iterations */
+         inneriter                  += j_index_end - j_index_start;
+         /* Outer loop uses 24 flops */
+     }
+     /* Increment number of outer iterations */
+     outeriter        += nri;
+     /* Update outer/inner flops */
+     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_F,outeriter*24 + inneriter*150);
+ }
index 0000000000000000000000000000000000000000,faa0f8ee336c354c7efe1a16e4effcae8868f4e1..faa0f8ee336c354c7efe1a16e4effcae8868f4e1
mode 000000,100644..100644
--- /dev/null
@@@ -1,0 -1,2166 +1,2166 @@@
+ /*
+  * This file is part of the GROMACS molecular simulation package.
+  *
+  * Copyright (c) 2012, by the GROMACS development team, led by
+  * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+  * others, as listed in the AUTHORS file in the top-level source
+  * directory and at http://www.gromacs.org.
+  *
+  * GROMACS is free software; you can redistribute it and/or
+  * modify it under the terms of the GNU Lesser General Public License
+  * as published by the Free Software Foundation; either version 2.1
+  * of the License, or (at your option) any later version.
+  *
+  * GROMACS is distributed in the hope that it will be useful,
+  * but WITHOUT ANY WARRANTY; without even the implied warranty of
+  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  * Lesser General Public License for more details.
+  *
+  * You should have received a copy of the GNU Lesser General Public
+  * License along with GROMACS; if not, see
+  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+  *
+  * If you want to redistribute modifications to GROMACS, please
+  * consider that scientific software is very special. Version
+  * control is crucial - bugs must be traceable. We will be happy to
+  * consider code for inclusion in the official distribution, but
+  * derived work must not be called official GROMACS. Details are found
+  * in the README & COPYING files - if they are missing, get the
+  * official version at http://www.gromacs.org.
+  *
+  * To help us fund GROMACS development, we humbly ask that you cite
+  * the research papers on the package. Check out http://www.gromacs.org.
+  */
+ /*
+  * Note: this file was generated by the GROMACS sparc64_hpc_ace_double kernel generator.
+  */
+ #ifdef HAVE_CONFIG_H
+ #include <config.h>
+ #endif
+ #include <math.h>
+ #include "../nb_kernel.h"
+ #include "types/simple.h"
+ #include "vec.h"
+ #include "nrnb.h"
+ #include "kernelutil_sparc64_hpc_ace_double.h"
+ /*
+  * Gromacs nonbonded kernel:   nb_kernel_ElecEw_VdwLJ_GeomW4W4_VF_sparc64_hpc_ace_double
+  * Electrostatics interaction: Ewald
+  * VdW interaction:            LennardJones
+  * Geometry:                   Water4-Water4
+  * Calculate force/pot:        PotentialAndForce
+  */
+ void
+ nb_kernel_ElecEw_VdwLJ_GeomW4W4_VF_sparc64_hpc_ace_double
+                     (t_nblist * gmx_restrict                nlist,
+                      rvec * gmx_restrict                    xx,
+                      rvec * gmx_restrict                    ff,
+                      t_forcerec * gmx_restrict              fr,
+                      t_mdatoms * gmx_restrict               mdatoms,
+                      nb_kernel_data_t * gmx_restrict        kernel_data,
+                      t_nrnb * gmx_restrict                  nrnb)
+ {
+     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+      * just 0 for non-waters.
+      * Suffixes A,B refer to j loop unrolling done with double precision SIMD, i.e. the two different
+      * jnr indices corresponding to data put in the two positions in the SIMD register.
+      */
+     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+     int              jnrA,jnrB;
+     int              j_coord_offsetA,j_coord_offsetB;
+     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+     real             rcutoff_scalar;
+     real             *shiftvec,*fshift,*x,*f;
+     _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+     int              vdwioffset0;
+     _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+     int              vdwioffset1;
+     _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+     int              vdwioffset2;
+     _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+     int              vdwioffset3;
+     _fjsp_v2r8       ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+     int              vdwjidx0A,vdwjidx0B;
+     _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+     int              vdwjidx1A,vdwjidx1B;
+     _fjsp_v2r8       jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+     int              vdwjidx2A,vdwjidx2B;
+     _fjsp_v2r8       jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+     int              vdwjidx3A,vdwjidx3B;
+     _fjsp_v2r8       jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
+     _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+     _fjsp_v2r8       dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+     _fjsp_v2r8       dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+     _fjsp_v2r8       dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
+     _fjsp_v2r8       dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+     _fjsp_v2r8       dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+     _fjsp_v2r8       dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
+     _fjsp_v2r8       dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
+     _fjsp_v2r8       dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
+     _fjsp_v2r8       dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
+     _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+     real             *charge;
+     int              nvdwtype;
+     _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+     int              *vdwtype;
+     real             *vdwparam;
+     _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+     _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+     _fjsp_v2r8       ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+     real             *ewtab;
+     _fjsp_v2r8       itab_tmp;
+     _fjsp_v2r8       dummy_mask,cutoff_mask;
+     _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+     _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+     union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+     x                = xx[0];
+     f                = ff[0];
+     nri              = nlist->nri;
+     iinr             = nlist->iinr;
+     jindex           = nlist->jindex;
+     jjnr             = nlist->jjnr;
+     shiftidx         = nlist->shift;
+     gid              = nlist->gid;
+     shiftvec         = fr->shift_vec[0];
+     fshift           = fr->fshift[0];
+     facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+     charge           = mdatoms->chargeA;
+     nvdwtype         = fr->ntype;
+     vdwparam         = fr->nbfp;
+     vdwtype          = mdatoms->typeA;
+     sh_ewald         = gmx_fjsp_set1_v2r8(fr->ic->sh_ewald);
+     ewtab            = fr->ic->tabq_coul_FDV0;
+     ewtabscale       = gmx_fjsp_set1_v2r8(fr->ic->tabq_scale);
+     ewtabhalfspace   = gmx_fjsp_set1_v2r8(0.5/fr->ic->tabq_scale);
+     /* Setup water-specific parameters */
+     inr              = nlist->iinr[0];
+     iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+     iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+     iq3              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+3]));
+     vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+     jq1              = gmx_fjsp_set1_v2r8(charge[inr+1]);
+     jq2              = gmx_fjsp_set1_v2r8(charge[inr+2]);
+     jq3              = gmx_fjsp_set1_v2r8(charge[inr+3]);
+     vdwjidx0A        = 2*vdwtype[inr+0];
+     c6_00            = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset0+vdwjidx0A]);
+     c12_00           = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset0+vdwjidx0A+1]);
+     qq11             = _fjsp_mul_v2r8(iq1,jq1);
+     qq12             = _fjsp_mul_v2r8(iq1,jq2);
+     qq13             = _fjsp_mul_v2r8(iq1,jq3);
+     qq21             = _fjsp_mul_v2r8(iq2,jq1);
+     qq22             = _fjsp_mul_v2r8(iq2,jq2);
+     qq23             = _fjsp_mul_v2r8(iq2,jq3);
+     qq31             = _fjsp_mul_v2r8(iq3,jq1);
+     qq32             = _fjsp_mul_v2r8(iq3,jq2);
+     qq33             = _fjsp_mul_v2r8(iq3,jq3);
+     /* Avoid stupid compiler warnings */
+     jnrA = jnrB = 0;
+     j_coord_offsetA = 0;
+     j_coord_offsetB = 0;
+     outeriter        = 0;
+     inneriter        = 0;
+     /* Start outer loop over neighborlists */
+     for(iidx=0; iidx<nri; iidx++)
+     {
+         /* Load shift vector for this list */
+         i_shift_offset   = DIM*shiftidx[iidx];
+         /* Load limits for loop over neighbors */
+         j_index_start    = jindex[iidx];
+         j_index_end      = jindex[iidx+1];
+         /* Get outer coordinate index */
+         inr              = iinr[iidx];
+         i_coord_offset   = DIM*inr;
+         /* Load i particle coords and add shift vector */
+         gmx_fjsp_load_shift_and_4rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                  &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+         fix0             = _fjsp_setzero_v2r8();
+         fiy0             = _fjsp_setzero_v2r8();
+         fiz0             = _fjsp_setzero_v2r8();
+         fix1             = _fjsp_setzero_v2r8();
+         fiy1             = _fjsp_setzero_v2r8();
+         fiz1             = _fjsp_setzero_v2r8();
+         fix2             = _fjsp_setzero_v2r8();
+         fiy2             = _fjsp_setzero_v2r8();
+         fiz2             = _fjsp_setzero_v2r8();
+         fix3             = _fjsp_setzero_v2r8();
+         fiy3             = _fjsp_setzero_v2r8();
+         fiz3             = _fjsp_setzero_v2r8();
+         /* Reset potential sums */
+         velecsum         = _fjsp_setzero_v2r8();
+         vvdwsum          = _fjsp_setzero_v2r8();
+         /* Start inner kernel loop */
+         for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+         {
+             /* Get j neighbor index, and coordinate index */
+             jnrA             = jjnr[jidx];
+             jnrB             = jjnr[jidx+1];
+             j_coord_offsetA  = DIM*jnrA;
+             j_coord_offsetB  = DIM*jnrB;
+             /* load j atom coordinates */
+             gmx_fjsp_load_4rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
+                                               &jy2,&jz2,&jx3,&jy3,&jz3);
+             /* Calculate displacement vector */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             dx11             = _fjsp_sub_v2r8(ix1,jx1);
+             dy11             = _fjsp_sub_v2r8(iy1,jy1);
+             dz11             = _fjsp_sub_v2r8(iz1,jz1);
+             dx12             = _fjsp_sub_v2r8(ix1,jx2);
+             dy12             = _fjsp_sub_v2r8(iy1,jy2);
+             dz12             = _fjsp_sub_v2r8(iz1,jz2);
+             dx13             = _fjsp_sub_v2r8(ix1,jx3);
+             dy13             = _fjsp_sub_v2r8(iy1,jy3);
+             dz13             = _fjsp_sub_v2r8(iz1,jz3);
+             dx21             = _fjsp_sub_v2r8(ix2,jx1);
+             dy21             = _fjsp_sub_v2r8(iy2,jy1);
+             dz21             = _fjsp_sub_v2r8(iz2,jz1);
+             dx22             = _fjsp_sub_v2r8(ix2,jx2);
+             dy22             = _fjsp_sub_v2r8(iy2,jy2);
+             dz22             = _fjsp_sub_v2r8(iz2,jz2);
+             dx23             = _fjsp_sub_v2r8(ix2,jx3);
+             dy23             = _fjsp_sub_v2r8(iy2,jy3);
+             dz23             = _fjsp_sub_v2r8(iz2,jz3);
+             dx31             = _fjsp_sub_v2r8(ix3,jx1);
+             dy31             = _fjsp_sub_v2r8(iy3,jy1);
+             dz31             = _fjsp_sub_v2r8(iz3,jz1);
+             dx32             = _fjsp_sub_v2r8(ix3,jx2);
+             dy32             = _fjsp_sub_v2r8(iy3,jy2);
+             dz32             = _fjsp_sub_v2r8(iz3,jz2);
+             dx33             = _fjsp_sub_v2r8(ix3,jx3);
+             dy33             = _fjsp_sub_v2r8(iy3,jy3);
+             dz33             = _fjsp_sub_v2r8(iz3,jz3);
+             /* Calculate squared distance and things based on it */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+             rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+             rsq13            = gmx_fjsp_calc_rsq_v2r8(dx13,dy13,dz13);
+             rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+             rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+             rsq23            = gmx_fjsp_calc_rsq_v2r8(dx23,dy23,dz23);
+             rsq31            = gmx_fjsp_calc_rsq_v2r8(dx31,dy31,dz31);
+             rsq32            = gmx_fjsp_calc_rsq_v2r8(dx32,dy32,dz32);
+             rsq33            = gmx_fjsp_calc_rsq_v2r8(dx33,dy33,dz33);
+             rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+             rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+             rinv13           = gmx_fjsp_invsqrt_v2r8(rsq13);
+             rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+             rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+             rinv23           = gmx_fjsp_invsqrt_v2r8(rsq23);
+             rinv31           = gmx_fjsp_invsqrt_v2r8(rsq31);
+             rinv32           = gmx_fjsp_invsqrt_v2r8(rsq32);
+             rinv33           = gmx_fjsp_invsqrt_v2r8(rsq33);
+             rinvsq00         = gmx_fjsp_inv_v2r8(rsq00);
+             rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+             rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+             rinvsq13         = _fjsp_mul_v2r8(rinv13,rinv13);
+             rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+             rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+             rinvsq23         = _fjsp_mul_v2r8(rinv23,rinv23);
+             rinvsq31         = _fjsp_mul_v2r8(rinv31,rinv31);
+             rinvsq32         = _fjsp_mul_v2r8(rinv32,rinv32);
+             rinvsq33         = _fjsp_mul_v2r8(rinv33,rinv33);
+             fjx0             = _fjsp_setzero_v2r8();
+             fjy0             = _fjsp_setzero_v2r8();
+             fjz0             = _fjsp_setzero_v2r8();
+             fjx1             = _fjsp_setzero_v2r8();
+             fjy1             = _fjsp_setzero_v2r8();
+             fjz1             = _fjsp_setzero_v2r8();
+             fjx2             = _fjsp_setzero_v2r8();
+             fjy2             = _fjsp_setzero_v2r8();
+             fjz2             = _fjsp_setzero_v2r8();
+             fjx3             = _fjsp_setzero_v2r8();
+             fjy3             = _fjsp_setzero_v2r8();
+             fjz3             = _fjsp_setzero_v2r8();
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* LENNARD-JONES DISPERSION/REPULSION */
+             rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+             vvdw6            = _fjsp_mul_v2r8(c6_00,rinvsix);
+             vvdw12           = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
+             vvdw             = _fjsp_msub_v2r8( vvdw12,one_twelfth, _fjsp_mul_v2r8(vvdw6,one_sixth) );
+             fvdw             = _fjsp_mul_v2r8(_fjsp_sub_v2r8(vvdw12,vvdw6),rinvsq00);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+             fscal            = fvdw;
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             
+             fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r11              = _fjsp_mul_v2r8(rsq11,rinv11);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r11,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq11,_fjsp_sub_v2r8(rinv11,velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq11,rinv11),_fjsp_sub_v2r8(rinvsq11,felec));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+             
+             fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r12              = _fjsp_mul_v2r8(rsq12,rinv12);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r12,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq12,_fjsp_sub_v2r8(rinv12,velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq12,rinv12),_fjsp_sub_v2r8(rinvsq12,felec));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+             
+             fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r13              = _fjsp_mul_v2r8(rsq13,rinv13);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r13,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq13,_fjsp_sub_v2r8(rinv13,velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq13,rinv13),_fjsp_sub_v2r8(rinvsq13,felec));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx13,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy13,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz13,fscal,fiz1);
+             
+             fjx3             = _fjsp_madd_v2r8(dx13,fscal,fjx3);
+             fjy3             = _fjsp_madd_v2r8(dy13,fscal,fjy3);
+             fjz3             = _fjsp_madd_v2r8(dz13,fscal,fjz3);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r21              = _fjsp_mul_v2r8(rsq21,rinv21);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r21,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq21,_fjsp_sub_v2r8(rinv21,velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq21,rinv21),_fjsp_sub_v2r8(rinvsq21,felec));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+             
+             fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r22              = _fjsp_mul_v2r8(rsq22,rinv22);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r22,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq22,_fjsp_sub_v2r8(rinv22,velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq22,rinv22),_fjsp_sub_v2r8(rinvsq22,felec));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+             
+             fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r23              = _fjsp_mul_v2r8(rsq23,rinv23);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r23,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq23,_fjsp_sub_v2r8(rinv23,velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq23,rinv23),_fjsp_sub_v2r8(rinvsq23,felec));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx23,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy23,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz23,fscal,fiz2);
+             
+             fjx3             = _fjsp_madd_v2r8(dx23,fscal,fjx3);
+             fjy3             = _fjsp_madd_v2r8(dy23,fscal,fjy3);
+             fjz3             = _fjsp_madd_v2r8(dz23,fscal,fjz3);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r31              = _fjsp_mul_v2r8(rsq31,rinv31);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r31,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq31,_fjsp_sub_v2r8(rinv31,velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq31,rinv31),_fjsp_sub_v2r8(rinvsq31,felec));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             /* Update vectorial force */
+             fix3             = _fjsp_madd_v2r8(dx31,fscal,fix3);
+             fiy3             = _fjsp_madd_v2r8(dy31,fscal,fiy3);
+             fiz3             = _fjsp_madd_v2r8(dz31,fscal,fiz3);
+             
+             fjx1             = _fjsp_madd_v2r8(dx31,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy31,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz31,fscal,fjz1);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r32              = _fjsp_mul_v2r8(rsq32,rinv32);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r32,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq32,_fjsp_sub_v2r8(rinv32,velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq32,rinv32),_fjsp_sub_v2r8(rinvsq32,felec));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             /* Update vectorial force */
+             fix3             = _fjsp_madd_v2r8(dx32,fscal,fix3);
+             fiy3             = _fjsp_madd_v2r8(dy32,fscal,fiy3);
+             fiz3             = _fjsp_madd_v2r8(dz32,fscal,fiz3);
+             
+             fjx2             = _fjsp_madd_v2r8(dx32,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy32,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz32,fscal,fjz2);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r33              = _fjsp_mul_v2r8(rsq33,rinv33);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r33,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq33,_fjsp_sub_v2r8(rinv33,velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq33,rinv33),_fjsp_sub_v2r8(rinvsq33,felec));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             /* Update vectorial force */
+             fix3             = _fjsp_madd_v2r8(dx33,fscal,fix3);
+             fiy3             = _fjsp_madd_v2r8(dy33,fscal,fiy3);
+             fiz3             = _fjsp_madd_v2r8(dz33,fscal,fiz3);
+             
+             fjx3             = _fjsp_madd_v2r8(dx33,fscal,fjx3);
+             fjy3             = _fjsp_madd_v2r8(dy33,fscal,fjy3);
+             fjz3             = _fjsp_madd_v2r8(dz33,fscal,fjz3);
+             gmx_fjsp_decrement_4rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+             /* Inner loop uses 434 flops */
+         }
+         if(jidx<j_index_end)
+         {
+             jnrA             = jjnr[jidx];
+             j_coord_offsetA  = DIM*jnrA;
+             /* load j atom coordinates */
+             gmx_fjsp_load_4rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
+                                               &jy2,&jz2,&jx3,&jy3,&jz3);
+             /* Calculate displacement vector */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             dx11             = _fjsp_sub_v2r8(ix1,jx1);
+             dy11             = _fjsp_sub_v2r8(iy1,jy1);
+             dz11             = _fjsp_sub_v2r8(iz1,jz1);
+             dx12             = _fjsp_sub_v2r8(ix1,jx2);
+             dy12             = _fjsp_sub_v2r8(iy1,jy2);
+             dz12             = _fjsp_sub_v2r8(iz1,jz2);
+             dx13             = _fjsp_sub_v2r8(ix1,jx3);
+             dy13             = _fjsp_sub_v2r8(iy1,jy3);
+             dz13             = _fjsp_sub_v2r8(iz1,jz3);
+             dx21             = _fjsp_sub_v2r8(ix2,jx1);
+             dy21             = _fjsp_sub_v2r8(iy2,jy1);
+             dz21             = _fjsp_sub_v2r8(iz2,jz1);
+             dx22             = _fjsp_sub_v2r8(ix2,jx2);
+             dy22             = _fjsp_sub_v2r8(iy2,jy2);
+             dz22             = _fjsp_sub_v2r8(iz2,jz2);
+             dx23             = _fjsp_sub_v2r8(ix2,jx3);
+             dy23             = _fjsp_sub_v2r8(iy2,jy3);
+             dz23             = _fjsp_sub_v2r8(iz2,jz3);
+             dx31             = _fjsp_sub_v2r8(ix3,jx1);
+             dy31             = _fjsp_sub_v2r8(iy3,jy1);
+             dz31             = _fjsp_sub_v2r8(iz3,jz1);
+             dx32             = _fjsp_sub_v2r8(ix3,jx2);
+             dy32             = _fjsp_sub_v2r8(iy3,jy2);
+             dz32             = _fjsp_sub_v2r8(iz3,jz2);
+             dx33             = _fjsp_sub_v2r8(ix3,jx3);
+             dy33             = _fjsp_sub_v2r8(iy3,jy3);
+             dz33             = _fjsp_sub_v2r8(iz3,jz3);
+             /* Calculate squared distance and things based on it */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+             rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+             rsq13            = gmx_fjsp_calc_rsq_v2r8(dx13,dy13,dz13);
+             rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+             rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+             rsq23            = gmx_fjsp_calc_rsq_v2r8(dx23,dy23,dz23);
+             rsq31            = gmx_fjsp_calc_rsq_v2r8(dx31,dy31,dz31);
+             rsq32            = gmx_fjsp_calc_rsq_v2r8(dx32,dy32,dz32);
+             rsq33            = gmx_fjsp_calc_rsq_v2r8(dx33,dy33,dz33);
+             rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+             rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+             rinv13           = gmx_fjsp_invsqrt_v2r8(rsq13);
+             rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+             rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+             rinv23           = gmx_fjsp_invsqrt_v2r8(rsq23);
+             rinv31           = gmx_fjsp_invsqrt_v2r8(rsq31);
+             rinv32           = gmx_fjsp_invsqrt_v2r8(rsq32);
+             rinv33           = gmx_fjsp_invsqrt_v2r8(rsq33);
+             rinvsq00         = gmx_fjsp_inv_v2r8(rsq00);
+             rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+             rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+             rinvsq13         = _fjsp_mul_v2r8(rinv13,rinv13);
+             rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+             rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+             rinvsq23         = _fjsp_mul_v2r8(rinv23,rinv23);
+             rinvsq31         = _fjsp_mul_v2r8(rinv31,rinv31);
+             rinvsq32         = _fjsp_mul_v2r8(rinv32,rinv32);
+             rinvsq33         = _fjsp_mul_v2r8(rinv33,rinv33);
+             fjx0             = _fjsp_setzero_v2r8();
+             fjy0             = _fjsp_setzero_v2r8();
+             fjz0             = _fjsp_setzero_v2r8();
+             fjx1             = _fjsp_setzero_v2r8();
+             fjy1             = _fjsp_setzero_v2r8();
+             fjz1             = _fjsp_setzero_v2r8();
+             fjx2             = _fjsp_setzero_v2r8();
+             fjy2             = _fjsp_setzero_v2r8();
+             fjz2             = _fjsp_setzero_v2r8();
+             fjx3             = _fjsp_setzero_v2r8();
+             fjy3             = _fjsp_setzero_v2r8();
+             fjz3             = _fjsp_setzero_v2r8();
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* LENNARD-JONES DISPERSION/REPULSION */
+             rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+             vvdw6            = _fjsp_mul_v2r8(c6_00,rinvsix);
+             vvdw12           = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
+             vvdw             = _fjsp_msub_v2r8( vvdw12,one_twelfth, _fjsp_mul_v2r8(vvdw6,one_sixth) );
+             fvdw             = _fjsp_mul_v2r8(_fjsp_sub_v2r8(vvdw12,vvdw6),rinvsq00);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             vvdw             = _fjsp_unpacklo_v2r8(vvdw,_fjsp_setzero_v2r8());
+             vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+             fscal            = fvdw;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             
+             fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r11              = _fjsp_mul_v2r8(rsq11,rinv11);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r11,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq11,_fjsp_sub_v2r8(rinv11,velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq11,rinv11),_fjsp_sub_v2r8(rinvsq11,felec));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+             
+             fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r12              = _fjsp_mul_v2r8(rsq12,rinv12);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r12,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq12,_fjsp_sub_v2r8(rinv12,velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq12,rinv12),_fjsp_sub_v2r8(rinvsq12,felec));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+             
+             fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r13              = _fjsp_mul_v2r8(rsq13,rinv13);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r13,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq13,_fjsp_sub_v2r8(rinv13,velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq13,rinv13),_fjsp_sub_v2r8(rinvsq13,felec));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx13,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy13,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz13,fscal,fiz1);
+             
+             fjx3             = _fjsp_madd_v2r8(dx13,fscal,fjx3);
+             fjy3             = _fjsp_madd_v2r8(dy13,fscal,fjy3);
+             fjz3             = _fjsp_madd_v2r8(dz13,fscal,fjz3);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r21              = _fjsp_mul_v2r8(rsq21,rinv21);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r21,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq21,_fjsp_sub_v2r8(rinv21,velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq21,rinv21),_fjsp_sub_v2r8(rinvsq21,felec));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+             
+             fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r22              = _fjsp_mul_v2r8(rsq22,rinv22);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r22,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq22,_fjsp_sub_v2r8(rinv22,velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq22,rinv22),_fjsp_sub_v2r8(rinvsq22,felec));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+             
+             fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r23              = _fjsp_mul_v2r8(rsq23,rinv23);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r23,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq23,_fjsp_sub_v2r8(rinv23,velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq23,rinv23),_fjsp_sub_v2r8(rinvsq23,felec));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx23,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy23,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz23,fscal,fiz2);
+             
+             fjx3             = _fjsp_madd_v2r8(dx23,fscal,fjx3);
+             fjy3             = _fjsp_madd_v2r8(dy23,fscal,fjy3);
+             fjz3             = _fjsp_madd_v2r8(dz23,fscal,fjz3);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r31              = _fjsp_mul_v2r8(rsq31,rinv31);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r31,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq31,_fjsp_sub_v2r8(rinv31,velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq31,rinv31),_fjsp_sub_v2r8(rinvsq31,felec));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix3             = _fjsp_madd_v2r8(dx31,fscal,fix3);
+             fiy3             = _fjsp_madd_v2r8(dy31,fscal,fiy3);
+             fiz3             = _fjsp_madd_v2r8(dz31,fscal,fiz3);
+             
+             fjx1             = _fjsp_madd_v2r8(dx31,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy31,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz31,fscal,fjz1);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r32              = _fjsp_mul_v2r8(rsq32,rinv32);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r32,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq32,_fjsp_sub_v2r8(rinv32,velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq32,rinv32),_fjsp_sub_v2r8(rinvsq32,felec));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix3             = _fjsp_madd_v2r8(dx32,fscal,fix3);
+             fiy3             = _fjsp_madd_v2r8(dy32,fscal,fiy3);
+             fiz3             = _fjsp_madd_v2r8(dz32,fscal,fiz3);
+             
+             fjx2             = _fjsp_madd_v2r8(dx32,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy32,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz32,fscal,fjz2);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r33              = _fjsp_mul_v2r8(rsq33,rinv33);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r33,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq33,_fjsp_sub_v2r8(rinv33,velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq33,rinv33),_fjsp_sub_v2r8(rinvsq33,felec));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix3             = _fjsp_madd_v2r8(dx33,fscal,fix3);
+             fiy3             = _fjsp_madd_v2r8(dy33,fscal,fiy3);
+             fiz3             = _fjsp_madd_v2r8(dz33,fscal,fiz3);
+             
+             fjx3             = _fjsp_madd_v2r8(dx33,fscal,fjx3);
+             fjy3             = _fjsp_madd_v2r8(dy33,fscal,fjy3);
+             fjz3             = _fjsp_madd_v2r8(dz33,fscal,fjz3);
+             gmx_fjsp_decrement_4rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+             /* Inner loop uses 434 flops */
+         }
+         /* End of innermost loop */
+         gmx_fjsp_update_iforce_4atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                               f+i_coord_offset,fshift+i_shift_offset);
+         ggid                        = gid[iidx];
+         /* Update potential energies */
+         gmx_fjsp_update_1pot_v2r8(velecsum,kernel_data->energygrp_elec+ggid);
+         gmx_fjsp_update_1pot_v2r8(vvdwsum,kernel_data->energygrp_vdw+ggid);
+         /* Increment number of inner iterations */
+         inneriter                  += j_index_end - j_index_start;
+         /* Outer loop uses 26 flops */
+     }
+     /* Increment number of outer iterations */
+     outeriter        += nri;
+     /* Update outer/inner flops */
+     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_VF,outeriter*26 + inneriter*434);
+ }
+ /*
+  * Gromacs nonbonded kernel:   nb_kernel_ElecEw_VdwLJ_GeomW4W4_F_sparc64_hpc_ace_double
+  * Electrostatics interaction: Ewald
+  * VdW interaction:            LennardJones
+  * Geometry:                   Water4-Water4
+  * Calculate force/pot:        Force
+  */
+ void
+ nb_kernel_ElecEw_VdwLJ_GeomW4W4_F_sparc64_hpc_ace_double
+                     (t_nblist * gmx_restrict                nlist,
+                      rvec * gmx_restrict                    xx,
+                      rvec * gmx_restrict                    ff,
+                      t_forcerec * gmx_restrict              fr,
+                      t_mdatoms * gmx_restrict               mdatoms,
+                      nb_kernel_data_t * gmx_restrict        kernel_data,
+                      t_nrnb * gmx_restrict                  nrnb)
+ {
+     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+      * just 0 for non-waters.
+      * Suffixes A,B refer to j loop unrolling done with double precision SIMD, i.e. the two different
+      * jnr indices whose data are placed in the two positions of the SIMD register.
+      */
+     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+     int              jnrA,jnrB;
+     int              j_coord_offsetA,j_coord_offsetB;
+     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+     real             rcutoff_scalar;
+     real             *shiftvec,*fshift,*x,*f;
+     _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+     int              vdwioffset0;
+     _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+     int              vdwioffset1;
+     _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+     int              vdwioffset2;
+     _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+     int              vdwioffset3;
+     _fjsp_v2r8       ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+     int              vdwjidx0A,vdwjidx0B;
+     _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+     int              vdwjidx1A,vdwjidx1B;
+     _fjsp_v2r8       jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+     int              vdwjidx2A,vdwjidx2B;
+     _fjsp_v2r8       jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+     int              vdwjidx3A,vdwjidx3B;
+     _fjsp_v2r8       jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
+     _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+     _fjsp_v2r8       dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+     _fjsp_v2r8       dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+     _fjsp_v2r8       dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
+     _fjsp_v2r8       dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+     _fjsp_v2r8       dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+     _fjsp_v2r8       dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
+     _fjsp_v2r8       dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
+     _fjsp_v2r8       dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
+     _fjsp_v2r8       dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
+     _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+     real             *charge;
+     int              nvdwtype;
+     _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+     int              *vdwtype;
+     real             *vdwparam;
+     _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+     _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+     _fjsp_v2r8       ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+     real             *ewtab;
+     _fjsp_v2r8       itab_tmp;
+     _fjsp_v2r8       dummy_mask,cutoff_mask;
+     _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+     _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+     union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+     x                = xx[0];
+     f                = ff[0];
+     nri              = nlist->nri;
+     iinr             = nlist->iinr;
+     jindex           = nlist->jindex;
+     jjnr             = nlist->jjnr;
+     shiftidx         = nlist->shift;
+     gid              = nlist->gid;
+     shiftvec         = fr->shift_vec[0];
+     fshift           = fr->fshift[0];
+     facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+     charge           = mdatoms->chargeA;
+     nvdwtype         = fr->ntype;
+     vdwparam         = fr->nbfp;
+     vdwtype          = mdatoms->typeA;
+     sh_ewald         = gmx_fjsp_set1_v2r8(fr->ic->sh_ewald);
+     ewtab            = fr->ic->tabq_coul_F;
+     ewtabscale       = gmx_fjsp_set1_v2r8(fr->ic->tabq_scale);
+     ewtabhalfspace   = gmx_fjsp_set1_v2r8(0.5/fr->ic->tabq_scale);
+     /* Setup water-specific parameters */
+     inr              = nlist->iinr[0];
+     iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+     iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+     iq3              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+3]));
+     vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+     jq1              = gmx_fjsp_set1_v2r8(charge[inr+1]);
+     jq2              = gmx_fjsp_set1_v2r8(charge[inr+2]);
+     jq3              = gmx_fjsp_set1_v2r8(charge[inr+3]);
+     vdwjidx0A        = 2*vdwtype[inr+0];
+     c6_00            = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset0+vdwjidx0A]);
+     c12_00           = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset0+vdwjidx0A+1]);
+     qq11             = _fjsp_mul_v2r8(iq1,jq1);
+     qq12             = _fjsp_mul_v2r8(iq1,jq2);
+     qq13             = _fjsp_mul_v2r8(iq1,jq3);
+     qq21             = _fjsp_mul_v2r8(iq2,jq1);
+     qq22             = _fjsp_mul_v2r8(iq2,jq2);
+     qq23             = _fjsp_mul_v2r8(iq2,jq3);
+     qq31             = _fjsp_mul_v2r8(iq3,jq1);
+     qq32             = _fjsp_mul_v2r8(iq3,jq2);
+     qq33             = _fjsp_mul_v2r8(iq3,jq3);
+     /* Avoid stupid compiler warnings */
+     jnrA = jnrB = 0;
+     j_coord_offsetA = 0;
+     j_coord_offsetB = 0;
+     outeriter        = 0;
+     inneriter        = 0;
+     /* Start outer loop over neighborlists */
+     for(iidx=0; iidx<nri; iidx++)
+     {
+         /* Load shift vector for this list */
+         i_shift_offset   = DIM*shiftidx[iidx];
+         /* Load limits for loop over neighbors */
+         j_index_start    = jindex[iidx];
+         j_index_end      = jindex[iidx+1];
+         /* Get outer coordinate index */
+         inr              = iinr[iidx];
+         i_coord_offset   = DIM*inr;
+         /* Load i particle coords and add shift vector */
+         gmx_fjsp_load_shift_and_4rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                  &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+         fix0             = _fjsp_setzero_v2r8();
+         fiy0             = _fjsp_setzero_v2r8();
+         fiz0             = _fjsp_setzero_v2r8();
+         fix1             = _fjsp_setzero_v2r8();
+         fiy1             = _fjsp_setzero_v2r8();
+         fiz1             = _fjsp_setzero_v2r8();
+         fix2             = _fjsp_setzero_v2r8();
+         fiy2             = _fjsp_setzero_v2r8();
+         fiz2             = _fjsp_setzero_v2r8();
+         fix3             = _fjsp_setzero_v2r8();
+         fiy3             = _fjsp_setzero_v2r8();
+         fiz3             = _fjsp_setzero_v2r8();
+         /* Start inner kernel loop */
+         for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+         {
+             /* Get j neighbor index, and coordinate index */
+             jnrA             = jjnr[jidx];
+             jnrB             = jjnr[jidx+1];
+             j_coord_offsetA  = DIM*jnrA;
+             j_coord_offsetB  = DIM*jnrB;
+             /* load j atom coordinates */
+             gmx_fjsp_load_4rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
+                                               &jy2,&jz2,&jx3,&jy3,&jz3);
+             /* Calculate displacement vector */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             dx11             = _fjsp_sub_v2r8(ix1,jx1);
+             dy11             = _fjsp_sub_v2r8(iy1,jy1);
+             dz11             = _fjsp_sub_v2r8(iz1,jz1);
+             dx12             = _fjsp_sub_v2r8(ix1,jx2);
+             dy12             = _fjsp_sub_v2r8(iy1,jy2);
+             dz12             = _fjsp_sub_v2r8(iz1,jz2);
+             dx13             = _fjsp_sub_v2r8(ix1,jx3);
+             dy13             = _fjsp_sub_v2r8(iy1,jy3);
+             dz13             = _fjsp_sub_v2r8(iz1,jz3);
+             dx21             = _fjsp_sub_v2r8(ix2,jx1);
+             dy21             = _fjsp_sub_v2r8(iy2,jy1);
+             dz21             = _fjsp_sub_v2r8(iz2,jz1);
+             dx22             = _fjsp_sub_v2r8(ix2,jx2);
+             dy22             = _fjsp_sub_v2r8(iy2,jy2);
+             dz22             = _fjsp_sub_v2r8(iz2,jz2);
+             dx23             = _fjsp_sub_v2r8(ix2,jx3);
+             dy23             = _fjsp_sub_v2r8(iy2,jy3);
+             dz23             = _fjsp_sub_v2r8(iz2,jz3);
+             dx31             = _fjsp_sub_v2r8(ix3,jx1);
+             dy31             = _fjsp_sub_v2r8(iy3,jy1);
+             dz31             = _fjsp_sub_v2r8(iz3,jz1);
+             dx32             = _fjsp_sub_v2r8(ix3,jx2);
+             dy32             = _fjsp_sub_v2r8(iy3,jy2);
+             dz32             = _fjsp_sub_v2r8(iz3,jz2);
+             dx33             = _fjsp_sub_v2r8(ix3,jx3);
+             dy33             = _fjsp_sub_v2r8(iy3,jy3);
+             dz33             = _fjsp_sub_v2r8(iz3,jz3);
+             /* Calculate squared distance and things based on it */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+             rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+             rsq13            = gmx_fjsp_calc_rsq_v2r8(dx13,dy13,dz13);
+             rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+             rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+             rsq23            = gmx_fjsp_calc_rsq_v2r8(dx23,dy23,dz23);
+             rsq31            = gmx_fjsp_calc_rsq_v2r8(dx31,dy31,dz31);
+             rsq32            = gmx_fjsp_calc_rsq_v2r8(dx32,dy32,dz32);
+             rsq33            = gmx_fjsp_calc_rsq_v2r8(dx33,dy33,dz33);
+             rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+             rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+             rinv13           = gmx_fjsp_invsqrt_v2r8(rsq13);
+             rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+             rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+             rinv23           = gmx_fjsp_invsqrt_v2r8(rsq23);
+             rinv31           = gmx_fjsp_invsqrt_v2r8(rsq31);
+             rinv32           = gmx_fjsp_invsqrt_v2r8(rsq32);
+             rinv33           = gmx_fjsp_invsqrt_v2r8(rsq33);
+             rinvsq00         = gmx_fjsp_inv_v2r8(rsq00);
+             rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+             rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+             rinvsq13         = _fjsp_mul_v2r8(rinv13,rinv13);
+             rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+             rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+             rinvsq23         = _fjsp_mul_v2r8(rinv23,rinv23);
+             rinvsq31         = _fjsp_mul_v2r8(rinv31,rinv31);
+             rinvsq32         = _fjsp_mul_v2r8(rinv32,rinv32);
+             rinvsq33         = _fjsp_mul_v2r8(rinv33,rinv33);
+             fjx0             = _fjsp_setzero_v2r8();
+             fjy0             = _fjsp_setzero_v2r8();
+             fjz0             = _fjsp_setzero_v2r8();
+             fjx1             = _fjsp_setzero_v2r8();
+             fjy1             = _fjsp_setzero_v2r8();
+             fjz1             = _fjsp_setzero_v2r8();
+             fjx2             = _fjsp_setzero_v2r8();
+             fjy2             = _fjsp_setzero_v2r8();
+             fjz2             = _fjsp_setzero_v2r8();
+             fjx3             = _fjsp_setzero_v2r8();
+             fjy3             = _fjsp_setzero_v2r8();
+             fjz3             = _fjsp_setzero_v2r8();
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* LENNARD-JONES DISPERSION/REPULSION */
+             rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+             fvdw             = _fjsp_mul_v2r8(_fjsp_msub_v2r8(c12_00,rinvsix,c6_00),_fjsp_mul_v2r8(rinvsix,rinvsq00));
+             fscal            = fvdw;
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             
+             fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r11              = _fjsp_mul_v2r8(rsq11,rinv11);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r11,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                          &ewtabF,&ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq11,rinv11),_fjsp_sub_v2r8(rinvsq11,felec));
+             fscal            = felec;
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+             
+             fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r12              = _fjsp_mul_v2r8(rsq12,rinv12);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r12,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                          &ewtabF,&ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq12,rinv12),_fjsp_sub_v2r8(rinvsq12,felec));
+             fscal            = felec;
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+             
+             fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r13              = _fjsp_mul_v2r8(rsq13,rinv13);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r13,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                          &ewtabF,&ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq13,rinv13),_fjsp_sub_v2r8(rinvsq13,felec));
+             fscal            = felec;
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx13,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy13,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz13,fscal,fiz1);
+             
+             fjx3             = _fjsp_madd_v2r8(dx13,fscal,fjx3);
+             fjy3             = _fjsp_madd_v2r8(dy13,fscal,fjy3);
+             fjz3             = _fjsp_madd_v2r8(dz13,fscal,fjz3);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r21              = _fjsp_mul_v2r8(rsq21,rinv21);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r21,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                          &ewtabF,&ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq21,rinv21),_fjsp_sub_v2r8(rinvsq21,felec));
+             fscal            = felec;
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+             
+             fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r22              = _fjsp_mul_v2r8(rsq22,rinv22);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r22,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                          &ewtabF,&ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq22,rinv22),_fjsp_sub_v2r8(rinvsq22,felec));
+             fscal            = felec;
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+             
+             fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r23              = _fjsp_mul_v2r8(rsq23,rinv23);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r23,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                          &ewtabF,&ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq23,rinv23),_fjsp_sub_v2r8(rinvsq23,felec));
+             fscal            = felec;
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx23,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy23,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz23,fscal,fiz2);
+             
+             fjx3             = _fjsp_madd_v2r8(dx23,fscal,fjx3);
+             fjy3             = _fjsp_madd_v2r8(dy23,fscal,fjy3);
+             fjz3             = _fjsp_madd_v2r8(dz23,fscal,fjz3);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r31              = _fjsp_mul_v2r8(rsq31,rinv31);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r31,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                          &ewtabF,&ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq31,rinv31),_fjsp_sub_v2r8(rinvsq31,felec));
+             fscal            = felec;
+             /* Update vectorial force */
+             fix3             = _fjsp_madd_v2r8(dx31,fscal,fix3);
+             fiy3             = _fjsp_madd_v2r8(dy31,fscal,fiy3);
+             fiz3             = _fjsp_madd_v2r8(dz31,fscal,fiz3);
+             
+             fjx1             = _fjsp_madd_v2r8(dx31,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy31,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz31,fscal,fjz1);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r32              = _fjsp_mul_v2r8(rsq32,rinv32);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r32,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                          &ewtabF,&ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq32,rinv32),_fjsp_sub_v2r8(rinvsq32,felec));
+             fscal            = felec;
+             /* Update vectorial force */
+             fix3             = _fjsp_madd_v2r8(dx32,fscal,fix3);
+             fiy3             = _fjsp_madd_v2r8(dy32,fscal,fiy3);
+             fiz3             = _fjsp_madd_v2r8(dz32,fscal,fiz3);
+             
+             fjx2             = _fjsp_madd_v2r8(dx32,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy32,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz32,fscal,fjz2);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r33              = _fjsp_mul_v2r8(rsq33,rinv33);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r33,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                          &ewtabF,&ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq33,rinv33),_fjsp_sub_v2r8(rinvsq33,felec));
+             fscal            = felec;
+             /* Update vectorial force */
+             fix3             = _fjsp_madd_v2r8(dx33,fscal,fix3);
+             fiy3             = _fjsp_madd_v2r8(dy33,fscal,fiy3);
+             fiz3             = _fjsp_madd_v2r8(dz33,fscal,fiz3);
+             
+             fjx3             = _fjsp_madd_v2r8(dx33,fscal,fjx3);
+             fjy3             = _fjsp_madd_v2r8(dy33,fscal,fjy3);
+             fjz3             = _fjsp_madd_v2r8(dz33,fscal,fjz3);
+             gmx_fjsp_decrement_4rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+             /* Inner loop uses 384 flops */
+         }
+         if(jidx<j_index_end)
+         {
+             jnrA             = jjnr[jidx];
+             j_coord_offsetA  = DIM*jnrA;
+             /* load j atom coordinates */
+             gmx_fjsp_load_4rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
+                                               &jy2,&jz2,&jx3,&jy3,&jz3);
+             /* Calculate displacement vector */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             dx11             = _fjsp_sub_v2r8(ix1,jx1);
+             dy11             = _fjsp_sub_v2r8(iy1,jy1);
+             dz11             = _fjsp_sub_v2r8(iz1,jz1);
+             dx12             = _fjsp_sub_v2r8(ix1,jx2);
+             dy12             = _fjsp_sub_v2r8(iy1,jy2);
+             dz12             = _fjsp_sub_v2r8(iz1,jz2);
+             dx13             = _fjsp_sub_v2r8(ix1,jx3);
+             dy13             = _fjsp_sub_v2r8(iy1,jy3);
+             dz13             = _fjsp_sub_v2r8(iz1,jz3);
+             dx21             = _fjsp_sub_v2r8(ix2,jx1);
+             dy21             = _fjsp_sub_v2r8(iy2,jy1);
+             dz21             = _fjsp_sub_v2r8(iz2,jz1);
+             dx22             = _fjsp_sub_v2r8(ix2,jx2);
+             dy22             = _fjsp_sub_v2r8(iy2,jy2);
+             dz22             = _fjsp_sub_v2r8(iz2,jz2);
+             dx23             = _fjsp_sub_v2r8(ix2,jx3);
+             dy23             = _fjsp_sub_v2r8(iy2,jy3);
+             dz23             = _fjsp_sub_v2r8(iz2,jz3);
+             dx31             = _fjsp_sub_v2r8(ix3,jx1);
+             dy31             = _fjsp_sub_v2r8(iy3,jy1);
+             dz31             = _fjsp_sub_v2r8(iz3,jz1);
+             dx32             = _fjsp_sub_v2r8(ix3,jx2);
+             dy32             = _fjsp_sub_v2r8(iy3,jy2);
+             dz32             = _fjsp_sub_v2r8(iz3,jz2);
+             dx33             = _fjsp_sub_v2r8(ix3,jx3);
+             dy33             = _fjsp_sub_v2r8(iy3,jy3);
+             dz33             = _fjsp_sub_v2r8(iz3,jz3);
+             /* Calculate squared distance and things based on it */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+             rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+             rsq13            = gmx_fjsp_calc_rsq_v2r8(dx13,dy13,dz13);
+             rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+             rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+             rsq23            = gmx_fjsp_calc_rsq_v2r8(dx23,dy23,dz23);
+             rsq31            = gmx_fjsp_calc_rsq_v2r8(dx31,dy31,dz31);
+             rsq32            = gmx_fjsp_calc_rsq_v2r8(dx32,dy32,dz32);
+             rsq33            = gmx_fjsp_calc_rsq_v2r8(dx33,dy33,dz33);
+             rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+             rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+             rinv13           = gmx_fjsp_invsqrt_v2r8(rsq13);
+             rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+             rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+             rinv23           = gmx_fjsp_invsqrt_v2r8(rsq23);
+             rinv31           = gmx_fjsp_invsqrt_v2r8(rsq31);
+             rinv32           = gmx_fjsp_invsqrt_v2r8(rsq32);
+             rinv33           = gmx_fjsp_invsqrt_v2r8(rsq33);
+             rinvsq00         = gmx_fjsp_inv_v2r8(rsq00);
+             rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+             rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+             rinvsq13         = _fjsp_mul_v2r8(rinv13,rinv13);
+             rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+             rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+             rinvsq23         = _fjsp_mul_v2r8(rinv23,rinv23);
+             rinvsq31         = _fjsp_mul_v2r8(rinv31,rinv31);
+             rinvsq32         = _fjsp_mul_v2r8(rinv32,rinv32);
+             rinvsq33         = _fjsp_mul_v2r8(rinv33,rinv33);
+             fjx0             = _fjsp_setzero_v2r8();
+             fjy0             = _fjsp_setzero_v2r8();
+             fjz0             = _fjsp_setzero_v2r8();
+             fjx1             = _fjsp_setzero_v2r8();
+             fjy1             = _fjsp_setzero_v2r8();
+             fjz1             = _fjsp_setzero_v2r8();
+             fjx2             = _fjsp_setzero_v2r8();
+             fjy2             = _fjsp_setzero_v2r8();
+             fjz2             = _fjsp_setzero_v2r8();
+             fjx3             = _fjsp_setzero_v2r8();
+             fjy3             = _fjsp_setzero_v2r8();
+             fjz3             = _fjsp_setzero_v2r8();
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* LENNARD-JONES DISPERSION/REPULSION */
+             rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+             fvdw             = _fjsp_mul_v2r8(_fjsp_msub_v2r8(c12_00,rinvsix,c6_00),_fjsp_mul_v2r8(rinvsix,rinvsq00));
+             fscal            = fvdw;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             
+             fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r11              = _fjsp_mul_v2r8(rsq11,rinv11);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r11,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq11,rinv11),_fjsp_sub_v2r8(rinvsq11,felec));
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+             
+             fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r12              = _fjsp_mul_v2r8(rsq12,rinv12);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r12,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq12,rinv12),_fjsp_sub_v2r8(rinvsq12,felec));
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+             
+             fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r13              = _fjsp_mul_v2r8(rsq13,rinv13);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r13,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq13,rinv13),_fjsp_sub_v2r8(rinvsq13,felec));
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx13,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy13,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz13,fscal,fiz1);
+             
+             fjx3             = _fjsp_madd_v2r8(dx13,fscal,fjx3);
+             fjy3             = _fjsp_madd_v2r8(dy13,fscal,fjy3);
+             fjz3             = _fjsp_madd_v2r8(dz13,fscal,fjz3);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r21              = _fjsp_mul_v2r8(rsq21,rinv21);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r21,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq21,rinv21),_fjsp_sub_v2r8(rinvsq21,felec));
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+             
+             fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r22              = _fjsp_mul_v2r8(rsq22,rinv22);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r22,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq22,rinv22),_fjsp_sub_v2r8(rinvsq22,felec));
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+             
+             fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r23              = _fjsp_mul_v2r8(rsq23,rinv23);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r23,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq23,rinv23),_fjsp_sub_v2r8(rinvsq23,felec));
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx23,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy23,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz23,fscal,fiz2);
+             
+             fjx3             = _fjsp_madd_v2r8(dx23,fscal,fjx3);
+             fjy3             = _fjsp_madd_v2r8(dy23,fscal,fjy3);
+             fjz3             = _fjsp_madd_v2r8(dz23,fscal,fjz3);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r31              = _fjsp_mul_v2r8(rsq31,rinv31);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r31,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq31,rinv31),_fjsp_sub_v2r8(rinvsq31,felec));
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix3             = _fjsp_madd_v2r8(dx31,fscal,fix3);
+             fiy3             = _fjsp_madd_v2r8(dy31,fscal,fiy3);
+             fiz3             = _fjsp_madd_v2r8(dz31,fscal,fiz3);
+             
+             fjx1             = _fjsp_madd_v2r8(dx31,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy31,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz31,fscal,fjz1);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r32              = _fjsp_mul_v2r8(rsq32,rinv32);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r32,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq32,rinv32),_fjsp_sub_v2r8(rinvsq32,felec));
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix3             = _fjsp_madd_v2r8(dx32,fscal,fix3);
+             fiy3             = _fjsp_madd_v2r8(dy32,fscal,fiy3);
+             fiz3             = _fjsp_madd_v2r8(dz32,fscal,fiz3);
+             
+             fjx2             = _fjsp_madd_v2r8(dx32,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy32,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz32,fscal,fjz2);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r33              = _fjsp_mul_v2r8(rsq33,rinv33);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r33,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq33,rinv33),_fjsp_sub_v2r8(rinvsq33,felec));
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix3             = _fjsp_madd_v2r8(dx33,fscal,fix3);
+             fiy3             = _fjsp_madd_v2r8(dy33,fscal,fiy3);
+             fiz3             = _fjsp_madd_v2r8(dz33,fscal,fiz3);
+             
+             fjx3             = _fjsp_madd_v2r8(dx33,fscal,fjx3);
+             fjy3             = _fjsp_madd_v2r8(dy33,fscal,fjy3);
+             fjz3             = _fjsp_madd_v2r8(dz33,fscal,fjz3);
+             gmx_fjsp_decrement_4rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+             /* Inner loop uses 384 flops */
+         }
+         /* End of innermost loop */
+         gmx_fjsp_update_iforce_4atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                               f+i_coord_offset,fshift+i_shift_offset);
+         /* Increment number of inner iterations */
+         inneriter                  += j_index_end - j_index_start;
+         /* Outer loop uses 24 flops */
+     }
+     /* Increment number of outer iterations */
+     outeriter        += nri;
+     /* Update outer/inner flops */
+     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_F,outeriter*24 + inneriter*384);
+ }
index 0000000000000000000000000000000000000000,a26a79ba9f84b31c5821c7595227783fa62a0e6e..a26a79ba9f84b31c5821c7595227783fa62a0e6e
mode 000000,100644..100644
--- /dev/null
@@@ -1,0 -1,551 +1,551 @@@
+ /*
+  * This file is part of the GROMACS molecular simulation package.
+  *
+  * Copyright (c) 2012, by the GROMACS development team, led by
+  * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+  * others, as listed in the AUTHORS file in the top-level source
+  * directory and at http://www.gromacs.org.
+  *
+  * GROMACS is free software; you can redistribute it and/or
+  * modify it under the terms of the GNU Lesser General Public License
+  * as published by the Free Software Foundation; either version 2.1
+  * of the License, or (at your option) any later version.
+  *
+  * GROMACS is distributed in the hope that it will be useful,
+  * but WITHOUT ANY WARRANTY; without even the implied warranty of
+  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  * Lesser General Public License for more details.
+  *
+  * You should have received a copy of the GNU Lesser General Public
+  * License along with GROMACS; if not, see
+  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+  *
+  * If you want to redistribute modifications to GROMACS, please
+  * consider that scientific software is very special. Version
+  * control is crucial - bugs must be traceable. We will be happy to
+  * consider code for inclusion in the official distribution, but
+  * derived work must not be called official GROMACS. Details are found
+  * in the README & COPYING files - if they are missing, get the
+  * official version at http://www.gromacs.org.
+  *
+  * To help us fund GROMACS development, we humbly ask that you cite
+  * the research papers on the package. Check out http://www.gromacs.org.
+  */
+ /*
+  * Note: this file was generated by the GROMACS sparc64_hpc_ace_double kernel generator.
+  */
+ #ifdef HAVE_CONFIG_H
+ #include <config.h>
+ #endif
+ #include <math.h>
+ #include "../nb_kernel.h"
+ #include "types/simple.h"
+ #include "vec.h"
+ #include "nrnb.h"
+ #include "kernelutil_sparc64_hpc_ace_double.h"
+ /*
+  * Gromacs nonbonded kernel:   nb_kernel_ElecEw_VdwNone_GeomP1P1_VF_sparc64_hpc_ace_double
+  * Electrostatics interaction: Ewald
+  * VdW interaction:            None
+  * Geometry:                   Particle-Particle
+  * Calculate force/pot:        PotentialAndForce
+  *
+  * For every i particle in the neighbor list the kernel walks its j
+  * neighbors two at a time (one per position of the 2-wide double SIMD
+  * type), evaluates the tabulated Ewald electrostatic potential and force
+  * scalar, accumulates the force on the i particle, hands the j force
+  * contribution to the "decrement" helpers, and finally passes the summed
+  * potential to the energy-group helper. A leftover single j neighbor is
+  * handled after the paired loop.
+  *
+  * Parameters:
+  *   nlist       - neighbor list; nri, iinr, jindex, jjnr, shift and gid are read
+  *   xx          - coordinates, accessed as a flat real array through xx[0]
+  *   ff          - forces, accessed as a flat real array through ff[0]
+  *   fr          - force record; shift_vec, fshift, epsfac and the ic fields
+  *                 sh_ewald, tabq_coul_FDV0 and tabq_scale are read
+  *   mdatoms     - supplies the per-atom charge array chargeA
+  *   kernel_data - energygrp_elec is passed to the potential update helper
+  *   nrnb        - flop accounting, incremented once at the end
+  */
+ void
+ nb_kernel_ElecEw_VdwNone_GeomP1P1_VF_sparc64_hpc_ace_double
+                     (t_nblist * gmx_restrict                nlist,
+                      rvec * gmx_restrict                    xx,
+                      rvec * gmx_restrict                    ff,
+                      t_forcerec * gmx_restrict              fr,
+                      t_mdatoms * gmx_restrict               mdatoms,
+                      nb_kernel_data_t * gmx_restrict        kernel_data,
+                      t_nrnb * gmx_restrict                  nrnb)
+ {
+     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+      * just 0 for non-waters.
+      * Suffixes A,B refer to the j loop unrolling done with double precision SIMD, i.e. the two
+      * different jnr indices whose data is put in the two positions of the SIMD register.
+      * Note: many of the locals declared below are never referenced in this kernel
+      * (presumably they come from a template shared by the kernel generator).
+      */
+     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+     int              jnrA,jnrB;
+     int              j_coord_offsetA,j_coord_offsetB;
+     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+     real             rcutoff_scalar;
+     real             *shiftvec,*fshift,*x,*f;
+     _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+     int              vdwioffset0;
+     _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+     int              vdwjidx0A,vdwjidx0B;
+     _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+     _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+     _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+     real             *charge;
+     _fjsp_v2r8       ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+     real             *ewtab;
+     _fjsp_v2r8       itab_tmp;
+     _fjsp_v2r8       dummy_mask,cutoff_mask;
+     _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+     _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+     union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv; /* only ewconv is used below */
+     x                = xx[0];
+     f                = ff[0];
+     nri              = nlist->nri;
+     iinr             = nlist->iinr;
+     jindex           = nlist->jindex;
+     jjnr             = nlist->jjnr;
+     shiftidx         = nlist->shift;
+     gid              = nlist->gid;
+     shiftvec         = fr->shift_vec[0];
+     fshift           = fr->fshift[0];
+     facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+     charge           = mdatoms->chargeA;
+     sh_ewald         = gmx_fjsp_set1_v2r8(fr->ic->sh_ewald); /* loaded but not referenced again in this kernel */
+     ewtab            = fr->ic->tabq_coul_FDV0;
+     ewtabscale       = gmx_fjsp_set1_v2r8(fr->ic->tabq_scale);
+     ewtabhalfspace   = gmx_fjsp_set1_v2r8(0.5/fr->ic->tabq_scale);
+     /* Give these a defined value up front so the compiler does not warn about them */
+     jnrA = jnrB = 0;
+     j_coord_offsetA = 0;
+     j_coord_offsetB = 0;
+     outeriter        = 0;
+     inneriter        = 0;
+     /* Start outer loop over neighborlists (one i particle per list entry) */
+     for(iidx=0; iidx<nri; iidx++)
+     {
+         /* Load shift vector for this list */
+         i_shift_offset   = DIM*shiftidx[iidx];
+         /* Load limits for loop over neighbors: j entries are jjnr[j_index_start .. j_index_end-1] */
+         j_index_start    = jindex[iidx];
+         j_index_end      = jindex[iidx+1];
+         /* Get outer coordinate index */
+         inr              = iinr[iidx];
+         i_coord_offset   = DIM*inr;
+         /* Load i particle coords and add shift vector */
+         gmx_fjsp_load_shift_and_1rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
+         fix0             = _fjsp_setzero_v2r8();
+         fiy0             = _fjsp_setzero_v2r8();
+         fiz0             = _fjsp_setzero_v2r8();
+         /* Load parameters for i particles: the i charge multiplied by facel (fr->epsfac) */
+         iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_load1_v2r8(charge+inr+0));
+         /* Reset potential sums */
+         velecsum         = _fjsp_setzero_v2r8();
+         /* Start inner kernel loop: two j neighbors (A and B) per iteration; the loop
+          * condition stops before a possible unpaired last entry, which is handled
+          * by the block following the loop.
+          */
+         for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+         {
+             /* Get j neighbor index, and coordinate index */
+             jnrA             = jjnr[jidx];
+             jnrB             = jjnr[jidx+1];
+             j_coord_offsetA  = DIM*jnrA;
+             j_coord_offsetB  = DIM*jnrB;
+             /* load j atom coordinates */
+             gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                               &jx0,&jy0,&jz0);
+             /* Calculate displacement vector */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             /* Calculate squared distance and things based on it */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+             rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+             /* Load parameters for j particles */
+             jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+             /* Compute parameters for interactions between i and j atoms */
+             qq00             = _fjsp_mul_v2r8(iq0,jq0);
+             /* EWALD ELECTROSTATICS
+              * The table terms computed below are subtracted from rinv00 (potential) and
+              * rinvsq00 (force) and the results are scaled by qq00 and qq00*rinv00 respectively.
+              */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer.
+              * eweps is ewrt minus the index converted back to double. The index is stored
+              * through the ewconv union so that each position's integer can be read back,
+              * and the table is addressed with a stride of 4 reals per index.
+              */
+             ewrt             = _fjsp_mul_v2r8(r00,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] ); /* first two table reals for neighbor A */
+             ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] ); /* first two table reals for neighbor B */
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD); /* presumably regroups so each register holds one quantity for both neighbors - TODO confirm against the macro */
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2); /* third table real for neighbor A */
+             ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2); /* third table real for neighbor B; ewtabFn is only scratch here */
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF); /* presumably ewtabF + eweps*ewtabD - confirm intrinsic semantics */
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq00,_fjsp_sub_v2r8(rinv00,velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,rinv00),_fjsp_sub_v2r8(rinvsq00,felec));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             /* Update vectorial force: accumulate displacement times fscal on the i particle */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             
+             gmx_fjsp_decrement_fma_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fscal,dx00,dy00,dz00);
+             /* Inner loop uses 44 flops */
+         }
+         if(jidx<j_index_end) /* one unpaired j neighbor is left: process it alone as neighbor A */
+         {
+             jnrA             = jjnr[jidx];
+             j_coord_offsetA  = DIM*jnrA;
+             /* load j atom coordinates */
+             gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                               &jx0,&jy0,&jz0);
+             /* Calculate displacement vector */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             /* Calculate squared distance and things based on it */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+             rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+             /* Load parameters for j particles */
+             jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+             /* Compute parameters for interactions between i and j atoms */
+             qq00             = _fjsp_mul_v2r8(iq0,jq0);
+             /* EWALD ELECTROSTATICS
+              * Same sequence as in the paired loop, except that only ewconv.i[0] is used
+              * for the table lookup and zeros stand in for the second neighbor's entries.
+              */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r00,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_setzero_v2r8(); /* no neighbor B: zero replaces its table entry */
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_setzero_v2r8(); /* no neighbor B: zero replaces its table entry */
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq00,_fjsp_sub_v2r8(rinv00,velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,rinv00),_fjsp_sub_v2r8(rinvsq00,felec));
+             /* Update potential sum for this i atom from the interaction with this j atom.
+              * NOTE(review): the unpacklo with a zero register presumably keeps the first
+              * position and clears the second, so the unused position adds nothing to the
+              * sums - confirm against the intrinsic documentation.
+              */
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             
+             gmx_fjsp_decrement_fma_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fscal,dx00,dy00,dz00);
+             /* Inner loop uses 44 flops */
+         }
+         /* End of innermost loop: hand the accumulated i force to the helper together with
+          * the i particle's force slot and the shift-force slot of this list entry.
+          */
+         gmx_fjsp_update_iforce_1atom_swizzle_v2r8(fix0,fiy0,fiz0,
+                                               f+i_coord_offset,fshift+i_shift_offset);
+         ggid                        = gid[iidx];
+         /* Update potential energies for the energy group index of this list entry */
+         gmx_fjsp_update_1pot_v2r8(velecsum,kernel_data->energygrp_elec+ggid);
+         /* Increment number of inner iterations */
+         inneriter                  += j_index_end - j_index_start;
+         /* Outer loop uses 8 flops */
+     }
+     /* Increment number of outer iterations */
+     outeriter        += nri;
+     /* Update outer/inner flops: 8 per outer iteration plus 44 per j neighbor */
+     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VF,outeriter*8 + inneriter*44);
+ }
+ /*
+  * Gromacs nonbonded kernel:   nb_kernel_ElecEw_VdwNone_GeomP1P1_F_sparc64_hpc_ace_double
+  * Electrostatics interaction: Ewald
+  * VdW interaction:            None
+  * Geometry:                   Particle-Particle
+  * Calculate force/pot:        Force
+  *
+  * Force-only kernel. For every i entry of the neighbor list it walks the
+  * j neighbors two at a time (one j atom per SIMD lane), derives a scalar
+  * force factor from the tabulated Ewald force table, accumulates the
+  * resulting force on the i atom and hands the same factor to the
+  * gmx_fjsp_decrement_fma_* helpers for the j atoms. An odd trailing
+  * j neighbor is handled by a separate single-lane pass after the loop.
+  * No potential energy is accumulated in this variant.
+  *
+  * Parameters (as used by the code below):
+  *   nlist       - neighbor list; nri, iinr, jindex, jjnr, shift and gid are read.
+  *   xx          - coordinates, accessed as a flat real array through xx[0]
+  *                 with DIM reals per atom.
+  *   ff          - forces, accessed as a flat real array through ff[0];
+  *                 passed to the update/decrement helpers.
+  *   fr          - force record; shift_vec, fshift, epsfac, ic->sh_ewald,
+  *                 ic->tabq_coul_F and ic->tabq_scale are read.
+  *   mdatoms     - only chargeA is read.
+  *   kernel_data - not referenced by this force-only kernel.
+  *   nrnb        - flop accounting, updated once at the end via inc_nrnb().
+  *
+  * NOTE(review): the exact semantics of the _fjsp_* intrinsics and the
+  * gmx_fjsp_* helpers are defined outside this file; comments below that
+  * depend on them are marked as assumptions.
+  */
+ void
+ nb_kernel_ElecEw_VdwNone_GeomP1P1_F_sparc64_hpc_ace_double
+                     (t_nblist * gmx_restrict                nlist,
+                      rvec * gmx_restrict                    xx,
+                      rvec * gmx_restrict                    ff,
+                      t_forcerec * gmx_restrict              fr,
+                      t_mdatoms * gmx_restrict               mdatoms,
+                      nb_kernel_data_t * gmx_restrict        kernel_data,
+                      t_nrnb * gmx_restrict                  nrnb)
+ {
+     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+      * just 0 for non-waters.
+      * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+      * jnr indices corresponding to data put in the two positions in the SIMD register.
+      *
+      * Many of the locals declared below (e.g. tx/ty/tz, rcutoff*, vdw*, isai0/isaj0,
+      * fjx0/fjy0/fjz0, c6_00/c12_00, velec/velecsum, crf/krf/krf2, ewtabD/ewtabV,
+      * dummy_mask/cutoff_mask, vfconv/gbconv, ggid, rcutoff_scalar, jidxall) are never
+      * referenced in this kernel; presumably they come from the shared generator template.
+      */
+     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+     int              jnrA,jnrB;
+     int              j_coord_offsetA,j_coord_offsetB;
+     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+     real             rcutoff_scalar;
+     real             *shiftvec,*fshift,*x,*f;
+     _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+     int              vdwioffset0;
+     _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+     int              vdwjidx0A,vdwjidx0B;
+     _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+     _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+     _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+     real             *charge;
+     _fjsp_v2r8       ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+     real             *ewtab;
+     _fjsp_v2r8       itab_tmp;
+     _fjsp_v2r8       dummy_mask,cutoff_mask;
+     _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);   /* set but not used in this kernel */
+     _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);   /* set but not used in this kernel */
+     union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv; /* ewconv: SIMD register stored to memory and read back as two integer table indices */
+     x                = xx[0];   /* flat view of the coordinate array */
+     f                = ff[0];   /* flat view of the force array */
+     nri              = nlist->nri;
+     iinr             = nlist->iinr;
+     jindex           = nlist->jindex;
+     jjnr             = nlist->jjnr;
+     shiftidx         = nlist->shift;
+     gid              = nlist->gid;   /* read here but not used further in this kernel */
+     shiftvec         = fr->shift_vec[0];
+     fshift           = fr->fshift[0];
+     facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+     charge           = mdatoms->chargeA;
+     sh_ewald         = gmx_fjsp_set1_v2r8(fr->ic->sh_ewald);   /* set but not used in this kernel */
+     ewtab            = fr->ic->tabq_coul_F;   /* force-only Ewald table, indexed directly by the table index */
+     ewtabscale       = gmx_fjsp_set1_v2r8(fr->ic->tabq_scale);
+     ewtabhalfspace   = gmx_fjsp_set1_v2r8(0.5/fr->ic->tabq_scale);   /* set but not used in this kernel */
+     /* Pre-initialize to silence "may be used uninitialized" compiler warnings */
+     jnrA = jnrB = 0;
+     j_coord_offsetA = 0;
+     j_coord_offsetB = 0;
+     outeriter        = 0;
+     inneriter        = 0;
+     /* Start outer loop over neighborlists: one iteration per i entry */
+     for(iidx=0; iidx<nri; iidx++)
+     {
+         /* Offset (in reals) of the shift vector for this list */
+         i_shift_offset   = DIM*shiftidx[iidx];
+         /* Load limits for loop over neighbors: j entries are jjnr[j_index_start .. j_index_end-1] */
+         j_index_start    = jindex[iidx];
+         j_index_end      = jindex[iidx+1];
+         /* Get outer (i atom) index and its offset into the flat coordinate/force arrays */
+         inr              = iinr[iidx];
+         i_coord_offset   = DIM*inr;
+         /* Load i particle coords and add shift vector (helper presumably broadcasts to both lanes) */
+         gmx_fjsp_load_shift_and_1rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
+         /* Reset the per-i-atom force accumulators */
+         fix0             = _fjsp_setzero_v2r8();
+         fiy0             = _fjsp_setzero_v2r8();
+         fiz0             = _fjsp_setzero_v2r8();
+         /* Load parameters for i particles: i charge pre-multiplied by facel (fr->epsfac) */
+         iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_load1_v2r8(charge+inr+0));
+         /* Start inner kernel loop: processes j neighbors in pairs; a leftover odd one is handled after the loop */
+         for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+         {
+             /* Get j neighbor index, and coordinate index */
+             jnrA             = jjnr[jidx];
+             jnrB             = jjnr[jidx+1];
+             j_coord_offsetA  = DIM*jnrA;
+             j_coord_offsetB  = DIM*jnrB;
+             /* load j atom coordinates for both j atoms */
+             gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                               &jx0,&jy0,&jz0);
+             /* Calculate displacement vector (i minus j) */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             /* Calculate squared distance and things based on it */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+             rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+             /* Load parameters for j particles: charges of the two j atoms */
+             jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r00              = _fjsp_mul_v2r8(rsq00,rinv00);   /* r = rsq * (1/r) */
+             /* Compute parameters for interactions between i and j atoms */
+             qq00             = _fjsp_mul_v2r8(iq0,jq0);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer;
+              * eweps is the remaining fractional part. The integer lanes are stored through the
+              * ewconv union so they can be read back as ewconv.i[0] and ewconv.i[1]. */
+             ewrt             = _fjsp_mul_v2r8(r00,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                          &ewtabF,&ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF)); /* looks like (1-eps)*F + eps*Fn, assuming nmsub(a,b,c) = c - a*b -- TODO confirm */
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,rinv00),_fjsp_sub_v2r8(rinvsq00,felec)); /* qq/r * (1/r^2 - table value) */
+             fscal            = felec;
+             /* Update vectorial force on the i atom accumulators */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             
+             gmx_fjsp_decrement_fma_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fscal,dx00,dy00,dz00); /* presumably subtracts fscal*d from both j forces -- confirm in kernelutil header */
+             /* Inner loop uses 39 flops */
+         }
+         if(jidx<j_index_end)   /* one unpaired j neighbor is left over */
+         {
+             jnrA             = jjnr[jidx];
+             j_coord_offsetA  = DIM*jnrA;
+             /* load j atom coordinates for the single remaining j atom */
+             gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                               &jx0,&jy0,&jz0);
+             /* Calculate displacement vector (i minus j) */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             /* Calculate squared distance and things based on it */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+             rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+             /* Load parameters for j particles: single charge, combined with a zero register */
+             jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r00              = _fjsp_mul_v2r8(rsq00,rinv00);   /* r = rsq * (1/r) */
+             /* Compute parameters for interactions between i and j atoms */
+             qq00             = _fjsp_mul_v2r8(iq0,jq0);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer;
+              * only ewconv.i[0] is used for the table lookup in this single-atom pass. */
+             ewrt             = _fjsp_mul_v2r8(r00,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF)); /* same interpolation as in the paired loop */
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,rinv00),_fjsp_sub_v2r8(rinvsq00,felec));
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8()); /* presumably keeps lane 0 and zeroes the unused lane -- TODO confirm */
+             /* Update vectorial force on the i atom accumulators */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             
+             gmx_fjsp_decrement_fma_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fscal,dx00,dy00,dz00); /* single-j counterpart of the 2ptr helper used above */
+             /* Inner loop uses 39 flops */
+         }
+         /* End of innermost loop: hand the accumulated i force to the helper together with
+          * pointers to this atom's force and to the shift-force entry for this list */
+         gmx_fjsp_update_iforce_1atom_swizzle_v2r8(fix0,fiy0,fiz0,
+                                               f+i_coord_offset,fshift+i_shift_offset);
+         /* Increment number of inner iterations (counted per j neighbor, not per SIMD pair) */
+         inneriter                  += j_index_end - j_index_start;
+         /* Outer loop uses 7 flops */
+     }
+     /* Increment number of outer iterations */
+     outeriter        += nri;
+     /* Update outer/inner flops using the per-iteration counts quoted above (7 outer, 39 inner) */
+     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_F,outeriter*7 + inneriter*39);
+ }
index 0000000000000000000000000000000000000000,9716ad04850ece971e4fa7cf35fb9c8b25ef71de..9716ad04850ece971e4fa7cf35fb9c8b25ef71de
mode 000000,100644..100644
--- /dev/null
@@@ -1,0 -1,971 +1,971 @@@
+ /*
+  * This file is part of the GROMACS molecular simulation package.
+  *
+  * Copyright (c) 2012, by the GROMACS development team, led by
+  * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+  * others, as listed in the AUTHORS file in the top-level source
+  * directory and at http://www.gromacs.org.
+  *
+  * GROMACS is free software; you can redistribute it and/or
+  * modify it under the terms of the GNU Lesser General Public License
+  * as published by the Free Software Foundation; either version 2.1
+  * of the License, or (at your option) any later version.
+  *
+  * GROMACS is distributed in the hope that it will be useful,
+  * but WITHOUT ANY WARRANTY; without even the implied warranty of
+  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  * Lesser General Public License for more details.
+  *
+  * You should have received a copy of the GNU Lesser General Public
+  * License along with GROMACS; if not, see
+  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+  *
+  * If you want to redistribute modifications to GROMACS, please
+  * consider that scientific software is very special. Version
+  * control is crucial - bugs must be traceable. We will be happy to
+  * consider code for inclusion in the official distribution, but
+  * derived work must not be called official GROMACS. Details are found
+  * in the README & COPYING files - if they are missing, get the
+  * official version at http://www.gromacs.org.
+  *
+  * To help us fund GROMACS development, we humbly ask that you cite
+  * the research papers on the package. Check out http://www.gromacs.org.
+  */
+ /*
+  * Note: this file was generated by the GROMACS sparc64_hpc_ace_double kernel generator.
+  */
+ #ifdef HAVE_CONFIG_H
+ #include <config.h>
+ #endif
+ #include <math.h>
+ #include "../nb_kernel.h"
+ #include "types/simple.h"
+ #include "vec.h"
+ #include "nrnb.h"
+ #include "kernelutil_sparc64_hpc_ace_double.h"
+ /*
+  * Gromacs nonbonded kernel:   nb_kernel_ElecEw_VdwNone_GeomW3P1_VF_sparc64_hpc_ace_double
+  * Electrostatics interaction: Ewald
+  * VdW interaction:            None
+  * Geometry:                   Water3-Particle
+  * Calculate force/pot:        PotentialAndForce
+  */
+ void
+ nb_kernel_ElecEw_VdwNone_GeomW3P1_VF_sparc64_hpc_ace_double
+                     (t_nblist * gmx_restrict                nlist,
+                      rvec * gmx_restrict                    xx,
+                      rvec * gmx_restrict                    ff,
+                      t_forcerec * gmx_restrict              fr,
+                      t_mdatoms * gmx_restrict               mdatoms,
+                      nb_kernel_data_t * gmx_restrict        kernel_data,
+                      t_nrnb * gmx_restrict                  nrnb)
+ {
+     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+      * just 0 for non-waters.
+      * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+      * jnr indices corresponding to data put in the four positions in the SIMD register.
+      */
+     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+     int              jnrA,jnrB;
+     int              j_coord_offsetA,j_coord_offsetB;
+     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+     real             rcutoff_scalar;
+     real             *shiftvec,*fshift,*x,*f;
+     _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+     int              vdwioffset0;
+     _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+     int              vdwioffset1;
+     _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+     int              vdwioffset2;
+     _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+     int              vdwjidx0A,vdwjidx0B;
+     _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+     _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+     _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+     _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+     _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+     real             *charge;
+     _fjsp_v2r8       ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+     real             *ewtab;
+     _fjsp_v2r8       itab_tmp;
+     _fjsp_v2r8       dummy_mask,cutoff_mask;
+     _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+     _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+     union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+     x                = xx[0];
+     f                = ff[0];
+     nri              = nlist->nri;
+     iinr             = nlist->iinr;
+     jindex           = nlist->jindex;
+     jjnr             = nlist->jjnr;
+     shiftidx         = nlist->shift;
+     gid              = nlist->gid;
+     shiftvec         = fr->shift_vec[0];
+     fshift           = fr->fshift[0];
+     facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+     charge           = mdatoms->chargeA;
+     sh_ewald         = gmx_fjsp_set1_v2r8(fr->ic->sh_ewald);
+     ewtab            = fr->ic->tabq_coul_FDV0;
+     ewtabscale       = gmx_fjsp_set1_v2r8(fr->ic->tabq_scale);
+     ewtabhalfspace   = gmx_fjsp_set1_v2r8(0.5/fr->ic->tabq_scale);
+     /* Setup water-specific parameters */
+     inr              = nlist->iinr[0];
+     iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+0]));
+     iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+     iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+     /* Avoid stupid compiler warnings */
+     jnrA = jnrB = 0;
+     j_coord_offsetA = 0;
+     j_coord_offsetB = 0;
+     outeriter        = 0;
+     inneriter        = 0;
+     /* Start outer loop over neighborlists */
+     for(iidx=0; iidx<nri; iidx++)
+     {
+         /* Load shift vector for this list */
+         i_shift_offset   = DIM*shiftidx[iidx];
+         /* Load limits for loop over neighbors */
+         j_index_start    = jindex[iidx];
+         j_index_end      = jindex[iidx+1];
+         /* Get outer coordinate index */
+         inr              = iinr[iidx];
+         i_coord_offset   = DIM*inr;
+         /* Load i particle coords and add shift vector */
+         gmx_fjsp_load_shift_and_3rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                  &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+         fix0             = _fjsp_setzero_v2r8();
+         fiy0             = _fjsp_setzero_v2r8();
+         fiz0             = _fjsp_setzero_v2r8();
+         fix1             = _fjsp_setzero_v2r8();
+         fiy1             = _fjsp_setzero_v2r8();
+         fiz1             = _fjsp_setzero_v2r8();
+         fix2             = _fjsp_setzero_v2r8();
+         fiy2             = _fjsp_setzero_v2r8();
+         fiz2             = _fjsp_setzero_v2r8();
+         /* Reset potential sums */
+         velecsum         = _fjsp_setzero_v2r8();
+         /* Start inner kernel loop */
+         for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+         {
+             /* Get j neighbor index, and coordinate index */
+             jnrA             = jjnr[jidx];
+             jnrB             = jjnr[jidx+1];
+             j_coord_offsetA  = DIM*jnrA;
+             j_coord_offsetB  = DIM*jnrB;
+             /* load j atom coordinates */
+             gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                               &jx0,&jy0,&jz0);
+             /* Calculate displacement vector */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             dx10             = _fjsp_sub_v2r8(ix1,jx0);
+             dy10             = _fjsp_sub_v2r8(iy1,jy0);
+             dz10             = _fjsp_sub_v2r8(iz1,jz0);
+             dx20             = _fjsp_sub_v2r8(ix2,jx0);
+             dy20             = _fjsp_sub_v2r8(iy2,jy0);
+             dz20             = _fjsp_sub_v2r8(iz2,jz0);
+             /* Calculate squared distance and things based on it */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+             rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+             rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+             rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+             rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+             rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+             rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+             rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+             /* Load parameters for j particles */
+             jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+             fjx0             = _fjsp_setzero_v2r8();
+             fjy0             = _fjsp_setzero_v2r8();
+             fjz0             = _fjsp_setzero_v2r8();
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+             /* Compute parameters for interactions between i and j atoms */
+             qq00             = _fjsp_mul_v2r8(iq0,jq0);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r00,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq00,_fjsp_sub_v2r8(rinv00,velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,rinv00),_fjsp_sub_v2r8(rinvsq00,felec));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             
+             fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+             /* Compute parameters for interactions between i and j atoms */
+             qq10             = _fjsp_mul_v2r8(iq1,jq0);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r10,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq10,_fjsp_sub_v2r8(rinv10,velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,rinv10),_fjsp_sub_v2r8(rinvsq10,felec));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+             
+             fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+             /* Compute parameters for interactions between i and j atoms */
+             qq20             = _fjsp_mul_v2r8(iq2,jq0);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r20,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq20,_fjsp_sub_v2r8(rinv20,velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,rinv20),_fjsp_sub_v2r8(rinvsq20,felec));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+             
+             fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+             gmx_fjsp_decrement_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0);
+             /* Inner loop uses 135 flops */
+         }
+         if(jidx<j_index_end)
+         {
+             jnrA             = jjnr[jidx];
+             j_coord_offsetA  = DIM*jnrA;
+             /* load j atom coordinates */
+             gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                               &jx0,&jy0,&jz0);
+             /* Calculate displacement vector */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             dx10             = _fjsp_sub_v2r8(ix1,jx0);
+             dy10             = _fjsp_sub_v2r8(iy1,jy0);
+             dz10             = _fjsp_sub_v2r8(iz1,jz0);
+             dx20             = _fjsp_sub_v2r8(ix2,jx0);
+             dy20             = _fjsp_sub_v2r8(iy2,jy0);
+             dz20             = _fjsp_sub_v2r8(iz2,jz0);
+             /* Calculate squared distance and things based on it */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+             rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+             rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+             rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+             rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+             rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+             rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+             rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+             /* Load parameters for j particles */
+             jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
+             fjx0             = _fjsp_setzero_v2r8();
+             fjy0             = _fjsp_setzero_v2r8();
+             fjz0             = _fjsp_setzero_v2r8();
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+             /* Compute parameters for interactions between i and j atoms */
+             qq00             = _fjsp_mul_v2r8(iq0,jq0);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r00,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq00,_fjsp_sub_v2r8(rinv00,velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,rinv00),_fjsp_sub_v2r8(rinvsq00,felec));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             
+             fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+             /* Compute parameters for interactions between i and j atoms */
+             qq10             = _fjsp_mul_v2r8(iq1,jq0);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r10,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq10,_fjsp_sub_v2r8(rinv10,velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,rinv10),_fjsp_sub_v2r8(rinvsq10,felec));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+             
+             fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+             /* Compute parameters for interactions between i and j atoms */
+             qq20             = _fjsp_mul_v2r8(iq2,jq0);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r20,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq20,_fjsp_sub_v2r8(rinv20,velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,rinv20),_fjsp_sub_v2r8(rinvsq20,felec));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+             
+             fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+             gmx_fjsp_decrement_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0);
+             /* Inner loop uses 135 flops */
+         }
+         /* End of innermost loop */
+         gmx_fjsp_update_iforce_3atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                               f+i_coord_offset,fshift+i_shift_offset);
+         ggid                        = gid[iidx];
+         /* Update potential energies */
+         gmx_fjsp_update_1pot_v2r8(velecsum,kernel_data->energygrp_elec+ggid);
+         /* Increment number of inner iterations */
+         inneriter                  += j_index_end - j_index_start;
+         /* Outer loop uses 19 flops */
+     }
+     /* Increment number of outer iterations */
+     outeriter        += nri;
+     /* Update outer/inner flops */
+     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3_VF,outeriter*19 + inneriter*135);
+ }
+ /*
+  * Gromacs nonbonded kernel:   nb_kernel_ElecEw_VdwNone_GeomW3P1_F_sparc64_hpc_ace_double
+  * Electrostatics interaction: Ewald
+  * VdW interaction:            None
+  * Geometry:                   Water3-Particle
+  * Calculate force/pot:        Force
+  *
+  * For each of the nlist->nri outer entries the kernel handles three i sites
+  * (suffixes 0,1,2; the 3-rvec load starts at x+DIM*iinr[iidx]) and walks the
+  * j range jindex[iidx]..jindex[iidx+1], two j particles per SIMD iteration
+  * plus a one-particle remainder step when the count is odd.
+  * Every i-j pair gets a force scalar built from a two-entry lookup in the
+  * fr->ic->tabq_coul_F table; no potential energy is accumulated here.
+  *
+  * Parameters, as used by this function:
+  *   nlist       - read: nri, iinr, jindex, jjnr, shift (gid is fetched but
+  *                 never read in this variant).
+  *   xx          - coordinates, accessed as a flat real array through xx[0].
+  *   ff          - forces, accessed as a flat real array through ff[0]; only
+  *                 touched through the gmx_fjsp_decrement_* and
+  *                 gmx_fjsp_update_iforce_* helpers.
+  *   fr          - read: shift_vec, fshift (passed on to the i-force update
+  *                 helper), epsfac, ic->sh_ewald, ic->tabq_coul_F,
+  *                 ic->tabq_scale.
+  *   mdatoms     - read: chargeA.
+  *   kernel_data - not referenced in this force-only variant.
+  *   nrnb        - handed to inc_nrnb() with the flop estimate at the end.
+  */
+ void
+ nb_kernel_ElecEw_VdwNone_GeomW3P1_F_sparc64_hpc_ace_double
+                     (t_nblist * gmx_restrict                nlist,
+                      rvec * gmx_restrict                    xx,
+                      rvec * gmx_restrict                    ff,
+                      t_forcerec * gmx_restrict              fr,
+                      t_mdatoms * gmx_restrict               mdatoms,
+                      nb_kernel_data_t * gmx_restrict        kernel_data,
+                      t_nrnb * gmx_restrict                  nrnb)
+ {
+     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+      * just 0 for non-waters.
+      * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+      * jnr indices corresponding to data put in the two positions in the SIMD register.
+      *
+      * Not every local declared below is read in this variant (for example
+      * velec, velecsum, ggid, the vdw offsets, the c6/c12 entries, one and two);
+      * presumably the generator emits a shared declaration template.
+      */
+     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+     int              jnrA,jnrB;
+     int              j_coord_offsetA,j_coord_offsetB;
+     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+     real             rcutoff_scalar;
+     real             *shiftvec,*fshift,*x,*f;
+     _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+     int              vdwioffset0;
+     _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+     int              vdwioffset1;
+     _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+     int              vdwioffset2;
+     _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+     int              vdwjidx0A,vdwjidx0B;
+     _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+     _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+     _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+     _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+     _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+     real             *charge;
+     _fjsp_v2r8       ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+     real             *ewtab;
+     _fjsp_v2r8       itab_tmp;
+     _fjsp_v2r8       dummy_mask,cutoff_mask;
+     _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+     _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+     union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv; /* only ewconv is used: SIMD store, then read back as two integers */
+     x                = xx[0]; /* flat real view of the coordinate rvec array */
+     f                = ff[0]; /* flat real view of the force rvec array */
+     nri              = nlist->nri;
+     iinr             = nlist->iinr;
+     jindex           = nlist->jindex;
+     jjnr             = nlist->jjnr;
+     shiftidx         = nlist->shift;
+     gid              = nlist->gid; /* set but not read in this variant */
+     shiftvec         = fr->shift_vec[0];
+     fshift           = fr->fshift[0];
+     facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+     charge           = mdatoms->chargeA;
+     sh_ewald         = gmx_fjsp_set1_v2r8(fr->ic->sh_ewald); /* set but not read in this variant */
+     ewtab            = fr->ic->tabq_coul_F;
+     ewtabscale       = gmx_fjsp_set1_v2r8(fr->ic->tabq_scale);
+     ewtabhalfspace   = gmx_fjsp_set1_v2r8(0.5/fr->ic->tabq_scale); /* set but not read in this variant */
+     /* Setup water-specific parameters.
+      * The i-site charges are taken once, from the first list entry (iinr[0]),
+      * multiplied by facel, and reused unchanged for every outer iteration.
+      * NOTE(review): this presumably relies on all i waters in the list
+      * carrying identical charges, and iinr[0] is read even when nri is 0 -
+      * confirm against the neighbour-list construction.
+      */
+     inr              = nlist->iinr[0];
+     iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+0]));
+     iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+     iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+     /* Avoid compiler warnings: give the j indices and offsets a defined initial value */
+     jnrA = jnrB = 0;
+     j_coord_offsetA = 0;
+     j_coord_offsetB = 0;
+     outeriter        = 0;
+     inneriter        = 0;
+     /* Start outer loop over neighborlists */
+     for(iidx=0; iidx<nri; iidx++)
+     {
+         /* Load shift vector for this list (offset into the flat shiftvec/fshift arrays) */
+         i_shift_offset   = DIM*shiftidx[iidx];
+         /* Load limits for loop over neighbors: j entries [j_index_start, j_index_end) */
+         j_index_start    = jindex[iidx];
+         j_index_end      = jindex[iidx+1];
+         /* Get outer coordinate index */
+         inr              = iinr[iidx];
+         i_coord_offset   = DIM*inr;
+         /* Load i particle coords and add shift vector
+          * (done inside the helper; it fills the nine ix/iy/iz vectors for sites 0,1,2) */
+         gmx_fjsp_load_shift_and_3rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                  &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+         fix0             = _fjsp_setzero_v2r8(); /* i-site force accumulators restart at zero for every outer entry */
+         fiy0             = _fjsp_setzero_v2r8();
+         fiz0             = _fjsp_setzero_v2r8();
+         fix1             = _fjsp_setzero_v2r8();
+         fiy1             = _fjsp_setzero_v2r8();
+         fiz1             = _fjsp_setzero_v2r8();
+         fix2             = _fjsp_setzero_v2r8();
+         fiy2             = _fjsp_setzero_v2r8();
+         fiz2             = _fjsp_setzero_v2r8();
+         /* Start inner kernel loop.
+          * j entries are consumed in pairs (A,B); the loop condition stops one
+          * short of the end, so a trailing odd entry is left for the block
+          * that follows the loop.
+          */
+         for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+         {
+             /* Get j neighbor index, and coordinate index */
+             jnrA             = jjnr[jidx];
+             jnrB             = jjnr[jidx+1];
+             j_coord_offsetA  = DIM*jnrA;
+             j_coord_offsetB  = DIM*jnrB;
+             /* load j atom coordinates (one rvec from each of the two j offsets) */
+             gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                               &jx0,&jy0,&jz0);
+             /* Calculate displacement vector: i site minus j particle, for each of the three i sites */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             dx10             = _fjsp_sub_v2r8(ix1,jx0);
+             dy10             = _fjsp_sub_v2r8(iy1,jy0);
+             dz10             = _fjsp_sub_v2r8(iz1,jz0);
+             dx20             = _fjsp_sub_v2r8(ix2,jx0);
+             dy20             = _fjsp_sub_v2r8(iy2,jy0);
+             dz20             = _fjsp_sub_v2r8(iz2,jz0);
+             /* Calculate squared distance and things based on it (rinv from the invsqrt helper, rinvsq = rinv*rinv) */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+             rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+             rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+             rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+             rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+             rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+             rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+             rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+             /* Load parameters for j particles: the charges of j entries A and B.
+              * The j force accumulators start from zero for this pair.
+              */
+             jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+             fjx0             = _fjsp_setzero_v2r8();
+             fjy0             = _fjsp_setzero_v2r8();
+             fjz0             = _fjsp_setzero_v2r8();
+             /**************************
+              * CALCULATE INTERACTIONS *
+              * i site 0 with j site 0 *
+              **************************/
+             r00              = _fjsp_mul_v2r8(rsq00,rinv00); /* r = rsq * (1/r) */
+             /* Compute parameters for interactions between i and j atoms */
+             qq00             = _fjsp_mul_v2r8(iq0,jq0);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer.
+              * eweps is what remains of ewrt after subtracting the integer index.
+              * The integer vector is stored to ewconv so the two indices can be
+              * read back as ewconv.i[0] and ewconv.i[1]; each one addresses a pair
+              * of table values that ends up in ewtabF / ewtabFn.
+              * felec is then a blend of ewtabF and ewtabFn weighted by eweps -
+              * presumably (1-eweps)*ewtabF + eweps*ewtabFn, assuming
+              * madd(a,b,c)=a*b+c and nmsub(a,b,c)=c-a*b; TODO confirm against the
+              * intrinsic documentation. Finally felec = qq*rinv*(rinvsq - felec).
+              */
+             ewrt             = _fjsp_mul_v2r8(r00,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                          &ewtabF,&ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,rinv00),_fjsp_sub_v2r8(rinvsq00,felec));
+             fscal            = felec;
+             /* Update vectorial force: the same d*fscal term is accumulated into the
+              * i-site accumulators and into the shared j accumulators */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             
+             fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              * i site 1 with j site 0 *
+              **************************/
+             r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+             /* Compute parameters for interactions between i and j atoms */
+             qq10             = _fjsp_mul_v2r8(iq1,jq0);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer
+              * (same sequence as for i site 0, now with the *10 quantities) */
+             ewrt             = _fjsp_mul_v2r8(r10,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                          &ewtabF,&ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,rinv10),_fjsp_sub_v2r8(rinvsq10,felec));
+             fscal            = felec;
+             /* Update vectorial force (i site 1 accumulators, shared j accumulators) */
+             fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+             
+             fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              * i site 2 with j site 0 *
+              **************************/
+             r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+             /* Compute parameters for interactions between i and j atoms */
+             qq20             = _fjsp_mul_v2r8(iq2,jq0);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer
+              * (same sequence as for i site 0, now with the *20 quantities) */
+             ewrt             = _fjsp_mul_v2r8(r20,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                          &ewtabF,&ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,rinv20),_fjsp_sub_v2r8(rinvsq20,felec));
+             fscal            = felec;
+             /* Update vectorial force (i site 2 accumulators, shared j accumulators).
+              * After this last site the summed j terms are handed to the 2ptr
+              * decrement helper together with the force locations of j entries A and B.
+              */
+             fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+             
+             fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+             gmx_fjsp_decrement_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0);
+             /* Inner loop uses 120 flops */
+         }
+         if(jidx<j_index_end) /* odd j count: one entry is left; it is handled with the 1ptr/loadl helper variants */
+         {
+             jnrA             = jjnr[jidx];
+             j_coord_offsetA  = DIM*jnrA;
+             /* load j atom coordinates (single j offset) */
+             gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                               &jx0,&jy0,&jz0);
+             /* Calculate displacement vector: i site minus j particle, for each of the three i sites */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             dx10             = _fjsp_sub_v2r8(ix1,jx0);
+             dy10             = _fjsp_sub_v2r8(iy1,jy0);
+             dz10             = _fjsp_sub_v2r8(iz1,jz0);
+             dx20             = _fjsp_sub_v2r8(ix2,jx0);
+             dy20             = _fjsp_sub_v2r8(iy2,jy0);
+             dz20             = _fjsp_sub_v2r8(iz2,jz0);
+             /* Calculate squared distance and things based on it */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+             rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+             rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+             rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+             rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+             rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+             rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+             rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+             /* Load parameters for j particles: a single j charge, loaded with loadl
+              * on top of a zero vector. The j force accumulators start from zero.
+              */
+             jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
+             fjx0             = _fjsp_setzero_v2r8();
+             fjy0             = _fjsp_setzero_v2r8();
+             fjz0             = _fjsp_setzero_v2r8();
+             /**************************
+              * CALCULATE INTERACTIONS *
+              * i site 0 with j site 0 *
+              **************************/
+             r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+             /* Compute parameters for interactions between i and j atoms */
+             qq00             = _fjsp_mul_v2r8(iq0,jq0);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer.
+              * Same sequence as in the paired loop, except that only ewconv.i[0] is
+              * used and the table pair comes from the 1pair load helper.
+              */
+             ewrt             = _fjsp_mul_v2r8(r00,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,rinv00),_fjsp_sub_v2r8(rinvsq00,felec));
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8()); /* combined with a zero vector; presumably clears the lane that holds no j particle - TODO confirm */
+             /* Update vectorial force (i site 0 accumulators, shared j accumulators) */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             
+             fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              * i site 1 with j site 0 *
+              **************************/
+             r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+             /* Compute parameters for interactions between i and j atoms */
+             qq10             = _fjsp_mul_v2r8(iq1,jq0);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer
+              * (same sequence as for i site 0, now with the *10 quantities) */
+             ewrt             = _fjsp_mul_v2r8(r10,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,rinv10),_fjsp_sub_v2r8(rinvsq10,felec));
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force (i site 1 accumulators, shared j accumulators) */
+             fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+             
+             fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              * i site 2 with j site 0 *
+              **************************/
+             r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+             /* Compute parameters for interactions between i and j atoms */
+             qq20             = _fjsp_mul_v2r8(iq2,jq0);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer
+              * (same sequence as for i site 0, now with the *20 quantities) */
+             ewrt             = _fjsp_mul_v2r8(r20,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,rinv20),_fjsp_sub_v2r8(rinvsq20,felec));
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force (i site 2 accumulators, shared j accumulators).
+              * After this last site the summed j terms are handed to the 1ptr
+              * decrement helper together with the force location of the single j entry.
+              */
+             fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+             
+             fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+             gmx_fjsp_decrement_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0);
+             /* Inner loop uses 120 flops */
+         }
+         /* End of innermost loop.
+          * The nine i-site accumulators are passed to the i-force update helper
+          * together with the force and shift-force locations of this entry.
+          */
+         gmx_fjsp_update_iforce_3atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                               f+i_coord_offset,fshift+i_shift_offset);
+         /* Increment number of inner iterations (counted per j entry, not per SIMD pass) */
+         inneriter                  += j_index_end - j_index_start;
+         /* Outer loop uses 18 flops */
+     }
+     /* Increment number of outer iterations */
+     outeriter        += nri;
+     /* Update outer/inner flops: 18 per outer entry and 120 per j entry, as noted in the loop comments */
+     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3_F,outeriter*18 + inneriter*120);
+ }
index 0000000000000000000000000000000000000000,91814d3d7d763ea8a2628c8671f0381a5f8825fe..91814d3d7d763ea8a2628c8671f0381a5f8825fe
mode 000000,100644..100644
--- /dev/null
@@@ -1,0 -1,1989 +1,1989 @@@
+ /*
+  * This file is part of the GROMACS molecular simulation package.
+  *
+  * Copyright (c) 2012, by the GROMACS development team, led by
+  * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+  * others, as listed in the AUTHORS file in the top-level source
+  * directory and at http://www.gromacs.org.
+  *
+  * GROMACS is free software; you can redistribute it and/or
+  * modify it under the terms of the GNU Lesser General Public License
+  * as published by the Free Software Foundation; either version 2.1
+  * of the License, or (at your option) any later version.
+  *
+  * GROMACS is distributed in the hope that it will be useful,
+  * but WITHOUT ANY WARRANTY; without even the implied warranty of
+  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  * Lesser General Public License for more details.
+  *
+  * You should have received a copy of the GNU Lesser General Public
+  * License along with GROMACS; if not, see
+  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+  *
+  * If you want to redistribute modifications to GROMACS, please
+  * consider that scientific software is very special. Version
+  * control is crucial - bugs must be traceable. We will be happy to
+  * consider code for inclusion in the official distribution, but
+  * derived work must not be called official GROMACS. Details are found
+  * in the README & COPYING files - if they are missing, get the
+  * official version at http://www.gromacs.org.
+  *
+  * To help us fund GROMACS development, we humbly ask that you cite
+  * the research papers on the package. Check out http://www.gromacs.org.
+  */
+ /*
+  * Note: this file was generated by the GROMACS sparc64_hpc_ace_double kernel generator.
+  */
+ #ifdef HAVE_CONFIG_H
+ #include <config.h>
+ #endif
+ #include <math.h>
+ #include "../nb_kernel.h"
+ #include "types/simple.h"
+ #include "vec.h"
+ #include "nrnb.h"
+ #include "kernelutil_sparc64_hpc_ace_double.h"
+ /*
+  * Gromacs nonbonded kernel:   nb_kernel_ElecEw_VdwNone_GeomW3W3_VF_sparc64_hpc_ace_double
+  * Electrostatics interaction: Ewald
+  * VdW interaction:            None
+  * Geometry:                   Water3-Water3
+  * Calculate force/pot:        PotentialAndForce
+  */
+ void
+ nb_kernel_ElecEw_VdwNone_GeomW3W3_VF_sparc64_hpc_ace_double
+                     (t_nblist * gmx_restrict                nlist,
+                      rvec * gmx_restrict                    xx,
+                      rvec * gmx_restrict                    ff,
+                      t_forcerec * gmx_restrict              fr,
+                      t_mdatoms * gmx_restrict               mdatoms,
+                      nb_kernel_data_t * gmx_restrict        kernel_data,
+                      t_nrnb * gmx_restrict                  nrnb)
+ {
+     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+      * just 0 for non-waters.
+      * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+      * jnr indices corresponding to data put in the two positions in the SIMD register.
+      */
+     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+     int              jnrA,jnrB;
+     int              j_coord_offsetA,j_coord_offsetB;
+     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+     real             rcutoff_scalar;
+     real             *shiftvec,*fshift,*x,*f;
+     _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+     int              vdwioffset0;
+     _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+     int              vdwioffset1;
+     _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+     int              vdwioffset2;
+     _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+     int              vdwjidx0A,vdwjidx0B;
+     _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+     int              vdwjidx1A,vdwjidx1B;
+     _fjsp_v2r8       jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+     int              vdwjidx2A,vdwjidx2B;
+     _fjsp_v2r8       jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+     _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+     _fjsp_v2r8       dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
+     _fjsp_v2r8       dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
+     _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+     _fjsp_v2r8       dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+     _fjsp_v2r8       dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+     _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+     _fjsp_v2r8       dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+     _fjsp_v2r8       dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+     _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+     real             *charge;
+     _fjsp_v2r8       ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+     real             *ewtab;
+     _fjsp_v2r8       itab_tmp;
+     _fjsp_v2r8       dummy_mask,cutoff_mask;
+     _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+     _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+     union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+     x                = xx[0];
+     f                = ff[0];
+     nri              = nlist->nri;
+     iinr             = nlist->iinr;
+     jindex           = nlist->jindex;
+     jjnr             = nlist->jjnr;
+     shiftidx         = nlist->shift;
+     gid              = nlist->gid;
+     shiftvec         = fr->shift_vec[0];
+     fshift           = fr->fshift[0];
+     facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+     charge           = mdatoms->chargeA;
+     sh_ewald         = gmx_fjsp_set1_v2r8(fr->ic->sh_ewald);
+     ewtab            = fr->ic->tabq_coul_FDV0;
+     ewtabscale       = gmx_fjsp_set1_v2r8(fr->ic->tabq_scale);
+     ewtabhalfspace   = gmx_fjsp_set1_v2r8(0.5/fr->ic->tabq_scale);
+     /* Setup water-specific parameters */
+     inr              = nlist->iinr[0];
+     iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+0]));
+     iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+     iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+     jq0              = gmx_fjsp_set1_v2r8(charge[inr+0]);
+     jq1              = gmx_fjsp_set1_v2r8(charge[inr+1]);
+     jq2              = gmx_fjsp_set1_v2r8(charge[inr+2]);
+     qq00             = _fjsp_mul_v2r8(iq0,jq0);
+     qq01             = _fjsp_mul_v2r8(iq0,jq1);
+     qq02             = _fjsp_mul_v2r8(iq0,jq2);
+     qq10             = _fjsp_mul_v2r8(iq1,jq0);
+     qq11             = _fjsp_mul_v2r8(iq1,jq1);
+     qq12             = _fjsp_mul_v2r8(iq1,jq2);
+     qq20             = _fjsp_mul_v2r8(iq2,jq0);
+     qq21             = _fjsp_mul_v2r8(iq2,jq1);
+     qq22             = _fjsp_mul_v2r8(iq2,jq2);
+     /* Avoid stupid compiler warnings */
+     jnrA = jnrB = 0;
+     j_coord_offsetA = 0;
+     j_coord_offsetB = 0;
+     outeriter        = 0;
+     inneriter        = 0;
+     /* Start outer loop over neighborlists */
+     for(iidx=0; iidx<nri; iidx++)
+     {
+         /* Load shift vector for this list */
+         i_shift_offset   = DIM*shiftidx[iidx];
+         /* Load limits for loop over neighbors */
+         j_index_start    = jindex[iidx];
+         j_index_end      = jindex[iidx+1];
+         /* Get outer coordinate index */
+         inr              = iinr[iidx];
+         i_coord_offset   = DIM*inr;
+         /* Load i particle coords and add shift vector */
+         gmx_fjsp_load_shift_and_3rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                  &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+         fix0             = _fjsp_setzero_v2r8();
+         fiy0             = _fjsp_setzero_v2r8();
+         fiz0             = _fjsp_setzero_v2r8();
+         fix1             = _fjsp_setzero_v2r8();
+         fiy1             = _fjsp_setzero_v2r8();
+         fiz1             = _fjsp_setzero_v2r8();
+         fix2             = _fjsp_setzero_v2r8();
+         fiy2             = _fjsp_setzero_v2r8();
+         fiz2             = _fjsp_setzero_v2r8();
+         /* Reset potential sums */
+         velecsum         = _fjsp_setzero_v2r8();
+         /* Start inner kernel loop */
+         for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+         {
+             /* Get j neighbor index, and coordinate index */
+             jnrA             = jjnr[jidx];
+             jnrB             = jjnr[jidx+1];
+             j_coord_offsetA  = DIM*jnrA;
+             j_coord_offsetB  = DIM*jnrB;
+             /* load j atom coordinates */
+             gmx_fjsp_load_3rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+             /* Calculate displacement vector */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             dx01             = _fjsp_sub_v2r8(ix0,jx1);
+             dy01             = _fjsp_sub_v2r8(iy0,jy1);
+             dz01             = _fjsp_sub_v2r8(iz0,jz1);
+             dx02             = _fjsp_sub_v2r8(ix0,jx2);
+             dy02             = _fjsp_sub_v2r8(iy0,jy2);
+             dz02             = _fjsp_sub_v2r8(iz0,jz2);
+             dx10             = _fjsp_sub_v2r8(ix1,jx0);
+             dy10             = _fjsp_sub_v2r8(iy1,jy0);
+             dz10             = _fjsp_sub_v2r8(iz1,jz0);
+             dx11             = _fjsp_sub_v2r8(ix1,jx1);
+             dy11             = _fjsp_sub_v2r8(iy1,jy1);
+             dz11             = _fjsp_sub_v2r8(iz1,jz1);
+             dx12             = _fjsp_sub_v2r8(ix1,jx2);
+             dy12             = _fjsp_sub_v2r8(iy1,jy2);
+             dz12             = _fjsp_sub_v2r8(iz1,jz2);
+             dx20             = _fjsp_sub_v2r8(ix2,jx0);
+             dy20             = _fjsp_sub_v2r8(iy2,jy0);
+             dz20             = _fjsp_sub_v2r8(iz2,jz0);
+             dx21             = _fjsp_sub_v2r8(ix2,jx1);
+             dy21             = _fjsp_sub_v2r8(iy2,jy1);
+             dz21             = _fjsp_sub_v2r8(iz2,jz1);
+             dx22             = _fjsp_sub_v2r8(ix2,jx2);
+             dy22             = _fjsp_sub_v2r8(iy2,jy2);
+             dz22             = _fjsp_sub_v2r8(iz2,jz2);
+             /* Calculate squared distance and things based on it */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rsq01            = gmx_fjsp_calc_rsq_v2r8(dx01,dy01,dz01);
+             rsq02            = gmx_fjsp_calc_rsq_v2r8(dx02,dy02,dz02);
+             rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+             rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+             rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+             rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+             rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+             rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+             rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+             rinv01           = gmx_fjsp_invsqrt_v2r8(rsq01);
+             rinv02           = gmx_fjsp_invsqrt_v2r8(rsq02);
+             rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+             rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+             rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+             rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+             rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+             rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+             rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+             rinvsq01         = _fjsp_mul_v2r8(rinv01,rinv01);
+             rinvsq02         = _fjsp_mul_v2r8(rinv02,rinv02);
+             rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+             rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+             rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+             rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+             rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+             rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+             fjx0             = _fjsp_setzero_v2r8();
+             fjy0             = _fjsp_setzero_v2r8();
+             fjz0             = _fjsp_setzero_v2r8();
+             fjx1             = _fjsp_setzero_v2r8();
+             fjy1             = _fjsp_setzero_v2r8();
+             fjz1             = _fjsp_setzero_v2r8();
+             fjx2             = _fjsp_setzero_v2r8();
+             fjy2             = _fjsp_setzero_v2r8();
+             fjz2             = _fjsp_setzero_v2r8();
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r00,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq00,_fjsp_sub_v2r8(rinv00,velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,rinv00),_fjsp_sub_v2r8(rinvsq00,felec));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             
+             fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r01              = _fjsp_mul_v2r8(rsq01,rinv01);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r01,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq01,_fjsp_sub_v2r8(rinv01,velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq01,rinv01),_fjsp_sub_v2r8(rinvsq01,felec));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx01,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy01,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz01,fscal,fiz0);
+             
+             fjx1             = _fjsp_madd_v2r8(dx01,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy01,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz01,fscal,fjz1);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r02              = _fjsp_mul_v2r8(rsq02,rinv02);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r02,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq02,_fjsp_sub_v2r8(rinv02,velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq02,rinv02),_fjsp_sub_v2r8(rinvsq02,felec));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx02,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy02,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz02,fscal,fiz0);
+             
+             fjx2             = _fjsp_madd_v2r8(dx02,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy02,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz02,fscal,fjz2);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r10,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq10,_fjsp_sub_v2r8(rinv10,velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,rinv10),_fjsp_sub_v2r8(rinvsq10,felec));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+             
+             fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r11              = _fjsp_mul_v2r8(rsq11,rinv11);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r11,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq11,_fjsp_sub_v2r8(rinv11,velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq11,rinv11),_fjsp_sub_v2r8(rinvsq11,felec));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+             
+             fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r12              = _fjsp_mul_v2r8(rsq12,rinv12);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r12,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq12,_fjsp_sub_v2r8(rinv12,velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq12,rinv12),_fjsp_sub_v2r8(rinvsq12,felec));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+             
+             fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r20,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq20,_fjsp_sub_v2r8(rinv20,velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,rinv20),_fjsp_sub_v2r8(rinvsq20,felec));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+             
+             fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r21              = _fjsp_mul_v2r8(rsq21,rinv21);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r21,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq21,_fjsp_sub_v2r8(rinv21,velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq21,rinv21),_fjsp_sub_v2r8(rinvsq21,felec));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+             
+             fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r22              = _fjsp_mul_v2r8(rsq22,rinv22);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r22,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq22,_fjsp_sub_v2r8(rinv22,velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq22,rinv22),_fjsp_sub_v2r8(rinvsq22,felec));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+             
+             fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+             gmx_fjsp_decrement_3rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+             /* Inner loop uses 396 flops */
+         }
+         if(jidx<j_index_end)
+         {
+             jnrA             = jjnr[jidx];
+             j_coord_offsetA  = DIM*jnrA;
+             /* load j atom coordinates */
+             gmx_fjsp_load_3rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+             /* Calculate displacement vector */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             dx01             = _fjsp_sub_v2r8(ix0,jx1);
+             dy01             = _fjsp_sub_v2r8(iy0,jy1);
+             dz01             = _fjsp_sub_v2r8(iz0,jz1);
+             dx02             = _fjsp_sub_v2r8(ix0,jx2);
+             dy02             = _fjsp_sub_v2r8(iy0,jy2);
+             dz02             = _fjsp_sub_v2r8(iz0,jz2);
+             dx10             = _fjsp_sub_v2r8(ix1,jx0);
+             dy10             = _fjsp_sub_v2r8(iy1,jy0);
+             dz10             = _fjsp_sub_v2r8(iz1,jz0);
+             dx11             = _fjsp_sub_v2r8(ix1,jx1);
+             dy11             = _fjsp_sub_v2r8(iy1,jy1);
+             dz11             = _fjsp_sub_v2r8(iz1,jz1);
+             dx12             = _fjsp_sub_v2r8(ix1,jx2);
+             dy12             = _fjsp_sub_v2r8(iy1,jy2);
+             dz12             = _fjsp_sub_v2r8(iz1,jz2);
+             dx20             = _fjsp_sub_v2r8(ix2,jx0);
+             dy20             = _fjsp_sub_v2r8(iy2,jy0);
+             dz20             = _fjsp_sub_v2r8(iz2,jz0);
+             dx21             = _fjsp_sub_v2r8(ix2,jx1);
+             dy21             = _fjsp_sub_v2r8(iy2,jy1);
+             dz21             = _fjsp_sub_v2r8(iz2,jz1);
+             dx22             = _fjsp_sub_v2r8(ix2,jx2);
+             dy22             = _fjsp_sub_v2r8(iy2,jy2);
+             dz22             = _fjsp_sub_v2r8(iz2,jz2);
+             /* Calculate squared distance and things based on it */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rsq01            = gmx_fjsp_calc_rsq_v2r8(dx01,dy01,dz01);
+             rsq02            = gmx_fjsp_calc_rsq_v2r8(dx02,dy02,dz02);
+             rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+             rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+             rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+             rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+             rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+             rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+             rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+             rinv01           = gmx_fjsp_invsqrt_v2r8(rsq01);
+             rinv02           = gmx_fjsp_invsqrt_v2r8(rsq02);
+             rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+             rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+             rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+             rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+             rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+             rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+             rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+             rinvsq01         = _fjsp_mul_v2r8(rinv01,rinv01);
+             rinvsq02         = _fjsp_mul_v2r8(rinv02,rinv02);
+             rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+             rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+             rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+             rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+             rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+             rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+             fjx0             = _fjsp_setzero_v2r8();
+             fjy0             = _fjsp_setzero_v2r8();
+             fjz0             = _fjsp_setzero_v2r8();
+             fjx1             = _fjsp_setzero_v2r8();
+             fjy1             = _fjsp_setzero_v2r8();
+             fjz1             = _fjsp_setzero_v2r8();
+             fjx2             = _fjsp_setzero_v2r8();
+             fjy2             = _fjsp_setzero_v2r8();
+             fjz2             = _fjsp_setzero_v2r8();
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r00,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq00,_fjsp_sub_v2r8(rinv00,velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,rinv00),_fjsp_sub_v2r8(rinvsq00,felec));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             
+             fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r01              = _fjsp_mul_v2r8(rsq01,rinv01);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r01,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq01,_fjsp_sub_v2r8(rinv01,velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq01,rinv01),_fjsp_sub_v2r8(rinvsq01,felec));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx01,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy01,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz01,fscal,fiz0);
+             
+             fjx1             = _fjsp_madd_v2r8(dx01,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy01,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz01,fscal,fjz1);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r02              = _fjsp_mul_v2r8(rsq02,rinv02);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r02,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq02,_fjsp_sub_v2r8(rinv02,velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq02,rinv02),_fjsp_sub_v2r8(rinvsq02,felec));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx02,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy02,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz02,fscal,fiz0);
+             
+             fjx2             = _fjsp_madd_v2r8(dx02,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy02,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz02,fscal,fjz2);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r10,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq10,_fjsp_sub_v2r8(rinv10,velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,rinv10),_fjsp_sub_v2r8(rinvsq10,felec));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+             
+             fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r11              = _fjsp_mul_v2r8(rsq11,rinv11);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r11,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq11,_fjsp_sub_v2r8(rinv11,velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq11,rinv11),_fjsp_sub_v2r8(rinvsq11,felec));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+             
+             fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r12              = _fjsp_mul_v2r8(rsq12,rinv12);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r12,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq12,_fjsp_sub_v2r8(rinv12,velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq12,rinv12),_fjsp_sub_v2r8(rinvsq12,felec));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+             
+             fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r20,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq20,_fjsp_sub_v2r8(rinv20,velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,rinv20),_fjsp_sub_v2r8(rinvsq20,felec));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+             
+             fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r21              = _fjsp_mul_v2r8(rsq21,rinv21);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r21,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq21,_fjsp_sub_v2r8(rinv21,velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq21,rinv21),_fjsp_sub_v2r8(rinvsq21,felec));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+             
+             fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r22              = _fjsp_mul_v2r8(rsq22,rinv22);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r22,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq22,_fjsp_sub_v2r8(rinv22,velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq22,rinv22),_fjsp_sub_v2r8(rinvsq22,felec));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+             
+             fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+             gmx_fjsp_decrement_3rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+             /* Inner loop uses 396 flops */
+         }
+         /* End of innermost loop */
+         gmx_fjsp_update_iforce_3atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                               f+i_coord_offset,fshift+i_shift_offset);
+         ggid                        = gid[iidx];
+         /* Update potential energies */
+         gmx_fjsp_update_1pot_v2r8(velecsum,kernel_data->energygrp_elec+ggid);
+         /* Increment number of inner iterations */
+         inneriter                  += j_index_end - j_index_start;
+         /* Outer loop uses 19 flops */
+     }
+     /* Increment number of outer iterations */
+     outeriter        += nri;
+     /* Update outer/inner flops */
+     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3W3_VF,outeriter*19 + inneriter*396);
+ }
+ /*
+  * Gromacs nonbonded kernel:   nb_kernel_ElecEw_VdwNone_GeomW3W3_F_sparc64_hpc_ace_double
+  * Electrostatics interaction: Ewald
+  * VdW interaction:            None
+  * Geometry:                   Water3-Water3
+  * Calculate force/pot:        Force
+  */
+ void
+ nb_kernel_ElecEw_VdwNone_GeomW3W3_F_sparc64_hpc_ace_double
+                     (t_nblist * gmx_restrict                nlist,
+                      rvec * gmx_restrict                    xx,
+                      rvec * gmx_restrict                    ff,
+                      t_forcerec * gmx_restrict              fr,
+                      t_mdatoms * gmx_restrict               mdatoms,
+                      nb_kernel_data_t * gmx_restrict        kernel_data,
+                      t_nrnb * gmx_restrict                  nrnb)
+ {
+     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+      * just 0 for non-waters.
+      * Suffixes A,B refer to j loop unrolling done with double precision SIMD, i.e. the two different
+      * jnr indices corresponding to data put in the two positions in the SIMD register.
+      */
+     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+     int              jnrA,jnrB;
+     int              j_coord_offsetA,j_coord_offsetB;
+     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+     real             rcutoff_scalar;
+     real             *shiftvec,*fshift,*x,*f;
+     _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+     int              vdwioffset0;
+     _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+     int              vdwioffset1;
+     _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+     int              vdwioffset2;
+     _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+     int              vdwjidx0A,vdwjidx0B;
+     _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+     int              vdwjidx1A,vdwjidx1B;
+     _fjsp_v2r8       jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+     int              vdwjidx2A,vdwjidx2B;
+     _fjsp_v2r8       jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+     _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+     _fjsp_v2r8       dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
+     _fjsp_v2r8       dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
+     _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+     _fjsp_v2r8       dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+     _fjsp_v2r8       dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+     _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+     _fjsp_v2r8       dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+     _fjsp_v2r8       dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+     _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+     real             *charge;
+     _fjsp_v2r8       ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+     real             *ewtab;
+     _fjsp_v2r8       itab_tmp;
+     _fjsp_v2r8       dummy_mask,cutoff_mask;
+     _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+     _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+     union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+     x                = xx[0];
+     f                = ff[0];
+     nri              = nlist->nri;
+     iinr             = nlist->iinr;
+     jindex           = nlist->jindex;
+     jjnr             = nlist->jjnr;
+     shiftidx         = nlist->shift;
+     gid              = nlist->gid;
+     shiftvec         = fr->shift_vec[0];
+     fshift           = fr->fshift[0];
+     facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+     charge           = mdatoms->chargeA;
+     sh_ewald         = gmx_fjsp_set1_v2r8(fr->ic->sh_ewald);
+     ewtab            = fr->ic->tabq_coul_F;
+     ewtabscale       = gmx_fjsp_set1_v2r8(fr->ic->tabq_scale);
+     ewtabhalfspace   = gmx_fjsp_set1_v2r8(0.5/fr->ic->tabq_scale);
+     /* Setup water-specific parameters */
+     inr              = nlist->iinr[0];
+     iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+0]));
+     iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+     iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+     jq0              = gmx_fjsp_set1_v2r8(charge[inr+0]);
+     jq1              = gmx_fjsp_set1_v2r8(charge[inr+1]);
+     jq2              = gmx_fjsp_set1_v2r8(charge[inr+2]);
+     qq00             = _fjsp_mul_v2r8(iq0,jq0);
+     qq01             = _fjsp_mul_v2r8(iq0,jq1);
+     qq02             = _fjsp_mul_v2r8(iq0,jq2);
+     qq10             = _fjsp_mul_v2r8(iq1,jq0);
+     qq11             = _fjsp_mul_v2r8(iq1,jq1);
+     qq12             = _fjsp_mul_v2r8(iq1,jq2);
+     qq20             = _fjsp_mul_v2r8(iq2,jq0);
+     qq21             = _fjsp_mul_v2r8(iq2,jq1);
+     qq22             = _fjsp_mul_v2r8(iq2,jq2);
+     /* Avoid stupid compiler warnings */
+     jnrA = jnrB = 0;
+     j_coord_offsetA = 0;
+     j_coord_offsetB = 0;
+     outeriter        = 0;
+     inneriter        = 0;
+     /* Start outer loop over neighborlists */
+     for(iidx=0; iidx<nri; iidx++)
+     {
+         /* Load shift vector for this list */
+         i_shift_offset   = DIM*shiftidx[iidx];
+         /* Load limits for loop over neighbors */
+         j_index_start    = jindex[iidx];
+         j_index_end      = jindex[iidx+1];
+         /* Get outer coordinate index */
+         inr              = iinr[iidx];
+         i_coord_offset   = DIM*inr;
+         /* Load i particle coords and add shift vector */
+         gmx_fjsp_load_shift_and_3rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                  &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+         fix0             = _fjsp_setzero_v2r8();
+         fiy0             = _fjsp_setzero_v2r8();
+         fiz0             = _fjsp_setzero_v2r8();
+         fix1             = _fjsp_setzero_v2r8();
+         fiy1             = _fjsp_setzero_v2r8();
+         fiz1             = _fjsp_setzero_v2r8();
+         fix2             = _fjsp_setzero_v2r8();
+         fiy2             = _fjsp_setzero_v2r8();
+         fiz2             = _fjsp_setzero_v2r8();
+         /* Start inner kernel loop */
+         for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+         {
+             /* Get j neighbor index, and coordinate index */
+             jnrA             = jjnr[jidx];
+             jnrB             = jjnr[jidx+1];
+             j_coord_offsetA  = DIM*jnrA;
+             j_coord_offsetB  = DIM*jnrB;
+             /* load j atom coordinates */
+             gmx_fjsp_load_3rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+             /* Calculate displacement vector */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             dx01             = _fjsp_sub_v2r8(ix0,jx1);
+             dy01             = _fjsp_sub_v2r8(iy0,jy1);
+             dz01             = _fjsp_sub_v2r8(iz0,jz1);
+             dx02             = _fjsp_sub_v2r8(ix0,jx2);
+             dy02             = _fjsp_sub_v2r8(iy0,jy2);
+             dz02             = _fjsp_sub_v2r8(iz0,jz2);
+             dx10             = _fjsp_sub_v2r8(ix1,jx0);
+             dy10             = _fjsp_sub_v2r8(iy1,jy0);
+             dz10             = _fjsp_sub_v2r8(iz1,jz0);
+             dx11             = _fjsp_sub_v2r8(ix1,jx1);
+             dy11             = _fjsp_sub_v2r8(iy1,jy1);
+             dz11             = _fjsp_sub_v2r8(iz1,jz1);
+             dx12             = _fjsp_sub_v2r8(ix1,jx2);
+             dy12             = _fjsp_sub_v2r8(iy1,jy2);
+             dz12             = _fjsp_sub_v2r8(iz1,jz2);
+             dx20             = _fjsp_sub_v2r8(ix2,jx0);
+             dy20             = _fjsp_sub_v2r8(iy2,jy0);
+             dz20             = _fjsp_sub_v2r8(iz2,jz0);
+             dx21             = _fjsp_sub_v2r8(ix2,jx1);
+             dy21             = _fjsp_sub_v2r8(iy2,jy1);
+             dz21             = _fjsp_sub_v2r8(iz2,jz1);
+             dx22             = _fjsp_sub_v2r8(ix2,jx2);
+             dy22             = _fjsp_sub_v2r8(iy2,jy2);
+             dz22             = _fjsp_sub_v2r8(iz2,jz2);
+             /* Calculate squared distance and things based on it */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rsq01            = gmx_fjsp_calc_rsq_v2r8(dx01,dy01,dz01);
+             rsq02            = gmx_fjsp_calc_rsq_v2r8(dx02,dy02,dz02);
+             rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+             rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+             rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+             rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+             rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+             rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+             rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+             rinv01           = gmx_fjsp_invsqrt_v2r8(rsq01);
+             rinv02           = gmx_fjsp_invsqrt_v2r8(rsq02);
+             rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+             rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+             rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+             rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+             rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+             rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+             rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+             rinvsq01         = _fjsp_mul_v2r8(rinv01,rinv01);
+             rinvsq02         = _fjsp_mul_v2r8(rinv02,rinv02);
+             rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+             rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+             rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+             rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+             rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+             rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+             fjx0             = _fjsp_setzero_v2r8();
+             fjy0             = _fjsp_setzero_v2r8();
+             fjz0             = _fjsp_setzero_v2r8();
+             fjx1             = _fjsp_setzero_v2r8();
+             fjy1             = _fjsp_setzero_v2r8();
+             fjz1             = _fjsp_setzero_v2r8();
+             fjx2             = _fjsp_setzero_v2r8();
+             fjy2             = _fjsp_setzero_v2r8();
+             fjz2             = _fjsp_setzero_v2r8();
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r00,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                          &ewtabF,&ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,rinv00),_fjsp_sub_v2r8(rinvsq00,felec));
+             fscal            = felec;
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             
+             fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r01              = _fjsp_mul_v2r8(rsq01,rinv01);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r01,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                          &ewtabF,&ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq01,rinv01),_fjsp_sub_v2r8(rinvsq01,felec));
+             fscal            = felec;
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx01,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy01,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz01,fscal,fiz0);
+             
+             fjx1             = _fjsp_madd_v2r8(dx01,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy01,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz01,fscal,fjz1);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r02              = _fjsp_mul_v2r8(rsq02,rinv02);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r02,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                          &ewtabF,&ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq02,rinv02),_fjsp_sub_v2r8(rinvsq02,felec));
+             fscal            = felec;
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx02,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy02,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz02,fscal,fiz0);
+             
+             fjx2             = _fjsp_madd_v2r8(dx02,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy02,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz02,fscal,fjz2);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r10,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                          &ewtabF,&ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,rinv10),_fjsp_sub_v2r8(rinvsq10,felec));
+             fscal            = felec;
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+             
+             fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r11              = _fjsp_mul_v2r8(rsq11,rinv11);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r11,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                          &ewtabF,&ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq11,rinv11),_fjsp_sub_v2r8(rinvsq11,felec));
+             fscal            = felec;
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+             
+             fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r12              = _fjsp_mul_v2r8(rsq12,rinv12);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r12,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                          &ewtabF,&ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq12,rinv12),_fjsp_sub_v2r8(rinvsq12,felec));
+             fscal            = felec;
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+             
+             fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r20,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                          &ewtabF,&ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,rinv20),_fjsp_sub_v2r8(rinvsq20,felec));
+             fscal            = felec;
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+             
+             fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r21              = _fjsp_mul_v2r8(rsq21,rinv21);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r21,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                          &ewtabF,&ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq21,rinv21),_fjsp_sub_v2r8(rinvsq21,felec));
+             fscal            = felec;
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+             
+             fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r22              = _fjsp_mul_v2r8(rsq22,rinv22);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r22,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                          &ewtabF,&ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq22,rinv22),_fjsp_sub_v2r8(rinvsq22,felec));
+             fscal            = felec;
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+             
+             fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+             gmx_fjsp_decrement_3rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+             /* Inner loop uses 351 flops */
+         }
+         if(jidx<j_index_end)
+         {
+             jnrA             = jjnr[jidx];
+             j_coord_offsetA  = DIM*jnrA;
+             /* load j atom coordinates */
+             gmx_fjsp_load_3rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+             /* Calculate displacement vector */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             dx01             = _fjsp_sub_v2r8(ix0,jx1);
+             dy01             = _fjsp_sub_v2r8(iy0,jy1);
+             dz01             = _fjsp_sub_v2r8(iz0,jz1);
+             dx02             = _fjsp_sub_v2r8(ix0,jx2);
+             dy02             = _fjsp_sub_v2r8(iy0,jy2);
+             dz02             = _fjsp_sub_v2r8(iz0,jz2);
+             dx10             = _fjsp_sub_v2r8(ix1,jx0);
+             dy10             = _fjsp_sub_v2r8(iy1,jy0);
+             dz10             = _fjsp_sub_v2r8(iz1,jz0);
+             dx11             = _fjsp_sub_v2r8(ix1,jx1);
+             dy11             = _fjsp_sub_v2r8(iy1,jy1);
+             dz11             = _fjsp_sub_v2r8(iz1,jz1);
+             dx12             = _fjsp_sub_v2r8(ix1,jx2);
+             dy12             = _fjsp_sub_v2r8(iy1,jy2);
+             dz12             = _fjsp_sub_v2r8(iz1,jz2);
+             dx20             = _fjsp_sub_v2r8(ix2,jx0);
+             dy20             = _fjsp_sub_v2r8(iy2,jy0);
+             dz20             = _fjsp_sub_v2r8(iz2,jz0);
+             dx21             = _fjsp_sub_v2r8(ix2,jx1);
+             dy21             = _fjsp_sub_v2r8(iy2,jy1);
+             dz21             = _fjsp_sub_v2r8(iz2,jz1);
+             dx22             = _fjsp_sub_v2r8(ix2,jx2);
+             dy22             = _fjsp_sub_v2r8(iy2,jy2);
+             dz22             = _fjsp_sub_v2r8(iz2,jz2);
+             /* Calculate squared distance and things based on it */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rsq01            = gmx_fjsp_calc_rsq_v2r8(dx01,dy01,dz01);
+             rsq02            = gmx_fjsp_calc_rsq_v2r8(dx02,dy02,dz02);
+             rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+             rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+             rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+             rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+             rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+             rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+             rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+             rinv01           = gmx_fjsp_invsqrt_v2r8(rsq01);
+             rinv02           = gmx_fjsp_invsqrt_v2r8(rsq02);
+             rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+             rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+             rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+             rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+             rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+             rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+             rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+             rinvsq01         = _fjsp_mul_v2r8(rinv01,rinv01);
+             rinvsq02         = _fjsp_mul_v2r8(rinv02,rinv02);
+             rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+             rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+             rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+             rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+             rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+             rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+             fjx0             = _fjsp_setzero_v2r8();
+             fjy0             = _fjsp_setzero_v2r8();
+             fjz0             = _fjsp_setzero_v2r8();
+             fjx1             = _fjsp_setzero_v2r8();
+             fjy1             = _fjsp_setzero_v2r8();
+             fjz1             = _fjsp_setzero_v2r8();
+             fjx2             = _fjsp_setzero_v2r8();
+             fjy2             = _fjsp_setzero_v2r8();
+             fjz2             = _fjsp_setzero_v2r8();
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r00,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,rinv00),_fjsp_sub_v2r8(rinvsq00,felec));
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             
+             fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r01              = _fjsp_mul_v2r8(rsq01,rinv01);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r01,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq01,rinv01),_fjsp_sub_v2r8(rinvsq01,felec));
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx01,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy01,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz01,fscal,fiz0);
+             
+             fjx1             = _fjsp_madd_v2r8(dx01,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy01,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz01,fscal,fjz1);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r02              = _fjsp_mul_v2r8(rsq02,rinv02);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r02,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq02,rinv02),_fjsp_sub_v2r8(rinvsq02,felec));
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx02,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy02,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz02,fscal,fiz0);
+             
+             fjx2             = _fjsp_madd_v2r8(dx02,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy02,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz02,fscal,fjz2);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r10,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,rinv10),_fjsp_sub_v2r8(rinvsq10,felec));
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+             
+             fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r11              = _fjsp_mul_v2r8(rsq11,rinv11);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r11,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq11,rinv11),_fjsp_sub_v2r8(rinvsq11,felec));
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+             
+             fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r12              = _fjsp_mul_v2r8(rsq12,rinv12);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r12,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq12,rinv12),_fjsp_sub_v2r8(rinvsq12,felec));
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+             
+             fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r20,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,rinv20),_fjsp_sub_v2r8(rinvsq20,felec));
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+             
+             fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r21              = _fjsp_mul_v2r8(rsq21,rinv21);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r21,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq21,rinv21),_fjsp_sub_v2r8(rinvsq21,felec));
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+             
+             fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r22              = _fjsp_mul_v2r8(rsq22,rinv22);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r22,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq22,rinv22),_fjsp_sub_v2r8(rinvsq22,felec));
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+             
+             fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+             gmx_fjsp_decrement_3rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+             /* Inner loop uses 351 flops */
+         }
+         /* End of innermost loop */
+         gmx_fjsp_update_iforce_3atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                               f+i_coord_offset,fshift+i_shift_offset);
+         /* Increment number of inner iterations */
+         inneriter                  += j_index_end - j_index_start;
+         /* Outer loop uses 18 flops */
+     }
+     /* Increment number of outer iterations */
+     outeriter        += nri;
+     /* Update outer/inner flops */
+     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3W3_F,outeriter*18 + inneriter*351);
+ }
index 0000000000000000000000000000000000000000,c78756658fb882a73730177e40a1c5736350f8fb..c78756658fb882a73730177e40a1c5736350f8fb
mode 000000,100644..100644
--- /dev/null
@@@ -1,0 -1,971 +1,971 @@@
+ /*
+  * This file is part of the GROMACS molecular simulation package.
+  *
+  * Copyright (c) 2012, by the GROMACS development team, led by
+  * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+  * others, as listed in the AUTHORS file in the top-level source
+  * directory and at http://www.gromacs.org.
+  *
+  * GROMACS is free software; you can redistribute it and/or
+  * modify it under the terms of the GNU Lesser General Public License
+  * as published by the Free Software Foundation; either version 2.1
+  * of the License, or (at your option) any later version.
+  *
+  * GROMACS is distributed in the hope that it will be useful,
+  * but WITHOUT ANY WARRANTY; without even the implied warranty of
+  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  * Lesser General Public License for more details.
+  *
+  * You should have received a copy of the GNU Lesser General Public
+  * License along with GROMACS; if not, see
+  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+  *
+  * If you want to redistribute modifications to GROMACS, please
+  * consider that scientific software is very special. Version
+  * control is crucial - bugs must be traceable. We will be happy to
+  * consider code for inclusion in the official distribution, but
+  * derived work must not be called official GROMACS. Details are found
+  * in the README & COPYING files - if they are missing, get the
+  * official version at http://www.gromacs.org.
+  *
+  * To help us fund GROMACS development, we humbly ask that you cite
+  * the research papers on the package. Check out http://www.gromacs.org.
+  */
+ /*
+  * Note: this file was generated by the GROMACS sparc64_hpc_ace_double kernel generator.
+  */
+ #ifdef HAVE_CONFIG_H
+ #include <config.h>
+ #endif
+ #include <math.h>
+ #include "../nb_kernel.h"
+ #include "types/simple.h"
+ #include "vec.h"
+ #include "nrnb.h"
+ #include "kernelutil_sparc64_hpc_ace_double.h"
+ /*
+  * Gromacs nonbonded kernel:   nb_kernel_ElecEw_VdwNone_GeomW4P1_VF_sparc64_hpc_ace_double
+  * Electrostatics interaction: Ewald
+  * VdW interaction:            None
+  * Geometry:                   Water4-Particle
+  * Calculate force/pot:        PotentialAndForce
+  *
+  * For every entry of the neighbor list this kernel accumulates the table-based
+  * Ewald electrostatic potential and force between i atoms 1..3 (the atoms at
+  * offsets +1,+2,+3 from the i index; the atom at offset 0 is neither read nor
+  * written here) and the j particles of that entry, two j particles per SIMD pass.
+  *
+  * nlist       - neighbor list; nri, iinr, jindex, jjnr, shift and gid are read.
+  * xx          - coordinates (read only in this kernel).
+  * ff          - forces; handed to the i-force update and j-force decrement helpers.
+  * fr          - supplies shift_vec, fshift, epsfac and the ic-> Ewald table data.
+  * mdatoms     - supplies the chargeA array.
+  * kernel_data - energygrp_elec is updated with the summed potential per group id.
+  * nrnb        - flop accounting, updated through inc_nrnb().
+  */
+ void
+ nb_kernel_ElecEw_VdwNone_GeomW4P1_VF_sparc64_hpc_ace_double
+                     (t_nblist * gmx_restrict                nlist,
+                      rvec * gmx_restrict                    xx,
+                      rvec * gmx_restrict                    ff,
+                      t_forcerec * gmx_restrict              fr,
+                      t_mdatoms * gmx_restrict               mdatoms,
+                      nb_kernel_data_t * gmx_restrict        kernel_data,
+                      t_nrnb * gmx_restrict                  nrnb)
+ {
+     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+      * just 0 for non-waters.
+      * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+      * jnr indices corresponding to data put in the two positions in the SIMD register.
+      * NOTE(review): several locals declared below (e.g. rcutoff_scalar, tx/ty/tz, vdwioffset1-3,
+      * one, two, vfconv, gbconv) are never referenced in this kernel, and sh_ewald is only assigned;
+      * the file header says this is generator output, so they are presumably template leftovers.
+      */
+     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+     int              jnrA,jnrB;
+     int              j_coord_offsetA,j_coord_offsetB;
+     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+     real             rcutoff_scalar;
+     real             *shiftvec,*fshift,*x,*f;
+     _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+     int              vdwioffset1;
+     _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+     int              vdwioffset2;
+     _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+     int              vdwioffset3;
+     _fjsp_v2r8       ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+     int              vdwjidx0A,vdwjidx0B;
+     _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+     _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+     _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+     _fjsp_v2r8       dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30;
+     _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+     real             *charge;
+     _fjsp_v2r8       ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+     real             *ewtab;
+     _fjsp_v2r8       itab_tmp;
+     _fjsp_v2r8       dummy_mask,cutoff_mask;
+     _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+     _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+     union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv; /* only ewconv is used: SIMD value <-> two integer table indices */
+     x                = xx[0]; /* flat real pointers to the first coordinate / force component */
+     f                = ff[0];
+     nri              = nlist->nri;
+     iinr             = nlist->iinr;
+     jindex           = nlist->jindex;
+     jjnr             = nlist->jjnr;
+     shiftidx         = nlist->shift;
+     gid              = nlist->gid;
+     shiftvec         = fr->shift_vec[0];
+     fshift           = fr->fshift[0];
+     facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+     charge           = mdatoms->chargeA;
+     sh_ewald         = gmx_fjsp_set1_v2r8(fr->ic->sh_ewald);
+     ewtab            = fr->ic->tabq_coul_FDV0; /* addressed below as ewtab + 4*index */
+     ewtabscale       = gmx_fjsp_set1_v2r8(fr->ic->tabq_scale);
+     ewtabhalfspace   = gmx_fjsp_set1_v2r8(0.5/fr->ic->tabq_scale);
+     /* Setup water-specific parameters: the charges of i atoms 1..3 are taken from the first
+      * list entry, pre-multiplied by facel, and reused unchanged for every i entry below.
+      * NOTE(review): iinr[0] is read before the nri loop, i.e. even when nri is 0 - confirm
+      * that callers always provide at least one valid entry.
+      */
+     inr              = nlist->iinr[0];
+     iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+     iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+     iq3              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+3]));
+     /* Initialize so the compiler does not warn about possibly uninitialized use */
+     jnrA = jnrB = 0;
+     j_coord_offsetA = 0;
+     j_coord_offsetB = 0;
+     outeriter        = 0;
+     inneriter        = 0;
+     /* Start outer loop over neighborlists */
+     for(iidx=0; iidx<nri; iidx++)
+     {
+         /* Load shift vector for this list */
+         i_shift_offset   = DIM*shiftidx[iidx];
+         /* Load limits for loop over neighbors */
+         j_index_start    = jindex[iidx];
+         j_index_end      = jindex[iidx+1];
+         /* Get outer coordinate index */
+         inr              = iinr[iidx];
+         i_coord_offset   = DIM*inr;
+         /* Load i particle coords and add shift vector (three atoms, starting DIM reals past the first i atom) */
+         gmx_fjsp_load_shift_and_3rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset+DIM,
+                                                  &ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+         fix1             = _fjsp_setzero_v2r8();
+         fiy1             = _fjsp_setzero_v2r8();
+         fiz1             = _fjsp_setzero_v2r8();
+         fix2             = _fjsp_setzero_v2r8();
+         fiy2             = _fjsp_setzero_v2r8();
+         fiz2             = _fjsp_setzero_v2r8();
+         fix3             = _fjsp_setzero_v2r8();
+         fiy3             = _fjsp_setzero_v2r8();
+         fiz3             = _fjsp_setzero_v2r8();
+         /* Reset potential sums */
+         velecsum         = _fjsp_setzero_v2r8();
+         /* Start inner kernel loop: two j particles (A,B) per iteration; an odd leftover
+          * j particle is handled by the block after this loop.
+          */
+         for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+         {
+             /* Get j neighbor index, and coordinate index */
+             jnrA             = jjnr[jidx];
+             jnrB             = jjnr[jidx+1];
+             j_coord_offsetA  = DIM*jnrA;
+             j_coord_offsetB  = DIM*jnrB;
+             /* load j atom coordinates */
+             gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                               &jx0,&jy0,&jz0);
+             /* Calculate displacement vector */
+             dx10             = _fjsp_sub_v2r8(ix1,jx0);
+             dy10             = _fjsp_sub_v2r8(iy1,jy0);
+             dz10             = _fjsp_sub_v2r8(iz1,jz0);
+             dx20             = _fjsp_sub_v2r8(ix2,jx0);
+             dy20             = _fjsp_sub_v2r8(iy2,jy0);
+             dz20             = _fjsp_sub_v2r8(iz2,jz0);
+             dx30             = _fjsp_sub_v2r8(ix3,jx0);
+             dy30             = _fjsp_sub_v2r8(iy3,jy0);
+             dz30             = _fjsp_sub_v2r8(iz3,jz0);
+             /* Calculate squared distance and things based on it */
+             rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+             rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+             rsq30            = gmx_fjsp_calc_rsq_v2r8(dx30,dy30,dz30);
+             rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+             rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+             rinv30           = gmx_fjsp_invsqrt_v2r8(rsq30);
+             rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+             rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+             rinvsq30         = _fjsp_mul_v2r8(rinv30,rinv30);
+             /* Load parameters for j particles */
+             jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+             fjx0             = _fjsp_setzero_v2r8();
+             fjy0             = _fjsp_setzero_v2r8();
+             fjz0             = _fjsp_setzero_v2r8();
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+             /* Compute parameters for interactions between i and j atoms */
+             qq10             = _fjsp_mul_v2r8(iq1,jq0);
+             /* EWALD ELECTROSTATICS
+              * Each lane's table index is used as ewtab + 4*index; going by the variable names the
+              * values fetched at offsets 0,1 are F and D and the one at offset 2 is V (TODO confirm
+              * the layout against the code that fills tabq_coul_FDV0).
+              * felec = ewtabF + eweps*ewtabD is then turned into qq*rinv*(rinvsq - felec), and
+              * velec into qq*(rinv - table potential term).
+              * (assumes _fjsp_madd(a,b,c) = a*b+c and _fjsp_nmsub(a,b,c) = c-a*b - confirm)
+              */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r10,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp); /* spill so the two indices can be read as ewconv.i[0] and ewconv.i[1] */
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq10,_fjsp_sub_v2r8(rinv10,velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,rinv10),_fjsp_sub_v2r8(rinvsq10,felec));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             /* Update vectorial force: the same d*fscal product is added to the i-atom and the j-particle accumulators */
+             fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+             
+             fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+             /* Compute parameters for interactions between i and j atoms */
+             qq20             = _fjsp_mul_v2r8(iq2,jq0);
+             /* EWALD ELECTROSTATICS (same sequence of operations as for i atom 1 above) */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r20,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq20,_fjsp_sub_v2r8(rinv20,velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,rinv20),_fjsp_sub_v2r8(rinvsq20,felec));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+             
+             fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r30              = _fjsp_mul_v2r8(rsq30,rinv30);
+             /* Compute parameters for interactions between i and j atoms */
+             qq30             = _fjsp_mul_v2r8(iq3,jq0);
+             /* EWALD ELECTROSTATICS (same sequence of operations as for i atom 1 above) */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r30,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq30,_fjsp_sub_v2r8(rinv30,velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq30,rinv30),_fjsp_sub_v2r8(rinvsq30,felec));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             /* Update vectorial force */
+             fix3             = _fjsp_madd_v2r8(dx30,fscal,fix3);
+             fiy3             = _fjsp_madd_v2r8(dy30,fscal,fiy3);
+             fiz3             = _fjsp_madd_v2r8(dz30,fscal,fiz3);
+             
+             fjx0             = _fjsp_madd_v2r8(dx30,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy30,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz30,fscal,fjz0);
+             gmx_fjsp_decrement_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0); /* apply the accumulated fj to the force entries of j particles A and B */
+             /* Inner loop uses 135 flops */
+         }
+         if(jidx<j_index_end) /* true only for an odd number of j particles: process the last one alone */
+         {
+             jnrA             = jjnr[jidx];
+             j_coord_offsetA  = DIM*jnrA;
+             /* load j atom coordinates */
+             gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                               &jx0,&jy0,&jz0);
+             /* Calculate displacement vector */
+             dx10             = _fjsp_sub_v2r8(ix1,jx0);
+             dy10             = _fjsp_sub_v2r8(iy1,jy0);
+             dz10             = _fjsp_sub_v2r8(iz1,jz0);
+             dx20             = _fjsp_sub_v2r8(ix2,jx0);
+             dy20             = _fjsp_sub_v2r8(iy2,jy0);
+             dz20             = _fjsp_sub_v2r8(iz2,jz0);
+             dx30             = _fjsp_sub_v2r8(ix3,jx0);
+             dy30             = _fjsp_sub_v2r8(iy3,jy0);
+             dz30             = _fjsp_sub_v2r8(iz3,jz0);
+             /* Calculate squared distance and things based on it */
+             rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+             rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+             rsq30            = gmx_fjsp_calc_rsq_v2r8(dx30,dy30,dz30);
+             rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+             rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+             rinv30           = gmx_fjsp_invsqrt_v2r8(rsq30);
+             rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+             rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+             rinvsq30         = _fjsp_mul_v2r8(rinv30,rinv30);
+             /* Load parameters for j particles (single charge; the other operand of the load is zero) */
+             jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
+             fjx0             = _fjsp_setzero_v2r8();
+             fjy0             = _fjsp_setzero_v2r8();
+             fjz0             = _fjsp_setzero_v2r8();
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+             /* Compute parameters for interactions between i and j atoms */
+             qq10             = _fjsp_mul_v2r8(iq1,jq0);
+             /* EWALD ELECTROSTATICS
+              * Same arithmetic as in the paired loop, but only table index ewconv.i[0] is used;
+              * the registers that held the second particle's table data are zero-filled instead.
+              */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r10,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq10,_fjsp_sub_v2r8(rinv10,velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,rinv10),_fjsp_sub_v2r8(rinvsq10,felec));
+             /* Update potential sum for this i atom from the interaction with this j atom.
+              * The unpacklo with a zero register presumably clears the lane that has no j particle - confirm.
+              */
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+             
+             fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+             /* Compute parameters for interactions between i and j atoms */
+             qq20             = _fjsp_mul_v2r8(iq2,jq0);
+             /* EWALD ELECTROSTATICS (single-particle variant, as for i atom 1 above) */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r20,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq20,_fjsp_sub_v2r8(rinv20,velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,rinv20),_fjsp_sub_v2r8(rinvsq20,felec));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+             
+             fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r30              = _fjsp_mul_v2r8(rsq30,rinv30);
+             /* Compute parameters for interactions between i and j atoms */
+             qq30             = _fjsp_mul_v2r8(iq3,jq0);
+             /* EWALD ELECTROSTATICS (single-particle variant, as for i atom 1 above) */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r30,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq30,_fjsp_sub_v2r8(rinv30,velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq30,rinv30),_fjsp_sub_v2r8(rinvsq30,felec));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix3             = _fjsp_madd_v2r8(dx30,fscal,fix3);
+             fiy3             = _fjsp_madd_v2r8(dy30,fscal,fiy3);
+             fiz3             = _fjsp_madd_v2r8(dz30,fscal,fiz3);
+             
+             fjx0             = _fjsp_madd_v2r8(dx30,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy30,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz30,fscal,fjz0);
+             gmx_fjsp_decrement_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0); /* apply the accumulated fj to the force entry of j particle A */
+             /* Inner loop uses 135 flops */
+         }
+         /* End of innermost loop: hand the accumulated forces of i atoms 1..3 (and the shift-force
+          * slot of this entry) to the i-force update helper.
+          */
+         gmx_fjsp_update_iforce_3atom_swizzle_v2r8(fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                               f+i_coord_offset+DIM,fshift+i_shift_offset);
+         ggid                        = gid[iidx];
+         /* Update potential energies: velecsum goes to the energygrp_elec slot selected by gid[iidx] */
+         gmx_fjsp_update_1pot_v2r8(velecsum,kernel_data->energygrp_elec+ggid);
+         /* Increment number of inner iterations */
+         inneriter                  += j_index_end - j_index_start;
+         /* Outer loop uses 19 flops */
+     }
+     /* Increment number of outer iterations */
+     outeriter        += nri;
+     /* Update outer/inner flops: 19 per outer iteration plus 135 per j particle processed */
+     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4_VF,outeriter*19 + inneriter*135);
+ }
+ /*
+  * Gromacs nonbonded kernel:   nb_kernel_ElecEw_VdwNone_GeomW4P1_F_sparc64_hpc_ace_double
+  * Electrostatics interaction: Ewald
+  * VdW interaction:            None
+  * Geometry:                   Water4-Particle
+  * Calculate force/pot:        Force
+  */
+ void
+ nb_kernel_ElecEw_VdwNone_GeomW4P1_F_sparc64_hpc_ace_double
+                     (t_nblist * gmx_restrict                nlist,
+                      rvec * gmx_restrict                    xx,
+                      rvec * gmx_restrict                    ff,
+                      t_forcerec * gmx_restrict              fr,
+                      t_mdatoms * gmx_restrict               mdatoms,
+                      nb_kernel_data_t * gmx_restrict        kernel_data,
+                      t_nrnb * gmx_restrict                  nrnb)
+ {
+     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+      * just 0 for non-waters.
+      * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+      * jnr indices corresponding to data put in the two positions in the SIMD register.
+      */
+     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+     int              jnrA,jnrB;
+     int              j_coord_offsetA,j_coord_offsetB;
+     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+     real             rcutoff_scalar;
+     real             *shiftvec,*fshift,*x,*f;
+     _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+     int              vdwioffset1;
+     _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+     int              vdwioffset2;
+     _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+     int              vdwioffset3;
+     _fjsp_v2r8       ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+     int              vdwjidx0A,vdwjidx0B;
+     _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+     _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+     _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+     _fjsp_v2r8       dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30;
+     _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+     real             *charge;
+     _fjsp_v2r8       ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+     real             *ewtab;
+     _fjsp_v2r8       itab_tmp;
+     _fjsp_v2r8       dummy_mask,cutoff_mask;
+     _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+     _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+     union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+     x                = xx[0];
+     f                = ff[0];
+     nri              = nlist->nri;
+     iinr             = nlist->iinr;
+     jindex           = nlist->jindex;
+     jjnr             = nlist->jjnr;
+     shiftidx         = nlist->shift;
+     gid              = nlist->gid;
+     shiftvec         = fr->shift_vec[0];
+     fshift           = fr->fshift[0];
+     facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+     charge           = mdatoms->chargeA;
+     sh_ewald         = gmx_fjsp_set1_v2r8(fr->ic->sh_ewald);
+     ewtab            = fr->ic->tabq_coul_F;
+     ewtabscale       = gmx_fjsp_set1_v2r8(fr->ic->tabq_scale);
+     ewtabhalfspace   = gmx_fjsp_set1_v2r8(0.5/fr->ic->tabq_scale);
+     /* Setup water-specific parameters */
+     inr              = nlist->iinr[0];
+     iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+     iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+     iq3              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+3]));
+     /* Avoid stupid compiler warnings */
+     jnrA = jnrB = 0;
+     j_coord_offsetA = 0;
+     j_coord_offsetB = 0;
+     outeriter        = 0;
+     inneriter        = 0;
+     /* Start outer loop over neighborlists */
+     for(iidx=0; iidx<nri; iidx++)
+     {
+         /* Load shift vector for this list */
+         i_shift_offset   = DIM*shiftidx[iidx];
+         /* Load limits for loop over neighbors */
+         j_index_start    = jindex[iidx];
+         j_index_end      = jindex[iidx+1];
+         /* Get outer coordinate index */
+         inr              = iinr[iidx];
+         i_coord_offset   = DIM*inr;
+         /* Load i particle coords and add shift vector */
+         gmx_fjsp_load_shift_and_3rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset+DIM,
+                                                  &ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+         fix1             = _fjsp_setzero_v2r8();
+         fiy1             = _fjsp_setzero_v2r8();
+         fiz1             = _fjsp_setzero_v2r8();
+         fix2             = _fjsp_setzero_v2r8();
+         fiy2             = _fjsp_setzero_v2r8();
+         fiz2             = _fjsp_setzero_v2r8();
+         fix3             = _fjsp_setzero_v2r8();
+         fiy3             = _fjsp_setzero_v2r8();
+         fiz3             = _fjsp_setzero_v2r8();
+         /* Start inner kernel loop */
+         for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+         {
+             /* Get j neighbor index, and coordinate index */
+             jnrA             = jjnr[jidx];
+             jnrB             = jjnr[jidx+1];
+             j_coord_offsetA  = DIM*jnrA;
+             j_coord_offsetB  = DIM*jnrB;
+             /* load j atom coordinates */
+             gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                               &jx0,&jy0,&jz0);
+             /* Calculate displacement vector */
+             dx10             = _fjsp_sub_v2r8(ix1,jx0);
+             dy10             = _fjsp_sub_v2r8(iy1,jy0);
+             dz10             = _fjsp_sub_v2r8(iz1,jz0);
+             dx20             = _fjsp_sub_v2r8(ix2,jx0);
+             dy20             = _fjsp_sub_v2r8(iy2,jy0);
+             dz20             = _fjsp_sub_v2r8(iz2,jz0);
+             dx30             = _fjsp_sub_v2r8(ix3,jx0);
+             dy30             = _fjsp_sub_v2r8(iy3,jy0);
+             dz30             = _fjsp_sub_v2r8(iz3,jz0);
+             /* Calculate squared distance and things based on it */
+             rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+             rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+             rsq30            = gmx_fjsp_calc_rsq_v2r8(dx30,dy30,dz30);
+             rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+             rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+             rinv30           = gmx_fjsp_invsqrt_v2r8(rsq30);
+             rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+             rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+             rinvsq30         = _fjsp_mul_v2r8(rinv30,rinv30);
+             /* Load parameters for j particles */
+             jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+             fjx0             = _fjsp_setzero_v2r8();
+             fjy0             = _fjsp_setzero_v2r8();
+             fjz0             = _fjsp_setzero_v2r8();
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+             /* Compute parameters for interactions between i and j atoms */
+             qq10             = _fjsp_mul_v2r8(iq1,jq0);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r10,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                          &ewtabF,&ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,rinv10),_fjsp_sub_v2r8(rinvsq10,felec));
+             fscal            = felec;
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+             
+             fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+             /* Compute parameters for interactions between i and j atoms */
+             qq20             = _fjsp_mul_v2r8(iq2,jq0);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r20,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                          &ewtabF,&ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,rinv20),_fjsp_sub_v2r8(rinvsq20,felec));
+             fscal            = felec;
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+             
+             fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r30              = _fjsp_mul_v2r8(rsq30,rinv30);
+             /* Compute parameters for interactions between i and j atoms */
+             qq30             = _fjsp_mul_v2r8(iq3,jq0);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r30,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                          &ewtabF,&ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq30,rinv30),_fjsp_sub_v2r8(rinvsq30,felec));
+             fscal            = felec;
+             /* Update vectorial force */
+             fix3             = _fjsp_madd_v2r8(dx30,fscal,fix3);
+             fiy3             = _fjsp_madd_v2r8(dy30,fscal,fiy3);
+             fiz3             = _fjsp_madd_v2r8(dz30,fscal,fiz3);
+             
+             fjx0             = _fjsp_madd_v2r8(dx30,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy30,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz30,fscal,fjz0);
+             gmx_fjsp_decrement_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0);
+             /* Inner loop uses 120 flops */
+         }
+         if(jidx<j_index_end)
+         {
+             jnrA             = jjnr[jidx];
+             j_coord_offsetA  = DIM*jnrA;
+             /* load j atom coordinates */
+             gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                               &jx0,&jy0,&jz0);
+             /* Calculate displacement vector */
+             dx10             = _fjsp_sub_v2r8(ix1,jx0);
+             dy10             = _fjsp_sub_v2r8(iy1,jy0);
+             dz10             = _fjsp_sub_v2r8(iz1,jz0);
+             dx20             = _fjsp_sub_v2r8(ix2,jx0);
+             dy20             = _fjsp_sub_v2r8(iy2,jy0);
+             dz20             = _fjsp_sub_v2r8(iz2,jz0);
+             dx30             = _fjsp_sub_v2r8(ix3,jx0);
+             dy30             = _fjsp_sub_v2r8(iy3,jy0);
+             dz30             = _fjsp_sub_v2r8(iz3,jz0);
+             /* Calculate squared distance and things based on it */
+             rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+             rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+             rsq30            = gmx_fjsp_calc_rsq_v2r8(dx30,dy30,dz30);
+             rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+             rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+             rinv30           = gmx_fjsp_invsqrt_v2r8(rsq30);
+             rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+             rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+             rinvsq30         = _fjsp_mul_v2r8(rinv30,rinv30);
+             /* Load parameters for j particles */
+             jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
+             fjx0             = _fjsp_setzero_v2r8();
+             fjy0             = _fjsp_setzero_v2r8();
+             fjz0             = _fjsp_setzero_v2r8();
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+             /* Compute parameters for interactions between i and j atoms */
+             qq10             = _fjsp_mul_v2r8(iq1,jq0);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r10,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,rinv10),_fjsp_sub_v2r8(rinvsq10,felec));
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+             
+             fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+             /* Compute parameters for interactions between i and j atoms */
+             qq20             = _fjsp_mul_v2r8(iq2,jq0);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r20,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,rinv20),_fjsp_sub_v2r8(rinvsq20,felec));
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+             
+             fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r30              = _fjsp_mul_v2r8(rsq30,rinv30);
+             /* Compute parameters for interactions between i and j atoms */
+             qq30             = _fjsp_mul_v2r8(iq3,jq0);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r30,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq30,rinv30),_fjsp_sub_v2r8(rinvsq30,felec));
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix3             = _fjsp_madd_v2r8(dx30,fscal,fix3);
+             fiy3             = _fjsp_madd_v2r8(dy30,fscal,fiy3);
+             fiz3             = _fjsp_madd_v2r8(dz30,fscal,fiz3);
+             
+             fjx0             = _fjsp_madd_v2r8(dx30,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy30,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz30,fscal,fjz0);
+             gmx_fjsp_decrement_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0);
+             /* Inner loop uses 120 flops */
+         }
+         /* End of innermost loop */
+         gmx_fjsp_update_iforce_3atom_swizzle_v2r8(fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                               f+i_coord_offset+DIM,fshift+i_shift_offset);
+         /* Increment number of inner iterations */
+         inneriter                  += j_index_end - j_index_start;
+         /* Outer loop uses 18 flops */
+     }
+     /* Increment number of outer iterations */
+     outeriter        += nri;
+     /* Update outer/inner flops */
+     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4_F,outeriter*18 + inneriter*120);
+ }
index 0000000000000000000000000000000000000000,24cbc9a362a799f1c602a7da0c27ad54a69078dd..24cbc9a362a799f1c602a7da0c27ad54a69078dd
mode 000000,100644..100644
--- /dev/null
@@@ -1,0 -1,1989 +1,1989 @@@
+ /*
+  * This file is part of the GROMACS molecular simulation package.
+  *
+  * Copyright (c) 2012, by the GROMACS development team, led by
+  * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+  * others, as listed in the AUTHORS file in the top-level source
+  * directory and at http://www.gromacs.org.
+  *
+  * GROMACS is free software; you can redistribute it and/or
+  * modify it under the terms of the GNU Lesser General Public License
+  * as published by the Free Software Foundation; either version 2.1
+  * of the License, or (at your option) any later version.
+  *
+  * GROMACS is distributed in the hope that it will be useful,
+  * but WITHOUT ANY WARRANTY; without even the implied warranty of
+  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  * Lesser General Public License for more details.
+  *
+  * You should have received a copy of the GNU Lesser General Public
+  * License along with GROMACS; if not, see
+  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+  *
+  * If you want to redistribute modifications to GROMACS, please
+  * consider that scientific software is very special. Version
+  * control is crucial - bugs must be traceable. We will be happy to
+  * consider code for inclusion in the official distribution, but
+  * derived work must not be called official GROMACS. Details are found
+  * in the README & COPYING files - if they are missing, get the
+  * official version at http://www.gromacs.org.
+  *
+  * To help us fund GROMACS development, we humbly ask that you cite
+  * the research papers on the package. Check out http://www.gromacs.org.
+  */
+ /*
+  * Note: this file was generated by the GROMACS sparc64_hpc_ace_double kernel generator.
+  */
+ #ifdef HAVE_CONFIG_H
+ #include <config.h>
+ #endif
+ #include <math.h>
+ #include "../nb_kernel.h"
+ #include "types/simple.h"
+ #include "vec.h"
+ #include "nrnb.h"
+ #include "kernelutil_sparc64_hpc_ace_double.h"
+ /*
+  * Gromacs nonbonded kernel:   nb_kernel_ElecEw_VdwNone_GeomW4W4_VF_sparc64_hpc_ace_double
+  * Electrostatics interaction: Ewald
+  * VdW interaction:            None
+  * Geometry:                   Water4-Water4
+  * Calculate force/pot:        PotentialAndForce
+  */
+ void
+ nb_kernel_ElecEw_VdwNone_GeomW4W4_VF_sparc64_hpc_ace_double
+                     (t_nblist * gmx_restrict                nlist,
+                      rvec * gmx_restrict                    xx,
+                      rvec * gmx_restrict                    ff,
+                      t_forcerec * gmx_restrict              fr,
+                      t_mdatoms * gmx_restrict               mdatoms,
+                      nb_kernel_data_t * gmx_restrict        kernel_data,
+                      t_nrnb * gmx_restrict                  nrnb)
+ {
+     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+      * just 0 for non-waters.
+      * Suffixes A,B refer to j loop unrolling done with double precision SIMD, i.e. for the two different
+      * jnr indices corresponding to data put in the two positions in the SIMD register.
+      */
+     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+     int              jnrA,jnrB;
+     int              j_coord_offsetA,j_coord_offsetB;
+     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+     real             rcutoff_scalar;
+     real             *shiftvec,*fshift,*x,*f;
+     _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+     int              vdwioffset1;
+     _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+     int              vdwioffset2;
+     _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+     int              vdwioffset3;
+     _fjsp_v2r8       ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+     int              vdwjidx1A,vdwjidx1B;
+     _fjsp_v2r8       jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+     int              vdwjidx2A,vdwjidx2B;
+     _fjsp_v2r8       jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+     int              vdwjidx3A,vdwjidx3B;
+     _fjsp_v2r8       jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
+     _fjsp_v2r8       dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+     _fjsp_v2r8       dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+     _fjsp_v2r8       dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
+     _fjsp_v2r8       dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+     _fjsp_v2r8       dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+     _fjsp_v2r8       dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
+     _fjsp_v2r8       dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
+     _fjsp_v2r8       dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
+     _fjsp_v2r8       dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
+     _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+     real             *charge;
+     _fjsp_v2r8       ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+     real             *ewtab;
+     _fjsp_v2r8       itab_tmp;
+     _fjsp_v2r8       dummy_mask,cutoff_mask;
+     _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+     _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+     union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+     x                = xx[0];
+     f                = ff[0];
+     nri              = nlist->nri;
+     iinr             = nlist->iinr;
+     jindex           = nlist->jindex;
+     jjnr             = nlist->jjnr;
+     shiftidx         = nlist->shift;
+     gid              = nlist->gid;
+     shiftvec         = fr->shift_vec[0];
+     fshift           = fr->fshift[0];
+     facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+     charge           = mdatoms->chargeA;
+     sh_ewald         = gmx_fjsp_set1_v2r8(fr->ic->sh_ewald);
+     ewtab            = fr->ic->tabq_coul_FDV0;
+     ewtabscale       = gmx_fjsp_set1_v2r8(fr->ic->tabq_scale);
+     ewtabhalfspace   = gmx_fjsp_set1_v2r8(0.5/fr->ic->tabq_scale);
+     /* Setup water-specific parameters */
+     inr              = nlist->iinr[0];
+     iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+     iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+     iq3              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+3]));
+     jq1              = gmx_fjsp_set1_v2r8(charge[inr+1]);
+     jq2              = gmx_fjsp_set1_v2r8(charge[inr+2]);
+     jq3              = gmx_fjsp_set1_v2r8(charge[inr+3]);
+     qq11             = _fjsp_mul_v2r8(iq1,jq1);
+     qq12             = _fjsp_mul_v2r8(iq1,jq2);
+     qq13             = _fjsp_mul_v2r8(iq1,jq3);
+     qq21             = _fjsp_mul_v2r8(iq2,jq1);
+     qq22             = _fjsp_mul_v2r8(iq2,jq2);
+     qq23             = _fjsp_mul_v2r8(iq2,jq3);
+     qq31             = _fjsp_mul_v2r8(iq3,jq1);
+     qq32             = _fjsp_mul_v2r8(iq3,jq2);
+     qq33             = _fjsp_mul_v2r8(iq3,jq3);
+     /* Avoid stupid compiler warnings */
+     jnrA = jnrB = 0;
+     j_coord_offsetA = 0;
+     j_coord_offsetB = 0;
+     outeriter        = 0;
+     inneriter        = 0;
+     /* Start outer loop over neighborlists */
+     for(iidx=0; iidx<nri; iidx++)
+     {
+         /* Load shift vector for this list */
+         i_shift_offset   = DIM*shiftidx[iidx];
+         /* Load limits for loop over neighbors */
+         j_index_start    = jindex[iidx];
+         j_index_end      = jindex[iidx+1];
+         /* Get outer coordinate index */
+         inr              = iinr[iidx];
+         i_coord_offset   = DIM*inr;
+         /* Load i particle coords and add shift vector */
+         gmx_fjsp_load_shift_and_3rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset+DIM,
+                                                  &ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+         fix1             = _fjsp_setzero_v2r8();
+         fiy1             = _fjsp_setzero_v2r8();
+         fiz1             = _fjsp_setzero_v2r8();
+         fix2             = _fjsp_setzero_v2r8();
+         fiy2             = _fjsp_setzero_v2r8();
+         fiz2             = _fjsp_setzero_v2r8();
+         fix3             = _fjsp_setzero_v2r8();
+         fiy3             = _fjsp_setzero_v2r8();
+         fiz3             = _fjsp_setzero_v2r8();
+         /* Reset potential sums */
+         velecsum         = _fjsp_setzero_v2r8();
+         /* Start inner kernel loop */
+         for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+         {
+             /* Get j neighbor index, and coordinate index */
+             jnrA             = jjnr[jidx];
+             jnrB             = jjnr[jidx+1];
+             j_coord_offsetA  = DIM*jnrA;
+             j_coord_offsetB  = DIM*jnrB;
+             /* load j atom coordinates */
+             gmx_fjsp_load_3rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA+DIM,x+j_coord_offsetB+DIM,
+                                               &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
+             /* Calculate displacement vector */
+             dx11             = _fjsp_sub_v2r8(ix1,jx1);
+             dy11             = _fjsp_sub_v2r8(iy1,jy1);
+             dz11             = _fjsp_sub_v2r8(iz1,jz1);
+             dx12             = _fjsp_sub_v2r8(ix1,jx2);
+             dy12             = _fjsp_sub_v2r8(iy1,jy2);
+             dz12             = _fjsp_sub_v2r8(iz1,jz2);
+             dx13             = _fjsp_sub_v2r8(ix1,jx3);
+             dy13             = _fjsp_sub_v2r8(iy1,jy3);
+             dz13             = _fjsp_sub_v2r8(iz1,jz3);
+             dx21             = _fjsp_sub_v2r8(ix2,jx1);
+             dy21             = _fjsp_sub_v2r8(iy2,jy1);
+             dz21             = _fjsp_sub_v2r8(iz2,jz1);
+             dx22             = _fjsp_sub_v2r8(ix2,jx2);
+             dy22             = _fjsp_sub_v2r8(iy2,jy2);
+             dz22             = _fjsp_sub_v2r8(iz2,jz2);
+             dx23             = _fjsp_sub_v2r8(ix2,jx3);
+             dy23             = _fjsp_sub_v2r8(iy2,jy3);
+             dz23             = _fjsp_sub_v2r8(iz2,jz3);
+             dx31             = _fjsp_sub_v2r8(ix3,jx1);
+             dy31             = _fjsp_sub_v2r8(iy3,jy1);
+             dz31             = _fjsp_sub_v2r8(iz3,jz1);
+             dx32             = _fjsp_sub_v2r8(ix3,jx2);
+             dy32             = _fjsp_sub_v2r8(iy3,jy2);
+             dz32             = _fjsp_sub_v2r8(iz3,jz2);
+             dx33             = _fjsp_sub_v2r8(ix3,jx3);
+             dy33             = _fjsp_sub_v2r8(iy3,jy3);
+             dz33             = _fjsp_sub_v2r8(iz3,jz3);
+             /* Calculate squared distance and things based on it */
+             rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+             rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+             rsq13            = gmx_fjsp_calc_rsq_v2r8(dx13,dy13,dz13);
+             rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+             rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+             rsq23            = gmx_fjsp_calc_rsq_v2r8(dx23,dy23,dz23);
+             rsq31            = gmx_fjsp_calc_rsq_v2r8(dx31,dy31,dz31);
+             rsq32            = gmx_fjsp_calc_rsq_v2r8(dx32,dy32,dz32);
+             rsq33            = gmx_fjsp_calc_rsq_v2r8(dx33,dy33,dz33);
+             rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+             rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+             rinv13           = gmx_fjsp_invsqrt_v2r8(rsq13);
+             rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+             rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+             rinv23           = gmx_fjsp_invsqrt_v2r8(rsq23);
+             rinv31           = gmx_fjsp_invsqrt_v2r8(rsq31);
+             rinv32           = gmx_fjsp_invsqrt_v2r8(rsq32);
+             rinv33           = gmx_fjsp_invsqrt_v2r8(rsq33);
+             rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+             rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+             rinvsq13         = _fjsp_mul_v2r8(rinv13,rinv13);
+             rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+             rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+             rinvsq23         = _fjsp_mul_v2r8(rinv23,rinv23);
+             rinvsq31         = _fjsp_mul_v2r8(rinv31,rinv31);
+             rinvsq32         = _fjsp_mul_v2r8(rinv32,rinv32);
+             rinvsq33         = _fjsp_mul_v2r8(rinv33,rinv33);
+             fjx1             = _fjsp_setzero_v2r8();
+             fjy1             = _fjsp_setzero_v2r8();
+             fjz1             = _fjsp_setzero_v2r8();
+             fjx2             = _fjsp_setzero_v2r8();
+             fjy2             = _fjsp_setzero_v2r8();
+             fjz2             = _fjsp_setzero_v2r8();
+             fjx3             = _fjsp_setzero_v2r8();
+             fjy3             = _fjsp_setzero_v2r8();
+             fjz3             = _fjsp_setzero_v2r8();
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r11              = _fjsp_mul_v2r8(rsq11,rinv11);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r11,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq11,_fjsp_sub_v2r8(rinv11,velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq11,rinv11),_fjsp_sub_v2r8(rinvsq11,felec));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+             
+             fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r12              = _fjsp_mul_v2r8(rsq12,rinv12);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r12,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq12,_fjsp_sub_v2r8(rinv12,velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq12,rinv12),_fjsp_sub_v2r8(rinvsq12,felec));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+             
+             fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r13              = _fjsp_mul_v2r8(rsq13,rinv13);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r13,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq13,_fjsp_sub_v2r8(rinv13,velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq13,rinv13),_fjsp_sub_v2r8(rinvsq13,felec));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx13,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy13,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz13,fscal,fiz1);
+             
+             fjx3             = _fjsp_madd_v2r8(dx13,fscal,fjx3);
+             fjy3             = _fjsp_madd_v2r8(dy13,fscal,fjy3);
+             fjz3             = _fjsp_madd_v2r8(dz13,fscal,fjz3);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r21              = _fjsp_mul_v2r8(rsq21,rinv21);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r21,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq21,_fjsp_sub_v2r8(rinv21,velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq21,rinv21),_fjsp_sub_v2r8(rinvsq21,felec));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+             
+             fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r22              = _fjsp_mul_v2r8(rsq22,rinv22);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r22,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq22,_fjsp_sub_v2r8(rinv22,velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq22,rinv22),_fjsp_sub_v2r8(rinvsq22,felec));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+             
+             fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r23              = _fjsp_mul_v2r8(rsq23,rinv23);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r23,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq23,_fjsp_sub_v2r8(rinv23,velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq23,rinv23),_fjsp_sub_v2r8(rinvsq23,felec));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx23,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy23,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz23,fscal,fiz2);
+             
+             fjx3             = _fjsp_madd_v2r8(dx23,fscal,fjx3);
+             fjy3             = _fjsp_madd_v2r8(dy23,fscal,fjy3);
+             fjz3             = _fjsp_madd_v2r8(dz23,fscal,fjz3);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r31              = _fjsp_mul_v2r8(rsq31,rinv31);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r31,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq31,_fjsp_sub_v2r8(rinv31,velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq31,rinv31),_fjsp_sub_v2r8(rinvsq31,felec));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             /* Update vectorial force */
+             fix3             = _fjsp_madd_v2r8(dx31,fscal,fix3);
+             fiy3             = _fjsp_madd_v2r8(dy31,fscal,fiy3);
+             fiz3             = _fjsp_madd_v2r8(dz31,fscal,fiz3);
+             
+             fjx1             = _fjsp_madd_v2r8(dx31,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy31,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz31,fscal,fjz1);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r32              = _fjsp_mul_v2r8(rsq32,rinv32);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r32,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq32,_fjsp_sub_v2r8(rinv32,velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq32,rinv32),_fjsp_sub_v2r8(rinvsq32,felec));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             /* Update vectorial force */
+             fix3             = _fjsp_madd_v2r8(dx32,fscal,fix3);
+             fiy3             = _fjsp_madd_v2r8(dy32,fscal,fiy3);
+             fiz3             = _fjsp_madd_v2r8(dz32,fscal,fiz3);
+             
+             fjx2             = _fjsp_madd_v2r8(dx32,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy32,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz32,fscal,fjz2);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r33              = _fjsp_mul_v2r8(rsq33,rinv33);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r33,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq33,_fjsp_sub_v2r8(rinv33,velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq33,rinv33),_fjsp_sub_v2r8(rinvsq33,felec));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             /* Update vectorial force */
+             fix3             = _fjsp_madd_v2r8(dx33,fscal,fix3);
+             fiy3             = _fjsp_madd_v2r8(dy33,fscal,fiy3);
+             fiz3             = _fjsp_madd_v2r8(dz33,fscal,fiz3);
+             
+             fjx3             = _fjsp_madd_v2r8(dx33,fscal,fjx3);
+             fjy3             = _fjsp_madd_v2r8(dy33,fscal,fjy3);
+             fjz3             = _fjsp_madd_v2r8(dz33,fscal,fjz3);
+             gmx_fjsp_decrement_3rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA+DIM,f+j_coord_offsetB+DIM,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+             /* Inner loop uses 396 flops */
+         }
+         if(jidx<j_index_end)
+         {
+             jnrA             = jjnr[jidx];
+             j_coord_offsetA  = DIM*jnrA;
+             /* load j atom coordinates */
+             gmx_fjsp_load_3rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA+DIM,
+                                               &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
+             /* Calculate displacement vector */
+             dx11             = _fjsp_sub_v2r8(ix1,jx1);
+             dy11             = _fjsp_sub_v2r8(iy1,jy1);
+             dz11             = _fjsp_sub_v2r8(iz1,jz1);
+             dx12             = _fjsp_sub_v2r8(ix1,jx2);
+             dy12             = _fjsp_sub_v2r8(iy1,jy2);
+             dz12             = _fjsp_sub_v2r8(iz1,jz2);
+             dx13             = _fjsp_sub_v2r8(ix1,jx3);
+             dy13             = _fjsp_sub_v2r8(iy1,jy3);
+             dz13             = _fjsp_sub_v2r8(iz1,jz3);
+             dx21             = _fjsp_sub_v2r8(ix2,jx1);
+             dy21             = _fjsp_sub_v2r8(iy2,jy1);
+             dz21             = _fjsp_sub_v2r8(iz2,jz1);
+             dx22             = _fjsp_sub_v2r8(ix2,jx2);
+             dy22             = _fjsp_sub_v2r8(iy2,jy2);
+             dz22             = _fjsp_sub_v2r8(iz2,jz2);
+             dx23             = _fjsp_sub_v2r8(ix2,jx3);
+             dy23             = _fjsp_sub_v2r8(iy2,jy3);
+             dz23             = _fjsp_sub_v2r8(iz2,jz3);
+             dx31             = _fjsp_sub_v2r8(ix3,jx1);
+             dy31             = _fjsp_sub_v2r8(iy3,jy1);
+             dz31             = _fjsp_sub_v2r8(iz3,jz1);
+             dx32             = _fjsp_sub_v2r8(ix3,jx2);
+             dy32             = _fjsp_sub_v2r8(iy3,jy2);
+             dz32             = _fjsp_sub_v2r8(iz3,jz2);
+             dx33             = _fjsp_sub_v2r8(ix3,jx3);
+             dy33             = _fjsp_sub_v2r8(iy3,jy3);
+             dz33             = _fjsp_sub_v2r8(iz3,jz3);
+             /* Calculate squared distance and things based on it */
+             rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+             rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+             rsq13            = gmx_fjsp_calc_rsq_v2r8(dx13,dy13,dz13);
+             rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+             rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+             rsq23            = gmx_fjsp_calc_rsq_v2r8(dx23,dy23,dz23);
+             rsq31            = gmx_fjsp_calc_rsq_v2r8(dx31,dy31,dz31);
+             rsq32            = gmx_fjsp_calc_rsq_v2r8(dx32,dy32,dz32);
+             rsq33            = gmx_fjsp_calc_rsq_v2r8(dx33,dy33,dz33);
+             rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+             rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+             rinv13           = gmx_fjsp_invsqrt_v2r8(rsq13);
+             rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+             rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+             rinv23           = gmx_fjsp_invsqrt_v2r8(rsq23);
+             rinv31           = gmx_fjsp_invsqrt_v2r8(rsq31);
+             rinv32           = gmx_fjsp_invsqrt_v2r8(rsq32);
+             rinv33           = gmx_fjsp_invsqrt_v2r8(rsq33);
+             rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+             rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+             rinvsq13         = _fjsp_mul_v2r8(rinv13,rinv13);
+             rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+             rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+             rinvsq23         = _fjsp_mul_v2r8(rinv23,rinv23);
+             rinvsq31         = _fjsp_mul_v2r8(rinv31,rinv31);
+             rinvsq32         = _fjsp_mul_v2r8(rinv32,rinv32);
+             rinvsq33         = _fjsp_mul_v2r8(rinv33,rinv33);
+             fjx1             = _fjsp_setzero_v2r8();
+             fjy1             = _fjsp_setzero_v2r8();
+             fjz1             = _fjsp_setzero_v2r8();
+             fjx2             = _fjsp_setzero_v2r8();
+             fjy2             = _fjsp_setzero_v2r8();
+             fjz2             = _fjsp_setzero_v2r8();
+             fjx3             = _fjsp_setzero_v2r8();
+             fjy3             = _fjsp_setzero_v2r8();
+             fjz3             = _fjsp_setzero_v2r8();
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r11              = _fjsp_mul_v2r8(rsq11,rinv11);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r11,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq11,_fjsp_sub_v2r8(rinv11,velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq11,rinv11),_fjsp_sub_v2r8(rinvsq11,felec));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+             
+             fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r12              = _fjsp_mul_v2r8(rsq12,rinv12);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r12,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq12,_fjsp_sub_v2r8(rinv12,velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq12,rinv12),_fjsp_sub_v2r8(rinvsq12,felec));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+             
+             fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r13              = _fjsp_mul_v2r8(rsq13,rinv13);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r13,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq13,_fjsp_sub_v2r8(rinv13,velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq13,rinv13),_fjsp_sub_v2r8(rinvsq13,felec));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx13,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy13,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz13,fscal,fiz1);
+             
+             fjx3             = _fjsp_madd_v2r8(dx13,fscal,fjx3);
+             fjy3             = _fjsp_madd_v2r8(dy13,fscal,fjy3);
+             fjz3             = _fjsp_madd_v2r8(dz13,fscal,fjz3);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r21              = _fjsp_mul_v2r8(rsq21,rinv21);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r21,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq21,_fjsp_sub_v2r8(rinv21,velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq21,rinv21),_fjsp_sub_v2r8(rinvsq21,felec));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+             
+             fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r22              = _fjsp_mul_v2r8(rsq22,rinv22);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r22,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq22,_fjsp_sub_v2r8(rinv22,velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq22,rinv22),_fjsp_sub_v2r8(rinvsq22,felec));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+             
+             fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r23              = _fjsp_mul_v2r8(rsq23,rinv23);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r23,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq23,_fjsp_sub_v2r8(rinv23,velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq23,rinv23),_fjsp_sub_v2r8(rinvsq23,felec));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx23,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy23,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz23,fscal,fiz2);
+             
+             fjx3             = _fjsp_madd_v2r8(dx23,fscal,fjx3);
+             fjy3             = _fjsp_madd_v2r8(dy23,fscal,fjy3);
+             fjz3             = _fjsp_madd_v2r8(dz23,fscal,fjz3);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r31              = _fjsp_mul_v2r8(rsq31,rinv31);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r31,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq31,_fjsp_sub_v2r8(rinv31,velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq31,rinv31),_fjsp_sub_v2r8(rinvsq31,felec));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix3             = _fjsp_madd_v2r8(dx31,fscal,fix3);
+             fiy3             = _fjsp_madd_v2r8(dy31,fscal,fiy3);
+             fiz3             = _fjsp_madd_v2r8(dz31,fscal,fiz3);
+             
+             fjx1             = _fjsp_madd_v2r8(dx31,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy31,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz31,fscal,fjz1);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r32              = _fjsp_mul_v2r8(rsq32,rinv32);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r32,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq32,_fjsp_sub_v2r8(rinv32,velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq32,rinv32),_fjsp_sub_v2r8(rinvsq32,felec));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix3             = _fjsp_madd_v2r8(dx32,fscal,fix3);
+             fiy3             = _fjsp_madd_v2r8(dy32,fscal,fiy3);
+             fiz3             = _fjsp_madd_v2r8(dz32,fscal,fiz3);
+             
+             fjx2             = _fjsp_madd_v2r8(dx32,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy32,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz32,fscal,fjz2);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r33              = _fjsp_mul_v2r8(rsq33,rinv33);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r33,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             ewtabD           = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             ewtabFn          = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq33,_fjsp_sub_v2r8(rinv33,velec));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq33,rinv33),_fjsp_sub_v2r8(rinvsq33,felec));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix3             = _fjsp_madd_v2r8(dx33,fscal,fix3);
+             fiy3             = _fjsp_madd_v2r8(dy33,fscal,fiy3);
+             fiz3             = _fjsp_madd_v2r8(dz33,fscal,fiz3);
+             
+             fjx3             = _fjsp_madd_v2r8(dx33,fscal,fjx3);
+             fjy3             = _fjsp_madd_v2r8(dy33,fscal,fjy3);
+             fjz3             = _fjsp_madd_v2r8(dz33,fscal,fjz3);
+             gmx_fjsp_decrement_3rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA+DIM,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+             /* Inner loop uses 396 flops */
+         }
+         /* End of innermost loop */
+         gmx_fjsp_update_iforce_3atom_swizzle_v2r8(fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                               f+i_coord_offset+DIM,fshift+i_shift_offset);
+         ggid                        = gid[iidx];
+         /* Update potential energies */
+         gmx_fjsp_update_1pot_v2r8(velecsum,kernel_data->energygrp_elec+ggid);
+         /* Increment number of inner iterations */
+         inneriter                  += j_index_end - j_index_start;
+         /* Outer loop uses 19 flops */
+     }
+     /* Increment number of outer iterations */
+     outeriter        += nri;
+     /* Update outer/inner flops */
+     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4W4_VF,outeriter*19 + inneriter*396);
+ }
+ /*
+  * Gromacs nonbonded kernel:   nb_kernel_ElecEw_VdwNone_GeomW4W4_F_sparc64_hpc_ace_double
+  * Electrostatics interaction: Ewald
+  * VdW interaction:            None
+  * Geometry:                   Water4-Water4
+  * Calculate force/pot:        Force
+  */
+ void
+ nb_kernel_ElecEw_VdwNone_GeomW4W4_F_sparc64_hpc_ace_double
+                     (t_nblist * gmx_restrict                nlist,
+                      rvec * gmx_restrict                    xx,
+                      rvec * gmx_restrict                    ff,
+                      t_forcerec * gmx_restrict              fr,
+                      t_mdatoms * gmx_restrict               mdatoms,
+                      nb_kernel_data_t * gmx_restrict        kernel_data,
+                      t_nrnb * gmx_restrict                  nrnb)
+ {
+     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+      * just 0 for non-waters.
+      * Suffixes A,B refer to j loop unrolling done with double precision SIMD, i.e. the two different
+      * jnr indices corresponding to data put in the two positions in the SIMD register.
+      */
+     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+     int              jnrA,jnrB;
+     int              j_coord_offsetA,j_coord_offsetB;
+     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+     real             rcutoff_scalar;
+     real             *shiftvec,*fshift,*x,*f;
+     _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+     int              vdwioffset1;
+     _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+     int              vdwioffset2;
+     _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+     int              vdwioffset3;
+     _fjsp_v2r8       ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+     int              vdwjidx1A,vdwjidx1B;
+     _fjsp_v2r8       jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+     int              vdwjidx2A,vdwjidx2B;
+     _fjsp_v2r8       jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+     int              vdwjidx3A,vdwjidx3B;
+     _fjsp_v2r8       jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
+     _fjsp_v2r8       dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+     _fjsp_v2r8       dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+     _fjsp_v2r8       dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
+     _fjsp_v2r8       dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+     _fjsp_v2r8       dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+     _fjsp_v2r8       dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
+     _fjsp_v2r8       dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
+     _fjsp_v2r8       dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
+     _fjsp_v2r8       dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
+     _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+     real             *charge;
+     _fjsp_v2r8       ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+     real             *ewtab;
+     _fjsp_v2r8       itab_tmp;
+     _fjsp_v2r8       dummy_mask,cutoff_mask;
+     _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+     _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+     union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+     x                = xx[0];
+     f                = ff[0];
+     nri              = nlist->nri;
+     iinr             = nlist->iinr;
+     jindex           = nlist->jindex;
+     jjnr             = nlist->jjnr;
+     shiftidx         = nlist->shift;
+     gid              = nlist->gid;
+     shiftvec         = fr->shift_vec[0];
+     fshift           = fr->fshift[0];
+     facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+     charge           = mdatoms->chargeA;
+     sh_ewald         = gmx_fjsp_set1_v2r8(fr->ic->sh_ewald);
+     ewtab            = fr->ic->tabq_coul_F;
+     ewtabscale       = gmx_fjsp_set1_v2r8(fr->ic->tabq_scale);
+     ewtabhalfspace   = gmx_fjsp_set1_v2r8(0.5/fr->ic->tabq_scale);
+     /* Setup water-specific parameters */
+     inr              = nlist->iinr[0];
+     iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+     iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+     iq3              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+3]));
+     jq1              = gmx_fjsp_set1_v2r8(charge[inr+1]);
+     jq2              = gmx_fjsp_set1_v2r8(charge[inr+2]);
+     jq3              = gmx_fjsp_set1_v2r8(charge[inr+3]);
+     qq11             = _fjsp_mul_v2r8(iq1,jq1);
+     qq12             = _fjsp_mul_v2r8(iq1,jq2);
+     qq13             = _fjsp_mul_v2r8(iq1,jq3);
+     qq21             = _fjsp_mul_v2r8(iq2,jq1);
+     qq22             = _fjsp_mul_v2r8(iq2,jq2);
+     qq23             = _fjsp_mul_v2r8(iq2,jq3);
+     qq31             = _fjsp_mul_v2r8(iq3,jq1);
+     qq32             = _fjsp_mul_v2r8(iq3,jq2);
+     qq33             = _fjsp_mul_v2r8(iq3,jq3);
+     /* Avoid stupid compiler warnings */
+     jnrA = jnrB = 0;
+     j_coord_offsetA = 0;
+     j_coord_offsetB = 0;
+     outeriter        = 0;
+     inneriter        = 0;
+     /* Start outer loop over neighborlists */
+     for(iidx=0; iidx<nri; iidx++)
+     {
+         /* Load shift vector for this list */
+         i_shift_offset   = DIM*shiftidx[iidx];
+         /* Load limits for loop over neighbors */
+         j_index_start    = jindex[iidx];
+         j_index_end      = jindex[iidx+1];
+         /* Get outer coordinate index */
+         inr              = iinr[iidx];
+         i_coord_offset   = DIM*inr;
+         /* Load i particle coords and add shift vector */
+         gmx_fjsp_load_shift_and_3rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset+DIM,
+                                                  &ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+         fix1             = _fjsp_setzero_v2r8();
+         fiy1             = _fjsp_setzero_v2r8();
+         fiz1             = _fjsp_setzero_v2r8();
+         fix2             = _fjsp_setzero_v2r8();
+         fiy2             = _fjsp_setzero_v2r8();
+         fiz2             = _fjsp_setzero_v2r8();
+         fix3             = _fjsp_setzero_v2r8();
+         fiy3             = _fjsp_setzero_v2r8();
+         fiz3             = _fjsp_setzero_v2r8();
+         /* Start inner kernel loop */
+         for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+         {
+             /* Get j neighbor index, and coordinate index */
+             jnrA             = jjnr[jidx];
+             jnrB             = jjnr[jidx+1];
+             j_coord_offsetA  = DIM*jnrA;
+             j_coord_offsetB  = DIM*jnrB;
+             /* load j atom coordinates */
+             gmx_fjsp_load_3rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA+DIM,x+j_coord_offsetB+DIM,
+                                               &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
+             /* Calculate displacement vector */
+             dx11             = _fjsp_sub_v2r8(ix1,jx1);
+             dy11             = _fjsp_sub_v2r8(iy1,jy1);
+             dz11             = _fjsp_sub_v2r8(iz1,jz1);
+             dx12             = _fjsp_sub_v2r8(ix1,jx2);
+             dy12             = _fjsp_sub_v2r8(iy1,jy2);
+             dz12             = _fjsp_sub_v2r8(iz1,jz2);
+             dx13             = _fjsp_sub_v2r8(ix1,jx3);
+             dy13             = _fjsp_sub_v2r8(iy1,jy3);
+             dz13             = _fjsp_sub_v2r8(iz1,jz3);
+             dx21             = _fjsp_sub_v2r8(ix2,jx1);
+             dy21             = _fjsp_sub_v2r8(iy2,jy1);
+             dz21             = _fjsp_sub_v2r8(iz2,jz1);
+             dx22             = _fjsp_sub_v2r8(ix2,jx2);
+             dy22             = _fjsp_sub_v2r8(iy2,jy2);
+             dz22             = _fjsp_sub_v2r8(iz2,jz2);
+             dx23             = _fjsp_sub_v2r8(ix2,jx3);
+             dy23             = _fjsp_sub_v2r8(iy2,jy3);
+             dz23             = _fjsp_sub_v2r8(iz2,jz3);
+             dx31             = _fjsp_sub_v2r8(ix3,jx1);
+             dy31             = _fjsp_sub_v2r8(iy3,jy1);
+             dz31             = _fjsp_sub_v2r8(iz3,jz1);
+             dx32             = _fjsp_sub_v2r8(ix3,jx2);
+             dy32             = _fjsp_sub_v2r8(iy3,jy2);
+             dz32             = _fjsp_sub_v2r8(iz3,jz2);
+             dx33             = _fjsp_sub_v2r8(ix3,jx3);
+             dy33             = _fjsp_sub_v2r8(iy3,jy3);
+             dz33             = _fjsp_sub_v2r8(iz3,jz3);
+             /* Calculate squared distance and things based on it */
+             rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+             rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+             rsq13            = gmx_fjsp_calc_rsq_v2r8(dx13,dy13,dz13);
+             rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+             rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+             rsq23            = gmx_fjsp_calc_rsq_v2r8(dx23,dy23,dz23);
+             rsq31            = gmx_fjsp_calc_rsq_v2r8(dx31,dy31,dz31);
+             rsq32            = gmx_fjsp_calc_rsq_v2r8(dx32,dy32,dz32);
+             rsq33            = gmx_fjsp_calc_rsq_v2r8(dx33,dy33,dz33);
+             rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+             rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+             rinv13           = gmx_fjsp_invsqrt_v2r8(rsq13);
+             rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+             rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+             rinv23           = gmx_fjsp_invsqrt_v2r8(rsq23);
+             rinv31           = gmx_fjsp_invsqrt_v2r8(rsq31);
+             rinv32           = gmx_fjsp_invsqrt_v2r8(rsq32);
+             rinv33           = gmx_fjsp_invsqrt_v2r8(rsq33);
+             rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+             rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+             rinvsq13         = _fjsp_mul_v2r8(rinv13,rinv13);
+             rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+             rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+             rinvsq23         = _fjsp_mul_v2r8(rinv23,rinv23);
+             rinvsq31         = _fjsp_mul_v2r8(rinv31,rinv31);
+             rinvsq32         = _fjsp_mul_v2r8(rinv32,rinv32);
+             rinvsq33         = _fjsp_mul_v2r8(rinv33,rinv33);
+             fjx1             = _fjsp_setzero_v2r8();
+             fjy1             = _fjsp_setzero_v2r8();
+             fjz1             = _fjsp_setzero_v2r8();
+             fjx2             = _fjsp_setzero_v2r8();
+             fjy2             = _fjsp_setzero_v2r8();
+             fjz2             = _fjsp_setzero_v2r8();
+             fjx3             = _fjsp_setzero_v2r8();
+             fjy3             = _fjsp_setzero_v2r8();
+             fjz3             = _fjsp_setzero_v2r8();
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r11              = _fjsp_mul_v2r8(rsq11,rinv11);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r11,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                          &ewtabF,&ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq11,rinv11),_fjsp_sub_v2r8(rinvsq11,felec));
+             fscal            = felec;
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+             
+             fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r12              = _fjsp_mul_v2r8(rsq12,rinv12);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r12,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                          &ewtabF,&ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq12,rinv12),_fjsp_sub_v2r8(rinvsq12,felec));
+             fscal            = felec;
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+             
+             fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r13              = _fjsp_mul_v2r8(rsq13,rinv13);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r13,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                          &ewtabF,&ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq13,rinv13),_fjsp_sub_v2r8(rinvsq13,felec));
+             fscal            = felec;
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx13,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy13,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz13,fscal,fiz1);
+             
+             fjx3             = _fjsp_madd_v2r8(dx13,fscal,fjx3);
+             fjy3             = _fjsp_madd_v2r8(dy13,fscal,fjy3);
+             fjz3             = _fjsp_madd_v2r8(dz13,fscal,fjz3);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r21              = _fjsp_mul_v2r8(rsq21,rinv21);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r21,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                          &ewtabF,&ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq21,rinv21),_fjsp_sub_v2r8(rinvsq21,felec));
+             fscal            = felec;
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+             
+             fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r22              = _fjsp_mul_v2r8(rsq22,rinv22);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r22,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                          &ewtabF,&ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq22,rinv22),_fjsp_sub_v2r8(rinvsq22,felec));
+             fscal            = felec;
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+             
+             fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r23              = _fjsp_mul_v2r8(rsq23,rinv23);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r23,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                          &ewtabF,&ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq23,rinv23),_fjsp_sub_v2r8(rinvsq23,felec));
+             fscal            = felec;
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx23,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy23,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz23,fscal,fiz2);
+             
+             fjx3             = _fjsp_madd_v2r8(dx23,fscal,fjx3);
+             fjy3             = _fjsp_madd_v2r8(dy23,fscal,fjy3);
+             fjz3             = _fjsp_madd_v2r8(dz23,fscal,fjz3);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r31              = _fjsp_mul_v2r8(rsq31,rinv31);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r31,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                          &ewtabF,&ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq31,rinv31),_fjsp_sub_v2r8(rinvsq31,felec));
+             fscal            = felec;
+             /* Update vectorial force */
+             fix3             = _fjsp_madd_v2r8(dx31,fscal,fix3);
+             fiy3             = _fjsp_madd_v2r8(dy31,fscal,fiy3);
+             fiz3             = _fjsp_madd_v2r8(dz31,fscal,fiz3);
+             
+             fjx1             = _fjsp_madd_v2r8(dx31,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy31,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz31,fscal,fjz1);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r32              = _fjsp_mul_v2r8(rsq32,rinv32);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r32,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                          &ewtabF,&ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq32,rinv32),_fjsp_sub_v2r8(rinvsq32,felec));
+             fscal            = felec;
+             /* Update vectorial force */
+             fix3             = _fjsp_madd_v2r8(dx32,fscal,fix3);
+             fiy3             = _fjsp_madd_v2r8(dy32,fscal,fiy3);
+             fiz3             = _fjsp_madd_v2r8(dz32,fscal,fiz3);
+             
+             fjx2             = _fjsp_madd_v2r8(dx32,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy32,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz32,fscal,fjz2);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r33              = _fjsp_mul_v2r8(rsq33,rinv33);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r33,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                          &ewtabF,&ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq33,rinv33),_fjsp_sub_v2r8(rinvsq33,felec));
+             fscal            = felec;
+             /* Update vectorial force */
+             fix3             = _fjsp_madd_v2r8(dx33,fscal,fix3);
+             fiy3             = _fjsp_madd_v2r8(dy33,fscal,fiy3);
+             fiz3             = _fjsp_madd_v2r8(dz33,fscal,fiz3);
+             
+             fjx3             = _fjsp_madd_v2r8(dx33,fscal,fjx3);
+             fjy3             = _fjsp_madd_v2r8(dy33,fscal,fjy3);
+             fjz3             = _fjsp_madd_v2r8(dz33,fscal,fjz3);
+             gmx_fjsp_decrement_3rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA+DIM,f+j_coord_offsetB+DIM,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+             /* Inner loop uses 351 flops */
+         }
+         if(jidx<j_index_end)
+         {
+             jnrA             = jjnr[jidx];
+             j_coord_offsetA  = DIM*jnrA;
+             /* load j atom coordinates */
+             gmx_fjsp_load_3rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA+DIM,
+                                               &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
+             /* Calculate displacement vector */
+             dx11             = _fjsp_sub_v2r8(ix1,jx1);
+             dy11             = _fjsp_sub_v2r8(iy1,jy1);
+             dz11             = _fjsp_sub_v2r8(iz1,jz1);
+             dx12             = _fjsp_sub_v2r8(ix1,jx2);
+             dy12             = _fjsp_sub_v2r8(iy1,jy2);
+             dz12             = _fjsp_sub_v2r8(iz1,jz2);
+             dx13             = _fjsp_sub_v2r8(ix1,jx3);
+             dy13             = _fjsp_sub_v2r8(iy1,jy3);
+             dz13             = _fjsp_sub_v2r8(iz1,jz3);
+             dx21             = _fjsp_sub_v2r8(ix2,jx1);
+             dy21             = _fjsp_sub_v2r8(iy2,jy1);
+             dz21             = _fjsp_sub_v2r8(iz2,jz1);
+             dx22             = _fjsp_sub_v2r8(ix2,jx2);
+             dy22             = _fjsp_sub_v2r8(iy2,jy2);
+             dz22             = _fjsp_sub_v2r8(iz2,jz2);
+             dx23             = _fjsp_sub_v2r8(ix2,jx3);
+             dy23             = _fjsp_sub_v2r8(iy2,jy3);
+             dz23             = _fjsp_sub_v2r8(iz2,jz3);
+             dx31             = _fjsp_sub_v2r8(ix3,jx1);
+             dy31             = _fjsp_sub_v2r8(iy3,jy1);
+             dz31             = _fjsp_sub_v2r8(iz3,jz1);
+             dx32             = _fjsp_sub_v2r8(ix3,jx2);
+             dy32             = _fjsp_sub_v2r8(iy3,jy2);
+             dz32             = _fjsp_sub_v2r8(iz3,jz2);
+             dx33             = _fjsp_sub_v2r8(ix3,jx3);
+             dy33             = _fjsp_sub_v2r8(iy3,jy3);
+             dz33             = _fjsp_sub_v2r8(iz3,jz3);
+             /* Calculate squared distance and things based on it */
+             rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+             rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+             rsq13            = gmx_fjsp_calc_rsq_v2r8(dx13,dy13,dz13);
+             rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+             rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+             rsq23            = gmx_fjsp_calc_rsq_v2r8(dx23,dy23,dz23);
+             rsq31            = gmx_fjsp_calc_rsq_v2r8(dx31,dy31,dz31);
+             rsq32            = gmx_fjsp_calc_rsq_v2r8(dx32,dy32,dz32);
+             rsq33            = gmx_fjsp_calc_rsq_v2r8(dx33,dy33,dz33);
+             rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+             rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+             rinv13           = gmx_fjsp_invsqrt_v2r8(rsq13);
+             rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+             rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+             rinv23           = gmx_fjsp_invsqrt_v2r8(rsq23);
+             rinv31           = gmx_fjsp_invsqrt_v2r8(rsq31);
+             rinv32           = gmx_fjsp_invsqrt_v2r8(rsq32);
+             rinv33           = gmx_fjsp_invsqrt_v2r8(rsq33);
+             rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+             rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+             rinvsq13         = _fjsp_mul_v2r8(rinv13,rinv13);
+             rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+             rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+             rinvsq23         = _fjsp_mul_v2r8(rinv23,rinv23);
+             rinvsq31         = _fjsp_mul_v2r8(rinv31,rinv31);
+             rinvsq32         = _fjsp_mul_v2r8(rinv32,rinv32);
+             rinvsq33         = _fjsp_mul_v2r8(rinv33,rinv33);
+             fjx1             = _fjsp_setzero_v2r8();
+             fjy1             = _fjsp_setzero_v2r8();
+             fjz1             = _fjsp_setzero_v2r8();
+             fjx2             = _fjsp_setzero_v2r8();
+             fjy2             = _fjsp_setzero_v2r8();
+             fjz2             = _fjsp_setzero_v2r8();
+             fjx3             = _fjsp_setzero_v2r8();
+             fjy3             = _fjsp_setzero_v2r8();
+             fjz3             = _fjsp_setzero_v2r8();
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r11              = _fjsp_mul_v2r8(rsq11,rinv11);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r11,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq11,rinv11),_fjsp_sub_v2r8(rinvsq11,felec));
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+             
+             fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r12              = _fjsp_mul_v2r8(rsq12,rinv12);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r12,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq12,rinv12),_fjsp_sub_v2r8(rinvsq12,felec));
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+             
+             fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r13              = _fjsp_mul_v2r8(rsq13,rinv13);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r13,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq13,rinv13),_fjsp_sub_v2r8(rinvsq13,felec));
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx13,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy13,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz13,fscal,fiz1);
+             
+             fjx3             = _fjsp_madd_v2r8(dx13,fscal,fjx3);
+             fjy3             = _fjsp_madd_v2r8(dy13,fscal,fjy3);
+             fjz3             = _fjsp_madd_v2r8(dz13,fscal,fjz3);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r21              = _fjsp_mul_v2r8(rsq21,rinv21);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r21,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq21,rinv21),_fjsp_sub_v2r8(rinvsq21,felec));
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+             
+             fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r22              = _fjsp_mul_v2r8(rsq22,rinv22);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r22,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq22,rinv22),_fjsp_sub_v2r8(rinvsq22,felec));
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+             
+             fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r23              = _fjsp_mul_v2r8(rsq23,rinv23);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r23,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq23,rinv23),_fjsp_sub_v2r8(rinvsq23,felec));
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx23,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy23,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz23,fscal,fiz2);
+             
+             fjx3             = _fjsp_madd_v2r8(dx23,fscal,fjx3);
+             fjy3             = _fjsp_madd_v2r8(dy23,fscal,fjy3);
+             fjz3             = _fjsp_madd_v2r8(dz23,fscal,fjz3);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r31              = _fjsp_mul_v2r8(rsq31,rinv31);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r31,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq31,rinv31),_fjsp_sub_v2r8(rinvsq31,felec));
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix3             = _fjsp_madd_v2r8(dx31,fscal,fix3);
+             fiy3             = _fjsp_madd_v2r8(dy31,fscal,fiy3);
+             fiz3             = _fjsp_madd_v2r8(dz31,fscal,fiz3);
+             
+             fjx1             = _fjsp_madd_v2r8(dx31,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy31,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz31,fscal,fjz1);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r32              = _fjsp_mul_v2r8(rsq32,rinv32);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r32,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq32,rinv32),_fjsp_sub_v2r8(rinvsq32,felec));
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix3             = _fjsp_madd_v2r8(dx32,fscal,fix3);
+             fiy3             = _fjsp_madd_v2r8(dy32,fscal,fiy3);
+             fiz3             = _fjsp_madd_v2r8(dz32,fscal,fiz3);
+             
+             fjx2             = _fjsp_madd_v2r8(dx32,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy32,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz32,fscal,fjz2);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r33              = _fjsp_mul_v2r8(rsq33,rinv33);
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+             ewrt             = _fjsp_mul_v2r8(r33,ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq33,rinv33),_fjsp_sub_v2r8(rinvsq33,felec));
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix3             = _fjsp_madd_v2r8(dx33,fscal,fix3);
+             fiy3             = _fjsp_madd_v2r8(dy33,fscal,fiy3);
+             fiz3             = _fjsp_madd_v2r8(dz33,fscal,fiz3);
+             
+             fjx3             = _fjsp_madd_v2r8(dx33,fscal,fjx3);
+             fjy3             = _fjsp_madd_v2r8(dy33,fscal,fjy3);
+             fjz3             = _fjsp_madd_v2r8(dz33,fscal,fjz3);
+             gmx_fjsp_decrement_3rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA+DIM,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+             /* Inner loop uses 351 flops */
+         }
+         /* End of innermost loop */
+         gmx_fjsp_update_iforce_3atom_swizzle_v2r8(fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                               f+i_coord_offset+DIM,fshift+i_shift_offset);
+         /* Increment number of inner iterations */
+         inneriter                  += j_index_end - j_index_start;
+         /* Outer loop uses 18 flops */
+     }
+     /* Increment number of outer iterations */
+     outeriter        += nri;
+     /* Update outer/inner flops */
+     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4W4_F,outeriter*18 + inneriter*351);
+ }
index 0000000000000000000000000000000000000000,93a7338e54827dd912358629af1e4a4aa6761089..93a7338e54827dd912358629af1e4a4aa6761089
mode 000000,100644..100644
--- /dev/null
@@@ -1,0 -1,820 +1,820 @@@
+ /*
+  * This file is part of the GROMACS molecular simulation package.
+  *
+  * Copyright (c) 2012, by the GROMACS development team, led by
+  * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+  * others, as listed in the AUTHORS file in the top-level source
+  * directory and at http://www.gromacs.org.
+  *
+  * GROMACS is free software; you can redistribute it and/or
+  * modify it under the terms of the GNU Lesser General Public License
+  * as published by the Free Software Foundation; either version 2.1
+  * of the License, or (at your option) any later version.
+  *
+  * GROMACS is distributed in the hope that it will be useful,
+  * but WITHOUT ANY WARRANTY; without even the implied warranty of
+  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  * Lesser General Public License for more details.
+  *
+  * You should have received a copy of the GNU Lesser General Public
+  * License along with GROMACS; if not, see
+  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+  *
+  * If you want to redistribute modifications to GROMACS, please
+  * consider that scientific software is very special. Version
+  * control is crucial - bugs must be traceable. We will be happy to
+  * consider code for inclusion in the official distribution, but
+  * derived work must not be called official GROMACS. Details are found
+  * in the README & COPYING files - if they are missing, get the
+  * official version at http://www.gromacs.org.
+  *
+  * To help us fund GROMACS development, we humbly ask that you cite
+  * the research papers on the package. Check out http://www.gromacs.org.
+  */
+ /*
+  * Note: this file was generated by the GROMACS sparc64_hpc_ace_double kernel generator.
+  */
+ #ifdef HAVE_CONFIG_H
+ #include <config.h>
+ #endif
+ #include <math.h>
+ #include "../nb_kernel.h"
+ #include "types/simple.h"
+ #include "vec.h"
+ #include "nrnb.h"
+ #include "kernelutil_sparc64_hpc_ace_double.h"
+ /*
+  * Gromacs nonbonded kernel:   nb_kernel_ElecGB_VdwCSTab_GeomP1P1_VF_sparc64_hpc_ace_double
+  * Electrostatics interaction: GeneralizedBorn
+  * VdW interaction:            CubicSplineTable
+  * Geometry:                   Particle-Particle
+  * Calculate force/pot:        PotentialAndForce
+  */
+ void
+ nb_kernel_ElecGB_VdwCSTab_GeomP1P1_VF_sparc64_hpc_ace_double
+                     (t_nblist * gmx_restrict                nlist,
+                      rvec * gmx_restrict                    xx,
+                      rvec * gmx_restrict                    ff,
+                      t_forcerec * gmx_restrict              fr,
+                      t_mdatoms * gmx_restrict               mdatoms,
+                      nb_kernel_data_t * gmx_restrict        kernel_data,
+                      t_nrnb * gmx_restrict                  nrnb)
+ {
+     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+      * just 0 for non-waters.
+      * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+      * jnr indices corresponding to data put in the two positions in the SIMD register.
+      */
+     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+     int              jnrA,jnrB;
+     int              j_coord_offsetA,j_coord_offsetB;
+     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+     real             rcutoff_scalar;
+     real             *shiftvec,*fshift,*x,*f;
+     _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+     int              vdwioffset0;
+     _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+     int              vdwjidx0A,vdwjidx0B;
+     _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+     _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+     _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+     real             *charge;
+     _fjsp_v2r8       vgb,fgb,vgbsum,dvdasum,gbscale,gbtabscale,isaprod,gbqqfactor,gbinvepsdiff,dvdaj,gbeps,twogbeps,dvdatmp;
+     _fjsp_v2r8       minushalf = gmx_fjsp_set1_v2r8(-0.5);
+     real             *invsqrta,*dvda,*gbtab;
+     int              nvdwtype;
+     _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+     int              *vdwtype;
+     real             *vdwparam;
+     _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+     _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+     _fjsp_v2r8       rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF,twovfeps;
+     real             *vftab;
+     _fjsp_v2r8       itab_tmp;
+     _fjsp_v2r8       dummy_mask,cutoff_mask;
+     _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+     _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+     union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+     x                = xx[0];
+     f                = ff[0];
+     nri              = nlist->nri;
+     iinr             = nlist->iinr;
+     jindex           = nlist->jindex;
+     jjnr             = nlist->jjnr;
+     shiftidx         = nlist->shift;
+     gid              = nlist->gid;
+     shiftvec         = fr->shift_vec[0];
+     fshift           = fr->fshift[0];
+     facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+     charge           = mdatoms->chargeA;
+     nvdwtype         = fr->ntype;
+     vdwparam         = fr->nbfp;
+     vdwtype          = mdatoms->typeA;
+     vftab            = kernel_data->table_vdw->data;
+     vftabscale       = gmx_fjsp_set1_v2r8(kernel_data->table_vdw->scale);
+     invsqrta         = fr->invsqrta;
+     dvda             = fr->dvda;
+     gbtabscale       = gmx_fjsp_set1_v2r8(fr->gbtab.scale);
+     gbtab            = fr->gbtab.data;
+     gbinvepsdiff     = gmx_fjsp_set1_v2r8((1.0/fr->epsilon_r) - (1.0/fr->gb_epsilon_solvent));
+     /* Avoid stupid compiler warnings */
+     jnrA = jnrB = 0;
+     j_coord_offsetA = 0;
+     j_coord_offsetB = 0;
+     outeriter        = 0;
+     inneriter        = 0;
+     /* Start outer loop over neighborlists */
+     for(iidx=0; iidx<nri; iidx++)
+     {
+         /* Load shift vector for this list */
+         i_shift_offset   = DIM*shiftidx[iidx];
+         /* Load limits for loop over neighbors */
+         j_index_start    = jindex[iidx];
+         j_index_end      = jindex[iidx+1];
+         /* Get outer coordinate index */
+         inr              = iinr[iidx];
+         i_coord_offset   = DIM*inr;
+         /* Load i particle coords and add shift vector */
+         gmx_fjsp_load_shift_and_1rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
+         fix0             = _fjsp_setzero_v2r8();
+         fiy0             = _fjsp_setzero_v2r8();
+         fiz0             = _fjsp_setzero_v2r8();
+         /* Load parameters for i particles */
+         iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_load1_v2r8(charge+inr+0));
+         isai0            = gmx_fjsp_load1_v2r8(invsqrta+inr+0);
+         vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+         /* Reset potential sums */
+         velecsum         = _fjsp_setzero_v2r8();
+         vgbsum           = _fjsp_setzero_v2r8();
+         vvdwsum          = _fjsp_setzero_v2r8();
+         dvdasum          = _fjsp_setzero_v2r8();
+         /* Start inner kernel loop */
+         for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+         {
+             /* Get j neighbor index, and coordinate index */
+             jnrA             = jjnr[jidx];
+             jnrB             = jjnr[jidx+1];
+             j_coord_offsetA  = DIM*jnrA;
+             j_coord_offsetB  = DIM*jnrB;
+             /* load j atom coordinates */
+             gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                               &jx0,&jy0,&jz0);
+             /* Calculate displacement vector */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             /* Calculate squared distance and things based on it */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+             /* Load parameters for j particles */
+             jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+             isaj0            = gmx_fjsp_load_2real_swizzle_v2r8(invsqrta+jnrA+0,invsqrta+jnrB+0);
+             vdwjidx0A        = 2*vdwtype[jnrA+0];
+             vdwjidx0B        = 2*vdwtype[jnrB+0];
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+             /* Compute parameters for interactions between i and j atoms */
+             qq00             = _fjsp_mul_v2r8(iq0,jq0);
+             gmx_fjsp_load_2pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,
+                                          vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
+             /* Calculate table index by multiplying r with table scale and truncate to integer */
+             rt               = _fjsp_mul_v2r8(r00,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 8;
+             vfconv.i[1]     *= 8;
+             /* GENERALIZED BORN AND COULOMB ELECTROSTATICS */
+             isaprod          = _fjsp_mul_v2r8(isai0,isaj0);
+             gbqqfactor       = _fjsp_neg_v2r8(_fjsp_mul_v2r8(qq00,_fjsp_mul_v2r8(isaprod,gbinvepsdiff)));
+             gbscale          = _fjsp_mul_v2r8(isaprod,gbtabscale);
+             /* Calculate generalized born table index - this is a separate table from the normal one,
+              * but we use the same procedure by multiplying r with scale and truncating to integer.
+              */
+             rt               = _fjsp_mul_v2r8(r00,gbscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             gbeps            = _fjsp_sub_v2r8(rt,_fjsp_xtod_v2r8(itab_tmp));
+             _fjsp_store_v2r8(&gbconv.simd,itab_tmp);
+             Y                = _fjsp_load_v2r8( gbtab + 4*gbconv.i[0] );
+             F                = _fjsp_load_v2r8( gbtab + 4*gbconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( gbtab + 4*gbconv.i[0] +2);
+             H                = _fjsp_load_v2r8( gbtab + 4*gbconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(gbeps,_fjsp_madd_v2r8(gbeps,H,G),F);
+             VV               = _fjsp_madd_v2r8(gbeps,Fp,Y);
+             vgb              = _fjsp_mul_v2r8(gbqqfactor,VV);
+             twogbeps         = _fjsp_add_v2r8(gbeps,gbeps);
+             FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twogbeps,H,G),gbeps,Fp);
+             fgb              = _fjsp_mul_v2r8(gbqqfactor,_fjsp_mul_v2r8(FF,gbscale));
+             dvdatmp          = _fjsp_mul_v2r8(minushalf,_fjsp_madd_v2r8(fgb,r00,vgb));
+             dvdasum          = _fjsp_add_v2r8(dvdasum,dvdatmp);
+             gmx_fjsp_increment_2real_swizzle_v2r8(dvda+jnrA,dvda+jnrB,_fjsp_mul_v2r8(dvdatmp,_fjsp_mul_v2r8(isaj0,isaj0)));
+             velec            = _fjsp_mul_v2r8(qq00,rinv00);
+             felec            = _fjsp_mul_v2r8(_fjsp_msub_v2r8(velec,rinv00,fgb),rinv00);
+             /* CUBIC SPLINE TABLE DISPERSION */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 2 );
+             H                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 2 );
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+             VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+             vvdw6            = _fjsp_mul_v2r8(c6_00,VV);
+             FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+             fvdw6            = _fjsp_mul_v2r8(c6_00,FF);
+             /* CUBIC SPLINE TABLE REPULSION */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 4 );
+             F                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 4 );
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 6 );
+             H                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 6 );
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+             VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+             vvdw12           = _fjsp_mul_v2r8(c12_00,VV);
+             FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+             fvdw12           = _fjsp_mul_v2r8(c12_00,FF);
+             vvdw             = _fjsp_add_v2r8(vvdw12,vvdw6);
+             fvdw             = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_add_v2r8(fvdw6,fvdw12),_fjsp_mul_v2r8(vftabscale,rinv00)));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             vgbsum           = _fjsp_add_v2r8(vgbsum,vgb);
+             vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+             fscal            = _fjsp_add_v2r8(felec,fvdw);
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             
+             gmx_fjsp_decrement_fma_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fscal,dx00,dy00,dz00);
+             /* Inner loop uses 95 flops */
+         }
+         if(jidx<j_index_end)
+         {
+             jnrA             = jjnr[jidx];
+             j_coord_offsetA  = DIM*jnrA;
+             /* load j atom coordinates */
+             gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                               &jx0,&jy0,&jz0);
+             /* Calculate displacement vector */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             /* Calculate squared distance and things based on it */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+             /* Load parameters for j particles */
+             jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
+             isaj0            = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),invsqrta+jnrA+0);
+             vdwjidx0A        = 2*vdwtype[jnrA+0];
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+             /* Compute parameters for interactions between i and j atoms */
+             qq00             = _fjsp_mul_v2r8(iq0,jq0);
+             gmx_fjsp_load_1pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
+             /* Calculate table index by multiplying r with table scale and truncate to integer */
+             rt               = _fjsp_mul_v2r8(r00,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 8;
+             vfconv.i[1]     *= 8;
+             /* GENERALIZED BORN AND COULOMB ELECTROSTATICS */
+             isaprod          = _fjsp_mul_v2r8(isai0,isaj0);
+             gbqqfactor       = _fjsp_neg_v2r8(_fjsp_mul_v2r8(qq00,_fjsp_mul_v2r8(isaprod,gbinvepsdiff)));
+             gbscale          = _fjsp_mul_v2r8(isaprod,gbtabscale);
+             /* Calculate generalized born table index - this is a separate table from the normal one,
+              * but we use the same procedure by multiplying r with scale and truncating to integer.
+              */
+             rt               = _fjsp_mul_v2r8(r00,gbscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             gbeps            = _fjsp_sub_v2r8(rt,_fjsp_xtod_v2r8(itab_tmp));
+             _fjsp_store_v2r8(&gbconv.simd,itab_tmp);
+             Y                = _fjsp_load_v2r8( gbtab + 4*gbconv.i[0] );
+             F                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( gbtab + 4*gbconv.i[0] +2);
+             H                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(gbeps,_fjsp_madd_v2r8(gbeps,H,G),F);
+             VV               = _fjsp_madd_v2r8(gbeps,Fp,Y);
+             vgb              = _fjsp_mul_v2r8(gbqqfactor,VV);
+             twogbeps         = _fjsp_add_v2r8(gbeps,gbeps);
+             FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twogbeps,H,G),gbeps,Fp);
+             fgb              = _fjsp_mul_v2r8(gbqqfactor,_fjsp_mul_v2r8(FF,gbscale));
+             dvdatmp          = _fjsp_mul_v2r8(minushalf,_fjsp_madd_v2r8(fgb,r00,vgb));
+             dvdasum          = _fjsp_add_v2r8(dvdasum,dvdatmp);
+             gmx_fjsp_increment_1real_v2r8(dvda+jnrA,_fjsp_mul_v2r8(dvdatmp,_fjsp_mul_v2r8(isaj0,isaj0)));
+             velec            = _fjsp_mul_v2r8(qq00,rinv00);
+             felec            = _fjsp_mul_v2r8(_fjsp_msub_v2r8(velec,rinv00,fgb),rinv00);
+             /* CUBIC SPLINE TABLE DISPERSION */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 2 );
+             H                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+             VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+             vvdw6            = _fjsp_mul_v2r8(c6_00,VV);
+             FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+             fvdw6            = _fjsp_mul_v2r8(c6_00,FF);
+             /* CUBIC SPLINE TABLE REPULSION */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 4 );
+             F                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 6 );
+             H                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+             VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+             vvdw12           = _fjsp_mul_v2r8(c12_00,VV);
+             FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+             fvdw12           = _fjsp_mul_v2r8(c12_00,FF);
+             vvdw             = _fjsp_add_v2r8(vvdw12,vvdw6);
+             fvdw             = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_add_v2r8(fvdw6,fvdw12),_fjsp_mul_v2r8(vftabscale,rinv00)));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             vgb              = _fjsp_unpacklo_v2r8(vgb,_fjsp_setzero_v2r8());
+             vgbsum           = _fjsp_add_v2r8(vgbsum,vgb);
+             vvdw             = _fjsp_unpacklo_v2r8(vvdw,_fjsp_setzero_v2r8());
+             vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+             fscal            = _fjsp_add_v2r8(felec,fvdw);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             
+             gmx_fjsp_decrement_fma_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fscal,dx00,dy00,dz00);
+             /* Inner loop uses 95 flops */
+         }
+         /* End of innermost loop */
+         gmx_fjsp_update_iforce_1atom_swizzle_v2r8(fix0,fiy0,fiz0,
+                                               f+i_coord_offset,fshift+i_shift_offset);
+         ggid                        = gid[iidx];
+         /* Update potential energies */
+         gmx_fjsp_update_1pot_v2r8(velecsum,kernel_data->energygrp_elec+ggid);
+         gmx_fjsp_update_1pot_v2r8(vgbsum,kernel_data->energygrp_polarization+ggid);
+         gmx_fjsp_update_1pot_v2r8(vvdwsum,kernel_data->energygrp_vdw+ggid);
+         dvdasum = _fjsp_mul_v2r8(dvdasum, _fjsp_mul_v2r8(isai0,isai0));
+         gmx_fjsp_update_1pot_v2r8(dvdasum,dvda+inr);
+         /* Increment number of inner iterations */
+         inneriter                  += j_index_end - j_index_start;
+         /* Outer loop uses 10 flops */
+     }
+     /* Increment number of outer iterations */
+     outeriter        += nri;
+     /* Update outer/inner flops */
+     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_VF,outeriter*10 + inneriter*95);
+ }
+ /*
+  * Gromacs nonbonded kernel:   nb_kernel_ElecGB_VdwCSTab_GeomP1P1_F_sparc64_hpc_ace_double
+  * Electrostatics interaction: GeneralizedBorn
+  * VdW interaction:            CubicSplineTable
+  * Geometry:                   Particle-Particle
+  * Calculate force/pot:        Force
+  *
+  * Force-only kernel: for every i atom of the neighbor list it evaluates the
+  * pair interactions with its j atoms two at a time (SIMD lanes A and B),
+  * followed by a single-atom remainder step when the j list has odd length.
+  * No potential-energy sums are accumulated in this variant; the results are
+  * handed to the force helpers together with f and fshift, and increments are
+  * applied to the fr->dvda array for both the i and the j atoms.
+  *
+  * Arguments, as used in this function:
+  *   nlist       - supplies nri, iinr, jindex, jjnr, shift and gid.
+  *   xx          - coordinates; read through x = xx[0] at offset DIM*atom.
+  *   ff          - forces; updated through f = ff[0] at offset DIM*atom.
+  *   fr          - supplies shift_vec, fshift, epsfac, ntype, nbfp, invsqrta,
+  *                 dvda, gbtab (scale and data), epsilon_r, gb_epsilon_solvent.
+  *   mdatoms     - supplies chargeA and typeA.
+  *   kernel_data - supplies table_vdw (data and scale).
+  *   nrnb        - flop accounting, updated once through inc_nrnb() at the end.
+  */
+ void
+ nb_kernel_ElecGB_VdwCSTab_GeomP1P1_F_sparc64_hpc_ace_double
+                     (t_nblist * gmx_restrict                nlist,
+                      rvec * gmx_restrict                    xx,
+                      rvec * gmx_restrict                    ff,
+                      t_forcerec * gmx_restrict              fr,
+                      t_mdatoms * gmx_restrict               mdatoms,
+                      nb_kernel_data_t * gmx_restrict        kernel_data,
+                      t_nrnb * gmx_restrict                  nrnb)
+ {
+     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+      * just 0 for non-waters.
+      * Suffixes A,B refer to j loop unrolling done with double precision SIMD, i.e. the two different
+      * jnr indices corresponding to data put in the two positions in the SIMD register.
+      *
+      * Many of the locals declared below (for example tx, ty, tz, rcutoff, krf,
+      * one_sixth, dummy_mask and ewconv) are never referenced in this kernel;
+      * presumably the declaration list is boilerplate shared by all generated
+      * kernels - confirm against the kernel generator before removing any.
+      */
+     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+     int              jnrA,jnrB;
+     int              j_coord_offsetA,j_coord_offsetB;
+     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+     real             rcutoff_scalar;
+     real             *shiftvec,*fshift,*x,*f;
+     _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+     int              vdwioffset0;
+     _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+     int              vdwjidx0A,vdwjidx0B;
+     _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+     _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+     _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+     real             *charge;
+     _fjsp_v2r8       vgb,fgb,vgbsum,dvdasum,gbscale,gbtabscale,isaprod,gbqqfactor,gbinvepsdiff,dvdaj,gbeps,twogbeps,dvdatmp;
+     _fjsp_v2r8       minushalf = gmx_fjsp_set1_v2r8(-0.5);
+     real             *invsqrta,*dvda,*gbtab;
+     int              nvdwtype;
+     _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+     int              *vdwtype;
+     real             *vdwparam;
+     _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+     _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+     _fjsp_v2r8       rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF,twovfeps;
+     real             *vftab;
+     _fjsp_v2r8       itab_tmp;
+     _fjsp_v2r8       dummy_mask,cutoff_mask;
+     _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+     _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+     union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv; /* table indices are stored through .simd and read back as scalars through .i[] */
+     x                = xx[0];
+     f                = ff[0];
+     nri              = nlist->nri;
+     iinr             = nlist->iinr;
+     jindex           = nlist->jindex;
+     jjnr             = nlist->jjnr;
+     shiftidx         = nlist->shift;
+     gid              = nlist->gid;
+     shiftvec         = fr->shift_vec[0];
+     fshift           = fr->fshift[0];
+     facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+     charge           = mdatoms->chargeA;
+     nvdwtype         = fr->ntype;
+     vdwparam         = fr->nbfp;
+     vdwtype          = mdatoms->typeA;
+     vftab            = kernel_data->table_vdw->data;
+     vftabscale       = gmx_fjsp_set1_v2r8(kernel_data->table_vdw->scale);
+     invsqrta         = fr->invsqrta;
+     dvda             = fr->dvda;
+     gbtabscale       = gmx_fjsp_set1_v2r8(fr->gbtab.scale);
+     gbtab            = fr->gbtab.data;
+     gbinvepsdiff     = gmx_fjsp_set1_v2r8((1.0/fr->epsilon_r) - (1.0/fr->gb_epsilon_solvent));
+     /* Give these a defined value before the loops to avoid compiler warnings */
+     jnrA = jnrB = 0;
+     j_coord_offsetA = 0;
+     j_coord_offsetB = 0;
+     outeriter        = 0;
+     inneriter        = 0;
+     /* Start outer loop over neighborlists: one iteration per i atom entry */
+     for(iidx=0; iidx<nri; iidx++)
+     {
+         /* Load shift vector for this list */
+         i_shift_offset   = DIM*shiftidx[iidx];
+         /* Load limits for loop over neighbors: j entries are jjnr[j_index_start .. j_index_end-1] */
+         j_index_start    = jindex[iidx];
+         j_index_end      = jindex[iidx+1];
+         /* Get outer coordinate index */
+         inr              = iinr[iidx];
+         i_coord_offset   = DIM*inr;
+         /* Load i particle coords and add shift vector */
+         gmx_fjsp_load_shift_and_1rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
+         fix0             = _fjsp_setzero_v2r8();
+         fiy0             = _fjsp_setzero_v2r8();
+         fiz0             = _fjsp_setzero_v2r8();
+         /* Load parameters for i particles: the i charge is pre-multiplied by facel (fr->epsfac) */
+         iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_load1_v2r8(charge+inr+0));
+         isai0            = gmx_fjsp_load1_v2r8(invsqrta+inr+0);
+         vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+         dvdasum          = _fjsp_setzero_v2r8();
+         /* Start inner kernel loop: j atoms are handled in pairs (A,B). The bound
+          * j_index_end-1 leaves a trailing unpaired j atom, if any, for the
+          * remainder block after this loop.
+          */
+         for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+         {
+             /* Get j neighbor index, and coordinate index */
+             jnrA             = jjnr[jidx];
+             jnrB             = jjnr[jidx+1];
+             j_coord_offsetA  = DIM*jnrA;
+             j_coord_offsetB  = DIM*jnrB;
+             /* load j atom coordinates */
+             gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                               &jx0,&jy0,&jz0);
+             /* Calculate displacement vector */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             /* Calculate squared distance and things based on it */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+             /* Load parameters for j particles */
+             jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+             isaj0            = gmx_fjsp_load_2real_swizzle_v2r8(invsqrta+jnrA+0,invsqrta+jnrB+0);
+             vdwjidx0A        = 2*vdwtype[jnrA+0];
+             vdwjidx0B        = 2*vdwtype[jnrB+0];
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+             /* Compute parameters for interactions between i and j atoms */
+             qq00             = _fjsp_mul_v2r8(iq0,jq0);
+             gmx_fjsp_load_2pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,
+                                          vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
+             /* Calculate table index by multiplying r with table scale and truncate to integer.
+              * vfeps is rt minus the converted-back index, i.e. the remainder
+              * used as the spline argument. The scalar indices are multiplied by 8
+              * because the loads below address dispersion data at offsets 0 and 2
+              * and repulsion data at offsets 4 and 6 from each table point.
+              */
+             rt               = _fjsp_mul_v2r8(r00,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 8;
+             vfconv.i[1]     *= 8;
+             /* GENERALIZED BORN AND COULOMB ELECTROSTATICS */
+             isaprod          = _fjsp_mul_v2r8(isai0,isaj0);
+             gbqqfactor       = _fjsp_neg_v2r8(_fjsp_mul_v2r8(qq00,_fjsp_mul_v2r8(isaprod,gbinvepsdiff)));
+             gbscale          = _fjsp_mul_v2r8(isaprod,gbtabscale);
+             /* Calculate generalized born table index - this is a separate table from the normal one,
+              * but we use the same procedure by multiplying r with scale and truncating to integer.
+              * The GB table is addressed with a stride of 4 (4*gbconv.i[]), with
+              * loads at offsets 0 and 2 from each table point.
+              * Assuming _fjsp_madd_v2r8(a,b,c) computes a*b+c (TODO confirm), the
+              * statements below evaluate
+              *   Fp = F + gbeps*(G + gbeps*H),  VV = Y + gbeps*Fp,
+              *   FF = Fp + gbeps*(G + 2*gbeps*H).
+              */
+             rt               = _fjsp_mul_v2r8(r00,gbscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             gbeps            = _fjsp_sub_v2r8(rt,_fjsp_xtod_v2r8(itab_tmp));
+             _fjsp_store_v2r8(&gbconv.simd,itab_tmp);
+             Y                = _fjsp_load_v2r8( gbtab + 4*gbconv.i[0] );
+             F                = _fjsp_load_v2r8( gbtab + 4*gbconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( gbtab + 4*gbconv.i[0] +2);
+             H                = _fjsp_load_v2r8( gbtab + 4*gbconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(gbeps,_fjsp_madd_v2r8(gbeps,H,G),F);
+             VV               = _fjsp_madd_v2r8(gbeps,Fp,Y);
+             vgb              = _fjsp_mul_v2r8(gbqqfactor,VV);
+             twogbeps         = _fjsp_add_v2r8(gbeps,gbeps);
+             FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twogbeps,H,G),gbeps,Fp);
+             fgb              = _fjsp_mul_v2r8(gbqqfactor,_fjsp_mul_v2r8(FF,gbscale));
+             dvdatmp          = _fjsp_mul_v2r8(minushalf,_fjsp_madd_v2r8(fgb,r00,vgb));
+             dvdasum          = _fjsp_add_v2r8(dvdasum,dvdatmp); /* i-atom share; scaled by isai0*isai0 after the j loop */
+             gmx_fjsp_increment_2real_swizzle_v2r8(dvda+jnrA,dvda+jnrB,_fjsp_mul_v2r8(dvdatmp,_fjsp_mul_v2r8(isaj0,isaj0))); /* j-atom share, scaled by isaj0*isaj0 */
+             velec            = _fjsp_mul_v2r8(qq00,rinv00);
+             felec            = _fjsp_mul_v2r8(_fjsp_msub_v2r8(velec,rinv00,fgb),rinv00); /* presumably (velec*rinv00 - fgb)*rinv00 - confirm msub semantics */
+             /* CUBIC SPLINE TABLE DISPERSION
+              * Only the force-side spline value FF is formed here; no VV is
+              * computed for the VdW terms in this force-only kernel. With the same
+              * a*b+c assumption for _fjsp_madd_v2r8 as above:
+              *   Fp = F + vfeps*(G + vfeps*H),  FF = Fp + vfeps*(G + 2*vfeps*H).
+              */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 2 );
+             H                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 2 );
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+             FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+             fvdw6            = _fjsp_mul_v2r8(c6_00,FF);
+             /* CUBIC SPLINE TABLE REPULSION
+              * Same evaluation as for dispersion, using table offsets 4 and 6 and
+              * the c12 parameter. fvdw is then the negated sum of both
+              * contributions times vftabscale*rinv00.
+              */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 4 );
+             F                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 4 );
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 6 );
+             H                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 6 );
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+             FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+             fvdw12           = _fjsp_mul_v2r8(c12_00,FF);
+             fvdw             = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_add_v2r8(fvdw6,fvdw12),_fjsp_mul_v2r8(vftabscale,rinv00)));
+             fscal            = _fjsp_add_v2r8(felec,fvdw);
+             /* Update vectorial force: accumulate on the i atom in fix0/fiy0/fiz0;
+              * the helper call below receives the same fscal and displacement for
+              * the two j atoms (presumably subtracting from f, as its name suggests).
+              */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             
+             gmx_fjsp_decrement_fma_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fscal,dx00,dy00,dz00);
+             /* Inner loop uses 85 flops */
+         }
+         if(jidx<j_index_end)
+         {
+             jnrA             = jjnr[jidx];
+             j_coord_offsetA  = DIM*jnrA;
+             /* Remainder step: one unpaired j atom is left, so only the A slot
+              * (scalar index 0) carries real data in this block.
+              * load j atom coordinates
+              */
+             gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                               &jx0,&jy0,&jz0);
+             /* Calculate displacement vector */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             /* Calculate squared distance and things based on it */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+             /* Load parameters for j particles: a single value is loaded on top of
+              * a zero vector (presumably leaving the unused position at zero -
+              * confirm against the _fjsp_loadl_v2r8 reference).
+              */
+             jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
+             isaj0            = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),invsqrta+jnrA+0);
+             vdwjidx0A        = 2*vdwtype[jnrA+0];
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+             /* Compute parameters for interactions between i and j atoms */
+             qq00             = _fjsp_mul_v2r8(iq0,jq0);
+             gmx_fjsp_load_1pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
+             /* Calculate table index by multiplying r with table scale and truncate to integer.
+              * Both scalar indices are scaled by 8 as in the paired loop, but only
+              * vfconv.i[0] is used for the table loads in this block.
+              */
+             rt               = _fjsp_mul_v2r8(r00,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 8;
+             vfconv.i[1]     *= 8;
+             /* GENERALIZED BORN AND COULOMB ELECTROSTATICS */
+             isaprod          = _fjsp_mul_v2r8(isai0,isaj0);
+             gbqqfactor       = _fjsp_neg_v2r8(_fjsp_mul_v2r8(qq00,_fjsp_mul_v2r8(isaprod,gbinvepsdiff)));
+             gbscale          = _fjsp_mul_v2r8(isaprod,gbtabscale);
+             /* Calculate generalized born table index - this is a separate table from the normal one,
+              * but we use the same procedure by multiplying r with scale and truncating to integer.
+              * Only gbconv.i[0] is used here; F and H are set to zero before the
+              * transpose where the paired loop loads the second atom's table data.
+              * NOTE(review): dvdatmp is added to dvdasum and passed on for dvda+jnrA
+              * without the _fjsp_unpacklo_v2r8 masking that is applied to fscal
+              * further down. Presumably the unused position is already zero because
+              * jq0 and isaj0 are zero there - confirm, in particular for the case
+              * where r00 is not finite in that position.
+              */
+             rt               = _fjsp_mul_v2r8(r00,gbscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             gbeps            = _fjsp_sub_v2r8(rt,_fjsp_xtod_v2r8(itab_tmp));
+             _fjsp_store_v2r8(&gbconv.simd,itab_tmp);
+             Y                = _fjsp_load_v2r8( gbtab + 4*gbconv.i[0] );
+             F                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( gbtab + 4*gbconv.i[0] +2);
+             H                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(gbeps,_fjsp_madd_v2r8(gbeps,H,G),F);
+             VV               = _fjsp_madd_v2r8(gbeps,Fp,Y);
+             vgb              = _fjsp_mul_v2r8(gbqqfactor,VV);
+             twogbeps         = _fjsp_add_v2r8(gbeps,gbeps);
+             FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twogbeps,H,G),gbeps,Fp);
+             fgb              = _fjsp_mul_v2r8(gbqqfactor,_fjsp_mul_v2r8(FF,gbscale));
+             dvdatmp          = _fjsp_mul_v2r8(minushalf,_fjsp_madd_v2r8(fgb,r00,vgb));
+             dvdasum          = _fjsp_add_v2r8(dvdasum,dvdatmp);
+             gmx_fjsp_increment_1real_v2r8(dvda+jnrA,_fjsp_mul_v2r8(dvdatmp,_fjsp_mul_v2r8(isaj0,isaj0)));
+             velec            = _fjsp_mul_v2r8(qq00,rinv00);
+             felec            = _fjsp_mul_v2r8(_fjsp_msub_v2r8(velec,rinv00,fgb),rinv00);
+             /* CUBIC SPLINE TABLE DISPERSION (single j atom: table offsets 0 and 2 at vfconv.i[0]) */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 2 );
+             H                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+             FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+             fvdw6            = _fjsp_mul_v2r8(c6_00,FF);
+             /* CUBIC SPLINE TABLE REPULSION (single j atom: table offsets 4 and 6 at vfconv.i[0]) */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 4 );
+             F                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 6 );
+             H                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+             FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+             fvdw12           = _fjsp_mul_v2r8(c12_00,FF);
+             fvdw             = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_add_v2r8(fvdw6,fvdw12),_fjsp_mul_v2r8(vftabscale,rinv00)));
+             fscal            = _fjsp_add_v2r8(felec,fvdw);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8()); /* presumably clears the unused position so it adds no force - confirm */
+             /* Update vectorial force (i atom accumulators, then the single j atom through the 1ptr helper) */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             
+             gmx_fjsp_decrement_fma_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fscal,dx00,dy00,dz00);
+             /* Inner loop uses 85 flops */
+         }
+         /* End of innermost loop.
+          * The accumulated i-atom force is handed to the helper together with the
+          * f and fshift destinations for this i atom. dvdasum is then scaled by
+          * isai0*isai0 and passed on with dvda+inr as destination.
+          */
+         gmx_fjsp_update_iforce_1atom_swizzle_v2r8(fix0,fiy0,fiz0,
+                                               f+i_coord_offset,fshift+i_shift_offset);
+         dvdasum = _fjsp_mul_v2r8(dvdasum, _fjsp_mul_v2r8(isai0,isai0));
+         gmx_fjsp_update_1pot_v2r8(dvdasum,dvda+inr);
+         /* Increment number of inner iterations (counts j atoms, including an unpaired one) */
+         inneriter                  += j_index_end - j_index_start;
+         /* Outer loop uses 7 flops */
+     }
+     /* Increment number of outer iterations */
+     outeriter        += nri;
+     /* Update outer/inner flops, using the per-iteration counts quoted in the loop comments (7 outer, 85 inner) */
+     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_F,outeriter*7 + inneriter*85);
+ }
index 0000000000000000000000000000000000000000,0a585348598828cf96ad59200510743e661faaee..0a585348598828cf96ad59200510743e661faaee
mode 000000,100644..100644
--- /dev/null
@@@ -1,0 -1,706 +1,706 @@@
+ /*
+  * This file is part of the GROMACS molecular simulation package.
+  *
+  * Copyright (c) 2012, by the GROMACS development team, led by
+  * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+  * others, as listed in the AUTHORS file in the top-level source
+  * directory and at http://www.gromacs.org.
+  *
+  * GROMACS is free software; you can redistribute it and/or
+  * modify it under the terms of the GNU Lesser General Public License
+  * as published by the Free Software Foundation; either version 2.1
+  * of the License, or (at your option) any later version.
+  *
+  * GROMACS is distributed in the hope that it will be useful,
+  * but WITHOUT ANY WARRANTY; without even the implied warranty of
+  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  * Lesser General Public License for more details.
+  *
+  * You should have received a copy of the GNU Lesser General Public
+  * License along with GROMACS; if not, see
+  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+  *
+  * If you want to redistribute modifications to GROMACS, please
+  * consider that scientific software is very special. Version
+  * control is crucial - bugs must be traceable. We will be happy to
+  * consider code for inclusion in the official distribution, but
+  * derived work must not be called official GROMACS. Details are found
+  * in the README & COPYING files - if they are missing, get the
+  * official version at http://www.gromacs.org.
+  *
+  * To help us fund GROMACS development, we humbly ask that you cite
+  * the research papers on the package. Check out http://www.gromacs.org.
+  */
+ /*
+  * Note: this file was generated by the GROMACS sparc64_hpc_ace_double kernel generator.
+  */
+ #ifdef HAVE_CONFIG_H
+ #include <config.h>
+ #endif
+ #include <math.h>
+ #include "../nb_kernel.h"
+ #include "types/simple.h"
+ #include "vec.h"
+ #include "nrnb.h"
+ #include "kernelutil_sparc64_hpc_ace_double.h"
+ /*
+  * Gromacs nonbonded kernel:   nb_kernel_ElecGB_VdwLJ_GeomP1P1_VF_sparc64_hpc_ace_double
+  * Electrostatics interaction: GeneralizedBorn
+  * VdW interaction:            LennardJones
+  * Geometry:                   Particle-Particle
+  * Calculate force/pot:        PotentialAndForce
+  */
+ void
+ nb_kernel_ElecGB_VdwLJ_GeomP1P1_VF_sparc64_hpc_ace_double
+                     (t_nblist * gmx_restrict                nlist,
+                      rvec * gmx_restrict                    xx,
+                      rvec * gmx_restrict                    ff,
+                      t_forcerec * gmx_restrict              fr,
+                      t_mdatoms * gmx_restrict               mdatoms,
+                      nb_kernel_data_t * gmx_restrict        kernel_data,
+                      t_nrnb * gmx_restrict                  nrnb)
+ {
+     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+      * just 0 for non-waters.
+      * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+      * jnr indices corresponding to data put in the four positions in the SIMD register.
+      */
+     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+     int              jnrA,jnrB;
+     int              j_coord_offsetA,j_coord_offsetB;
+     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+     real             rcutoff_scalar;
+     real             *shiftvec,*fshift,*x,*f;
+     _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+     int              vdwioffset0;
+     _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+     int              vdwjidx0A,vdwjidx0B;
+     _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+     _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+     _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+     real             *charge;
+     _fjsp_v2r8       vgb,fgb,vgbsum,dvdasum,gbscale,gbtabscale,isaprod,gbqqfactor,gbinvepsdiff,dvdaj,gbeps,twogbeps,dvdatmp;
+     _fjsp_v2r8       minushalf = gmx_fjsp_set1_v2r8(-0.5);
+     real             *invsqrta,*dvda,*gbtab;
+     int              nvdwtype;
+     _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+     int              *vdwtype;
+     real             *vdwparam;
+     _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+     _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+     _fjsp_v2r8       rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF,twovfeps;
+     real             *vftab;
+     _fjsp_v2r8       itab_tmp;
+     _fjsp_v2r8       dummy_mask,cutoff_mask;
+     _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+     _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+     union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+     x                = xx[0];
+     f                = ff[0];
+     nri              = nlist->nri;
+     iinr             = nlist->iinr;
+     jindex           = nlist->jindex;
+     jjnr             = nlist->jjnr;
+     shiftidx         = nlist->shift;
+     gid              = nlist->gid;
+     shiftvec         = fr->shift_vec[0];
+     fshift           = fr->fshift[0];
+     facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+     charge           = mdatoms->chargeA;
+     nvdwtype         = fr->ntype;
+     vdwparam         = fr->nbfp;
+     vdwtype          = mdatoms->typeA;
+     invsqrta         = fr->invsqrta;
+     dvda             = fr->dvda;
+     gbtabscale       = gmx_fjsp_set1_v2r8(fr->gbtab.scale);
+     gbtab            = fr->gbtab.data;
+     gbinvepsdiff     = gmx_fjsp_set1_v2r8((1.0/fr->epsilon_r) - (1.0/fr->gb_epsilon_solvent));
+     /* Avoid stupid compiler warnings */
+     jnrA = jnrB = 0;
+     j_coord_offsetA = 0;
+     j_coord_offsetB = 0;
+     outeriter        = 0;
+     inneriter        = 0;
+     /* Start outer loop over neighborlists */
+     for(iidx=0; iidx<nri; iidx++)
+     {
+         /* Load shift vector for this list */
+         i_shift_offset   = DIM*shiftidx[iidx];
+         /* Load limits for loop over neighbors */
+         j_index_start    = jindex[iidx];
+         j_index_end      = jindex[iidx+1];
+         /* Get outer coordinate index */
+         inr              = iinr[iidx];
+         i_coord_offset   = DIM*inr;
+         /* Load i particle coords and add shift vector */
+         gmx_fjsp_load_shift_and_1rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
+         fix0             = _fjsp_setzero_v2r8();
+         fiy0             = _fjsp_setzero_v2r8();
+         fiz0             = _fjsp_setzero_v2r8();
+         /* Load parameters for i particles */
+         iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_load1_v2r8(charge+inr+0));
+         isai0            = gmx_fjsp_load1_v2r8(invsqrta+inr+0);
+         vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+         /* Reset potential sums */
+         velecsum         = _fjsp_setzero_v2r8();
+         vgbsum           = _fjsp_setzero_v2r8();
+         vvdwsum          = _fjsp_setzero_v2r8();
+         dvdasum          = _fjsp_setzero_v2r8();
+         /* Start inner kernel loop */
+         for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+         {
+             /* Get j neighbor index, and coordinate index */
+             jnrA             = jjnr[jidx];
+             jnrB             = jjnr[jidx+1];
+             j_coord_offsetA  = DIM*jnrA;
+             j_coord_offsetB  = DIM*jnrB;
+             /* load j atom coordinates */
+             gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                               &jx0,&jy0,&jz0);
+             /* Calculate displacement vector */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             /* Calculate squared distance and things based on it */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+             rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+             /* Load parameters for j particles */
+             jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+             isaj0            = gmx_fjsp_load_2real_swizzle_v2r8(invsqrta+jnrA+0,invsqrta+jnrB+0);
+             vdwjidx0A        = 2*vdwtype[jnrA+0];
+             vdwjidx0B        = 2*vdwtype[jnrB+0];
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+             /* Compute parameters for interactions between i and j atoms */
+             qq00             = _fjsp_mul_v2r8(iq0,jq0);
+             gmx_fjsp_load_2pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,
+                                          vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
+             /* GENERALIZED BORN AND COULOMB ELECTROSTATICS */
+             isaprod          = _fjsp_mul_v2r8(isai0,isaj0);
+             gbqqfactor       = _fjsp_neg_v2r8(_fjsp_mul_v2r8(qq00,_fjsp_mul_v2r8(isaprod,gbinvepsdiff)));
+             gbscale          = _fjsp_mul_v2r8(isaprod,gbtabscale);
+             /* Calculate generalized born table index - this is a separate table from the normal one,
+              * but we use the same procedure by multiplying r with scale and truncating to integer.
+              */
+             rt               = _fjsp_mul_v2r8(r00,gbscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             gbeps            = _fjsp_sub_v2r8(rt,_fjsp_xtod_v2r8(itab_tmp));
+             _fjsp_store_v2r8(&gbconv.simd,itab_tmp);
+             Y                = _fjsp_load_v2r8( gbtab + 4*gbconv.i[0] );
+             F                = _fjsp_load_v2r8( gbtab + 4*gbconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( gbtab + 4*gbconv.i[0] +2);
+             H                = _fjsp_load_v2r8( gbtab + 4*gbconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(gbeps,_fjsp_madd_v2r8(gbeps,H,G),F);
+             VV               = _fjsp_madd_v2r8(gbeps,Fp,Y);
+             vgb              = _fjsp_mul_v2r8(gbqqfactor,VV);
+             twogbeps         = _fjsp_add_v2r8(gbeps,gbeps);
+             FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twogbeps,H,G),gbeps,Fp);
+             fgb              = _fjsp_mul_v2r8(gbqqfactor,_fjsp_mul_v2r8(FF,gbscale));
+             dvdatmp          = _fjsp_mul_v2r8(minushalf,_fjsp_madd_v2r8(fgb,r00,vgb));
+             dvdasum          = _fjsp_add_v2r8(dvdasum,dvdatmp);
+             gmx_fjsp_increment_2real_swizzle_v2r8(dvda+jnrA,dvda+jnrB,_fjsp_mul_v2r8(dvdatmp,_fjsp_mul_v2r8(isaj0,isaj0)));
+             velec            = _fjsp_mul_v2r8(qq00,rinv00);
+             felec            = _fjsp_mul_v2r8(_fjsp_msub_v2r8(velec,rinv00,fgb),rinv00);
+             /* LENNARD-JONES DISPERSION/REPULSION */
+             rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+             vvdw6            = _fjsp_mul_v2r8(c6_00,rinvsix);
+             vvdw12           = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
+             vvdw             = _fjsp_msub_v2r8( vvdw12,one_twelfth, _fjsp_mul_v2r8(vvdw6,one_sixth) );
+             fvdw             = _fjsp_mul_v2r8(_fjsp_sub_v2r8(vvdw12,vvdw6),rinvsq00);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             vgbsum           = _fjsp_add_v2r8(vgbsum,vgb);
+             vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+             fscal            = _fjsp_add_v2r8(felec,fvdw);
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             
+             gmx_fjsp_decrement_fma_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fscal,dx00,dy00,dz00);
+             /* Inner loop uses 74 flops */
+         }
+         if(jidx<j_index_end)
+         {
+             jnrA             = jjnr[jidx];
+             j_coord_offsetA  = DIM*jnrA;
+             /* load j atom coordinates */
+             gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                               &jx0,&jy0,&jz0);
+             /* Calculate displacement vector */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             /* Calculate squared distance and things based on it */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+             rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+             /* Load parameters for j particles */
+             jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
+             isaj0            = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),invsqrta+jnrA+0);
+             vdwjidx0A        = 2*vdwtype[jnrA+0];
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+             /* Compute parameters for interactions between i and j atoms */
+             qq00             = _fjsp_mul_v2r8(iq0,jq0);
+             gmx_fjsp_load_1pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
+             /* GENERALIZED BORN AND COULOMB ELECTROSTATICS */
+             isaprod          = _fjsp_mul_v2r8(isai0,isaj0);
+             gbqqfactor       = _fjsp_neg_v2r8(_fjsp_mul_v2r8(qq00,_fjsp_mul_v2r8(isaprod,gbinvepsdiff)));
+             gbscale          = _fjsp_mul_v2r8(isaprod,gbtabscale);
+             /* Calculate generalized born table index - this is a separate table from the normal one,
+              * but we use the same procedure by multiplying r with scale and truncating to integer.
+              */
+             rt               = _fjsp_mul_v2r8(r00,gbscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             gbeps            = _fjsp_sub_v2r8(rt,_fjsp_xtod_v2r8(itab_tmp));
+             _fjsp_store_v2r8(&gbconv.simd,itab_tmp);
+             Y                = _fjsp_load_v2r8( gbtab + 4*gbconv.i[0] );
+             F                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( gbtab + 4*gbconv.i[0] +2);
+             H                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(gbeps,_fjsp_madd_v2r8(gbeps,H,G),F);
+             VV               = _fjsp_madd_v2r8(gbeps,Fp,Y);
+             vgb              = _fjsp_mul_v2r8(gbqqfactor,VV);
+             twogbeps         = _fjsp_add_v2r8(gbeps,gbeps);
+             FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twogbeps,H,G),gbeps,Fp);
+             fgb              = _fjsp_mul_v2r8(gbqqfactor,_fjsp_mul_v2r8(FF,gbscale));
+             dvdatmp          = _fjsp_mul_v2r8(minushalf,_fjsp_madd_v2r8(fgb,r00,vgb));
+             dvdasum          = _fjsp_add_v2r8(dvdasum,dvdatmp);
+             gmx_fjsp_increment_1real_v2r8(dvda+jnrA,_fjsp_mul_v2r8(dvdatmp,_fjsp_mul_v2r8(isaj0,isaj0)));
+             velec            = _fjsp_mul_v2r8(qq00,rinv00);
+             felec            = _fjsp_mul_v2r8(_fjsp_msub_v2r8(velec,rinv00,fgb),rinv00);
+             /* LENNARD-JONES DISPERSION/REPULSION */
+             rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+             vvdw6            = _fjsp_mul_v2r8(c6_00,rinvsix);
+             vvdw12           = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
+             vvdw             = _fjsp_msub_v2r8( vvdw12,one_twelfth, _fjsp_mul_v2r8(vvdw6,one_sixth) );
+             fvdw             = _fjsp_mul_v2r8(_fjsp_sub_v2r8(vvdw12,vvdw6),rinvsq00);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             vgb              = _fjsp_unpacklo_v2r8(vgb,_fjsp_setzero_v2r8());
+             vgbsum           = _fjsp_add_v2r8(vgbsum,vgb);
+             vvdw             = _fjsp_unpacklo_v2r8(vvdw,_fjsp_setzero_v2r8());
+             vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+             fscal            = _fjsp_add_v2r8(felec,fvdw);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             
+             gmx_fjsp_decrement_fma_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fscal,dx00,dy00,dz00);
+             /* Inner loop uses 74 flops */
+         }
+         /* End of innermost loop */
+         gmx_fjsp_update_iforce_1atom_swizzle_v2r8(fix0,fiy0,fiz0,
+                                               f+i_coord_offset,fshift+i_shift_offset);
+         ggid                        = gid[iidx];
+         /* Update potential energies */
+         gmx_fjsp_update_1pot_v2r8(velecsum,kernel_data->energygrp_elec+ggid);
+         gmx_fjsp_update_1pot_v2r8(vgbsum,kernel_data->energygrp_polarization+ggid);
+         gmx_fjsp_update_1pot_v2r8(vvdwsum,kernel_data->energygrp_vdw+ggid);
+         dvdasum = _fjsp_mul_v2r8(dvdasum, _fjsp_mul_v2r8(isai0,isai0));
+         gmx_fjsp_update_1pot_v2r8(dvdasum,dvda+inr);
+         /* Increment number of inner iterations */
+         inneriter                  += j_index_end - j_index_start;
+         /* Outer loop uses 10 flops */
+     }
+     /* Increment number of outer iterations */
+     outeriter        += nri;
+     /* Update outer/inner flops */
+     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_VF,outeriter*10 + inneriter*74);
+ }
+ /*
+  * Gromacs nonbonded kernel:   nb_kernel_ElecGB_VdwLJ_GeomP1P1_F_sparc64_hpc_ace_double
+  * Electrostatics interaction: GeneralizedBorn
+  * VdW interaction:            LennardJones
+  * Geometry:                   Particle-Particle
+  * Calculate force/pot:        Force
+  *
+  * Force-only variant: for every i entry of the neighbor list, the j atoms in
+  * jjnr[jindex[iidx] .. jindex[iidx+1]) are processed two at a time in a 2-wide
+  * SIMD value, followed by a separate block for a single leftover j atom.
+  *
+  * Parameters, as used by this function:
+  *   nlist       - neighbor list; nri, iinr, jindex, jjnr and shift are read
+  *                 (gid is fetched into a local but not used by this variant).
+  *   xx          - coordinates, read through xx[0] as a flat array of reals.
+  *   ff          - forces; ff[0] is handed to the gmx_fjsp_* force helpers, which
+  *                 presumably accumulate into it.
+  *   fr          - supplies shift_vec, fshift, epsfac, ntype, nbfp, invsqrta, dvda,
+  *                 gbtab (scale and data), epsilon_r and gb_epsilon_solvent; fshift
+  *                 and dvda are handed to helpers that presumably update them.
+  *   mdatoms     - supplies chargeA and typeA.
+  *   kernel_data - not referenced in this force-only variant.
+  *   nrnb        - flop counter, incremented under eNR_NBKERNEL_ELEC_VDW_F.
+  */
+ void
+ nb_kernel_ElecGB_VdwLJ_GeomP1P1_F_sparc64_hpc_ace_double
+                     (t_nblist * gmx_restrict                nlist,
+                      rvec * gmx_restrict                    xx,
+                      rvec * gmx_restrict                    ff,
+                      t_forcerec * gmx_restrict              fr,
+                      t_mdatoms * gmx_restrict               mdatoms,
+                      nb_kernel_data_t * gmx_restrict        kernel_data,
+                      t_nrnb * gmx_restrict                  nrnb)
+ {
+     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+      * just 0 for non-waters.
+      * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+      * jnr indices corresponding to data put in the two positions in the SIMD register
+      * (the conversion unions declared below pair one SIMD value with two integers).
+      * NOTE(review): several locals declared here (e.g. tx, ty, tz, rcutoff_scalar, vftab,
+      * dummy_mask, cutoff_mask) are never referenced in this function body; presumably they are
+      * emitted unconditionally by the kernel generator.
+      */
+     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+     int              jnrA,jnrB;
+     int              j_coord_offsetA,j_coord_offsetB;
+     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+     real             rcutoff_scalar;
+     real             *shiftvec,*fshift,*x,*f;
+     _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+     int              vdwioffset0;
+     _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+     int              vdwjidx0A,vdwjidx0B;
+     _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+     _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+     _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+     real             *charge;
+     _fjsp_v2r8       vgb,fgb,vgbsum,dvdasum,gbscale,gbtabscale,isaprod,gbqqfactor,gbinvepsdiff,dvdaj,gbeps,twogbeps,dvdatmp;
+     _fjsp_v2r8       minushalf = gmx_fjsp_set1_v2r8(-0.5);
+     real             *invsqrta,*dvda,*gbtab;
+     int              nvdwtype;
+     _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+     int              *vdwtype;
+     real             *vdwparam;
+     _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+     _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+     _fjsp_v2r8       rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF,twovfeps;
+     real             *vftab;
+     _fjsp_v2r8       itab_tmp;
+     _fjsp_v2r8       dummy_mask,cutoff_mask;
+     _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+     _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+     union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv; /* only gbconv is used below, to read the table indices back as integers */
+     x                = xx[0]; /* coordinates addressed as a flat real array, DIM reals per atom */
+     f                = ff[0]; /* forces addressed the same way */
+     nri              = nlist->nri;
+     iinr             = nlist->iinr;
+     jindex           = nlist->jindex;
+     jjnr             = nlist->jjnr;
+     shiftidx         = nlist->shift;
+     gid              = nlist->gid;
+     shiftvec         = fr->shift_vec[0];
+     fshift           = fr->fshift[0];
+     facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+     charge           = mdatoms->chargeA;
+     nvdwtype         = fr->ntype;
+     vdwparam         = fr->nbfp;
+     vdwtype          = mdatoms->typeA;
+     invsqrta         = fr->invsqrta;
+     dvda             = fr->dvda;
+     gbtabscale       = gmx_fjsp_set1_v2r8(fr->gbtab.scale);
+     gbtab            = fr->gbtab.data;
+     gbinvepsdiff     = gmx_fjsp_set1_v2r8((1.0/fr->epsilon_r) - (1.0/fr->gb_epsilon_solvent));
+     /* Zero the j indices and offsets (this only silences uninitialised-variable warnings)
+      * and the iteration counters that are accumulated further down.
+      */
+     jnrA = jnrB = 0;
+     j_coord_offsetA = 0;
+     j_coord_offsetB = 0;
+     outeriter        = 0;
+     inneriter        = 0;
+     /* Start outer loop over neighborlists */
+     for(iidx=0; iidx<nri; iidx++)
+     {
+         /* Load shift vector for this list */
+         i_shift_offset   = DIM*shiftidx[iidx];
+         /* Load limits for loop over neighbors */
+         j_index_start    = jindex[iidx];
+         j_index_end      = jindex[iidx+1];
+         /* Get outer coordinate index */
+         inr              = iinr[iidx];
+         i_coord_offset   = DIM*inr;
+         /* Load i particle coords and add shift vector */
+         gmx_fjsp_load_shift_and_1rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
+         fix0             = _fjsp_setzero_v2r8();
+         fiy0             = _fjsp_setzero_v2r8();
+         fiz0             = _fjsp_setzero_v2r8();
+         /* Load parameters for i particles */
+         iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_load1_v2r8(charge+inr+0)); /* i charge pre-multiplied by epsfac */
+         isai0            = gmx_fjsp_load1_v2r8(invsqrta+inr+0);
+         vdwioffset0      = 2*nvdwtype*vdwtype[inr+0]; /* offset of the i type's row in vdwparam; the factor 2 presumably reflects one (c6,c12) pair per type pair */
+         dvdasum          = _fjsp_setzero_v2r8();
+         /* Start inner kernel loop: two j atoms (A and B) per iteration; an odd leftover is handled after the loop */
+         for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+         {
+             /* Get j neighbor index, and coordinate index */
+             jnrA             = jjnr[jidx];
+             jnrB             = jjnr[jidx+1];
+             j_coord_offsetA  = DIM*jnrA;
+             j_coord_offsetB  = DIM*jnrB;
+             /* load j atom coordinates */
+             gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                               &jx0,&jy0,&jz0);
+             /* Calculate displacement vector */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             /* Calculate squared distance and things based on it */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+             rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+             /* Load parameters for j particles */
+             jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+             isaj0            = gmx_fjsp_load_2real_swizzle_v2r8(invsqrta+jnrA+0,invsqrta+jnrB+0);
+             vdwjidx0A        = 2*vdwtype[jnrA+0];
+             vdwjidx0B        = 2*vdwtype[jnrB+0];
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r00              = _fjsp_mul_v2r8(rsq00,rinv00); /* r = rsq * (1/r) */
+             /* Compute parameters for interactions between i and j atoms */
+             qq00             = _fjsp_mul_v2r8(iq0,jq0);
+             gmx_fjsp_load_2pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,
+                                          vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
+             /* GENERALIZED BORN AND COULOMB ELECTROSTATICS
+              * The formulas noted on the lines below assume _fjsp_madd_v2r8(a,b,c) = a*b+c and
+              * _fjsp_msub_v2r8(a,b,c) = a*b-c -- TODO confirm against the intrinsic reference.
+              */
+             isaprod          = _fjsp_mul_v2r8(isai0,isaj0);
+             gbqqfactor       = _fjsp_neg_v2r8(_fjsp_mul_v2r8(qq00,_fjsp_mul_v2r8(isaprod,gbinvepsdiff)));
+             gbscale          = _fjsp_mul_v2r8(isaprod,gbtabscale);
+             /* Calculate generalized born table index - this is a separate table from the normal one,
+              * but we use the same procedure by multiplying r with scale and truncating to integer.
+              * The integer indices are read back through the gbconv union, and the table is
+              * addressed with a stride of 4 reals per index (offsets +0 and +2 are loaded).
+              */
+             rt               = _fjsp_mul_v2r8(r00,gbscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             gbeps            = _fjsp_sub_v2r8(rt,_fjsp_xtod_v2r8(itab_tmp)); /* fractional part of rt */
+             _fjsp_store_v2r8(&gbconv.simd,itab_tmp);
+             Y                = _fjsp_load_v2r8( gbtab + 4*gbconv.i[0] ); /* first two table reals for j atom A */
+             F                = _fjsp_load_v2r8( gbtab + 4*gbconv.i[1] ); /* first two table reals for j atom B */
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F); /* presumably regroups so Y and F each hold one coefficient for both atoms */
+             G                = _fjsp_load_v2r8( gbtab + 4*gbconv.i[0] +2);
+             H                = _fjsp_load_v2r8( gbtab + 4*gbconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(gbeps,_fjsp_madd_v2r8(gbeps,H,G),F); /* F + gbeps*(G + gbeps*H) */
+             VV               = _fjsp_madd_v2r8(gbeps,Fp,Y); /* Y + gbeps*Fp */
+             vgb              = _fjsp_mul_v2r8(gbqqfactor,VV);
+             twogbeps         = _fjsp_add_v2r8(gbeps,gbeps);
+             FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twogbeps,H,G),gbeps,Fp); /* Fp + gbeps*(G + 2*gbeps*H) */
+             fgb              = _fjsp_mul_v2r8(gbqqfactor,_fjsp_mul_v2r8(FF,gbscale));
+             dvdatmp          = _fjsp_mul_v2r8(minushalf,_fjsp_madd_v2r8(fgb,r00,vgb)); /* -0.5*(fgb*r00 + vgb) */
+             dvdasum          = _fjsp_add_v2r8(dvdasum,dvdatmp);
+             gmx_fjsp_increment_2real_swizzle_v2r8(dvda+jnrA,dvda+jnrB,_fjsp_mul_v2r8(dvdatmp,_fjsp_mul_v2r8(isaj0,isaj0))); /* j-side contribution: dvdatmp*isaj0^2 */
+             velec            = _fjsp_mul_v2r8(qq00,rinv00);
+             felec            = _fjsp_mul_v2r8(_fjsp_msub_v2r8(velec,rinv00,fgb),rinv00); /* (velec*rinv00 - fgb)*rinv00 */
+             /* LENNARD-JONES DISPERSION/REPULSION */
+             rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+             fvdw             = _fjsp_mul_v2r8(_fjsp_msub_v2r8(c12_00,rinvsix,c6_00),_fjsp_mul_v2r8(rinvsix,rinvsq00)); /* (c12*rinvsix - c6)*rinvsix*rinvsq00 */
+             fscal            = _fjsp_add_v2r8(felec,fvdw);
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             
+             gmx_fjsp_decrement_fma_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fscal,dx00,dy00,dz00); /* presumably subtracts fscal*d from the forces of both j atoms */
+             /* Inner loop uses 67 flops */
+         }
+         if(jidx<j_index_end) /* one unpaired j atom remains */
+         {
+             jnrA             = jjnr[jidx];
+             j_coord_offsetA  = DIM*jnrA;
+             /* load j atom coordinates */
+             gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                               &jx0,&jy0,&jz0);
+             /* Calculate displacement vector */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             /* Calculate squared distance and things based on it */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+             rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+             /* Load parameters for j particles */
+             jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0); /* presumably fills one lane and keeps the other at zero -- TODO confirm */
+             isaj0            = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),invsqrta+jnrA+0);
+             vdwjidx0A        = 2*vdwtype[jnrA+0];
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+             /* Compute parameters for interactions between i and j atoms */
+             qq00             = _fjsp_mul_v2r8(iq0,jq0);
+             gmx_fjsp_load_1pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
+             /* GENERALIZED BORN AND COULOMB ELECTROSTATICS
+              * Same arithmetic as in the paired loop above, applied to a single j atom.
+              */
+             isaprod          = _fjsp_mul_v2r8(isai0,isaj0);
+             gbqqfactor       = _fjsp_neg_v2r8(_fjsp_mul_v2r8(qq00,_fjsp_mul_v2r8(isaprod,gbinvepsdiff)));
+             gbscale          = _fjsp_mul_v2r8(isaprod,gbtabscale);
+             /* Calculate generalized born table index - this is a separate table from the normal one,
+              * but we use the same procedure by multiplying r with scale and truncating to integer.
+              * Only gbconv.i[0] is used here; zeros stand in for the absent second atom's table data.
+              */
+             rt               = _fjsp_mul_v2r8(r00,gbscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             gbeps            = _fjsp_sub_v2r8(rt,_fjsp_xtod_v2r8(itab_tmp));
+             _fjsp_store_v2r8(&gbconv.simd,itab_tmp);
+             Y                = _fjsp_load_v2r8( gbtab + 4*gbconv.i[0] );
+             F                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( gbtab + 4*gbconv.i[0] +2);
+             H                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(gbeps,_fjsp_madd_v2r8(gbeps,H,G),F);
+             VV               = _fjsp_madd_v2r8(gbeps,Fp,Y);
+             vgb              = _fjsp_mul_v2r8(gbqqfactor,VV);
+             twogbeps         = _fjsp_add_v2r8(gbeps,gbeps);
+             FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twogbeps,H,G),gbeps,Fp);
+             fgb              = _fjsp_mul_v2r8(gbqqfactor,_fjsp_mul_v2r8(FF,gbscale));
+             dvdatmp          = _fjsp_mul_v2r8(minushalf,_fjsp_madd_v2r8(fgb,r00,vgb));
+             dvdasum          = _fjsp_add_v2r8(dvdasum,dvdatmp); /* NOTE(review): dvdatmp is not masked here; presumably its unused lane is already zero via jq0/isaj0 -- confirm */
+             gmx_fjsp_increment_1real_v2r8(dvda+jnrA,_fjsp_mul_v2r8(dvdatmp,_fjsp_mul_v2r8(isaj0,isaj0)));
+             velec            = _fjsp_mul_v2r8(qq00,rinv00);
+             felec            = _fjsp_mul_v2r8(_fjsp_msub_v2r8(velec,rinv00,fgb),rinv00);
+             /* LENNARD-JONES DISPERSION/REPULSION */
+             rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+             fvdw             = _fjsp_mul_v2r8(_fjsp_msub_v2r8(c12_00,rinvsix,c6_00),_fjsp_mul_v2r8(rinvsix,rinvsq00));
+             fscal            = _fjsp_add_v2r8(felec,fvdw);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8()); /* presumably clears the unused lane so it contributes no force -- TODO confirm */
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             
+             gmx_fjsp_decrement_fma_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fscal,dx00,dy00,dz00);
+             /* Inner loop uses 67 flops */
+         }
+         /* End of innermost loop: hand the accumulated i force to the helper together with the
+          * i atom's force slot and this list's shift-force slot (presumably both are incremented).
+          */
+         gmx_fjsp_update_iforce_1atom_swizzle_v2r8(fix0,fiy0,fiz0,
+                                               f+i_coord_offset,fshift+i_shift_offset);
+         dvdasum = _fjsp_mul_v2r8(dvdasum, _fjsp_mul_v2r8(isai0,isai0)); /* i-side contribution: dvdasum*isai0^2 */
+         gmx_fjsp_update_1pot_v2r8(dvdasum,dvda+inr); /* presumably adds the lane sum to dvda[inr] */
+         /* Increment number of inner iterations */
+         inneriter                  += j_index_end - j_index_start;
+         /* Outer loop uses 7 flops */
+     }
+     /* Increment number of outer iterations */
+     outeriter        += nri;
+     /* Update outer/inner flops */
+     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_F,outeriter*7 + inneriter*67);
+ }
index 0000000000000000000000000000000000000000,bc2317e750b0c75bf5d1e68399f5572dbdac023b..bc2317e750b0c75bf5d1e68399f5572dbdac023b
mode 000000,100644..100644
--- /dev/null
@@@ -1,0 -1,635 +1,635 @@@
+ /*
+  * This file is part of the GROMACS molecular simulation package.
+  *
+  * Copyright (c) 2012, by the GROMACS development team, led by
+  * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+  * others, as listed in the AUTHORS file in the top-level source
+  * directory and at http://www.gromacs.org.
+  *
+  * GROMACS is free software; you can redistribute it and/or
+  * modify it under the terms of the GNU Lesser General Public License
+  * as published by the Free Software Foundation; either version 2.1
+  * of the License, or (at your option) any later version.
+  *
+  * GROMACS is distributed in the hope that it will be useful,
+  * but WITHOUT ANY WARRANTY; without even the implied warranty of
+  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  * Lesser General Public License for more details.
+  *
+  * You should have received a copy of the GNU Lesser General Public
+  * License along with GROMACS; if not, see
+  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+  *
+  * If you want to redistribute modifications to GROMACS, please
+  * consider that scientific software is very special. Version
+  * control is crucial - bugs must be traceable. We will be happy to
+  * consider code for inclusion in the official distribution, but
+  * derived work must not be called official GROMACS. Details are found
+  * in the README & COPYING files - if they are missing, get the
+  * official version at http://www.gromacs.org.
+  *
+  * To help us fund GROMACS development, we humbly ask that you cite
+  * the research papers on the package. Check out http://www.gromacs.org.
+  */
+ /*
+  * Note: this file was generated by the GROMACS sparc64_hpc_ace_double kernel generator.
+  */
+ #ifdef HAVE_CONFIG_H
+ #include <config.h>
+ #endif
+ #include <math.h>
+ #include "../nb_kernel.h"
+ #include "types/simple.h"
+ #include "vec.h"
+ #include "nrnb.h"
+ #include "kernelutil_sparc64_hpc_ace_double.h"
+ /*
+  * Gromacs nonbonded kernel:   nb_kernel_ElecGB_VdwNone_GeomP1P1_VF_sparc64_hpc_ace_double
+  * Electrostatics interaction: GeneralizedBorn
+  * VdW interaction:            None
+  * Geometry:                   Particle-Particle
+  * Calculate force/pot:        PotentialAndForce
+  *
+  * For each i entry of nlist the j neighbors are processed two at a time (one per
+  * SIMD element); an odd trailing neighbor is handled in a separate epilogue that
+  * only uses the lower SIMD element.
+  *
+  * Outputs written by this kernel:
+  *  - ff and fr->fshift are updated with the pair forces,
+  *  - kernel_data->energygrp_elec and kernel_data->energygrp_polarization are
+  *    updated at index nlist->gid[i-entry] with the Coulomb and GB energies,
+  *  - fr->dvda is updated for both the i atom and every j atom,
+  *  - nrnb receives the flop count under eNR_NBKERNEL_ELEC_VF.
+  */
+ void
+ nb_kernel_ElecGB_VdwNone_GeomP1P1_VF_sparc64_hpc_ace_double
+                     (t_nblist * gmx_restrict                nlist,
+                      rvec * gmx_restrict                    xx,
+                      rvec * gmx_restrict                    ff,
+                      t_forcerec * gmx_restrict              fr,
+                      t_mdatoms * gmx_restrict               mdatoms,
+                      nb_kernel_data_t * gmx_restrict        kernel_data,
+                      t_nrnb * gmx_restrict                  nrnb)
+ {
+     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+      * just 0 for non-waters.
+      * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+      * jnr indices corresponding to data put in the two positions in the SIMD register.
+      */
+     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+     int              jnrA,jnrB;
+     int              j_coord_offsetA,j_coord_offsetB;
+     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+     real             rcutoff_scalar;
+     real             *shiftvec,*fshift,*x,*f;
+     _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+     int              vdwioffset0;
+     _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+     int              vdwjidx0A,vdwjidx0B;
+     _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+     _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+     _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+     real             *charge;
+     _fjsp_v2r8       vgb,fgb,vgbsum,dvdasum,gbscale,gbtabscale,isaprod,gbqqfactor,gbinvepsdiff,dvdaj,gbeps,twogbeps,dvdatmp;
+     _fjsp_v2r8       minushalf = gmx_fjsp_set1_v2r8(-0.5);
+     real             *invsqrta,*dvda,*gbtab;
+     _fjsp_v2r8       rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF,twovfeps;
+     real             *vftab;
+     _fjsp_v2r8       itab_tmp;
+     _fjsp_v2r8       dummy_mask,cutoff_mask;
+     _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+     _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+     /* gbconv is used to read the two integer table indices back out of a SIMD register */
+     union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+     x                = xx[0];
+     f                = ff[0];
+     nri              = nlist->nri;
+     iinr             = nlist->iinr;
+     jindex           = nlist->jindex;
+     jjnr             = nlist->jjnr;
+     shiftidx         = nlist->shift;
+     gid              = nlist->gid;
+     shiftvec         = fr->shift_vec[0];
+     fshift           = fr->fshift[0];
+     facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+     charge           = mdatoms->chargeA;
+     invsqrta         = fr->invsqrta;
+     dvda             = fr->dvda;
+     gbtabscale       = gmx_fjsp_set1_v2r8(fr->gbtab.scale);
+     gbtab            = fr->gbtab.data;
+     gbinvepsdiff     = gmx_fjsp_set1_v2r8((1.0/fr->epsilon_r) - (1.0/fr->gb_epsilon_solvent));
+     /* Avoid stupid compiler warnings */
+     jnrA = jnrB = 0;
+     j_coord_offsetA = 0;
+     j_coord_offsetB = 0;
+     outeriter        = 0;
+     inneriter        = 0;
+     /* Start outer loop over neighborlists */
+     for(iidx=0; iidx<nri; iidx++)
+     {
+         /* Load shift vector for this list */
+         i_shift_offset   = DIM*shiftidx[iidx];
+         /* Load limits for loop over neighbors */
+         j_index_start    = jindex[iidx];
+         j_index_end      = jindex[iidx+1];
+         /* Get outer coordinate index */
+         inr              = iinr[iidx];
+         i_coord_offset   = DIM*inr;
+         /* Load i particle coords and add shift vector */
+         gmx_fjsp_load_shift_and_1rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
+         fix0             = _fjsp_setzero_v2r8();
+         fiy0             = _fjsp_setzero_v2r8();
+         fiz0             = _fjsp_setzero_v2r8();
+         /* Load parameters for i particles */
+         iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_load1_v2r8(charge+inr+0));
+         isai0            = gmx_fjsp_load1_v2r8(invsqrta+inr+0);
+         /* Reset potential sums */
+         velecsum         = _fjsp_setzero_v2r8();
+         vgbsum           = _fjsp_setzero_v2r8();
+         dvdasum          = _fjsp_setzero_v2r8();
+         /* Start inner kernel loop: two j atoms (A,B) per iteration */
+         for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+         {
+             /* Get j neighbor index, and coordinate index */
+             jnrA             = jjnr[jidx];
+             jnrB             = jjnr[jidx+1];
+             j_coord_offsetA  = DIM*jnrA;
+             j_coord_offsetB  = DIM*jnrB;
+             /* load j atom coordinates */
+             gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                               &jx0,&jy0,&jz0);
+             /* Calculate displacement vector */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             /* Calculate squared distance and things based on it */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+             /* Load parameters for j particles */
+             jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+             isaj0            = gmx_fjsp_load_2real_swizzle_v2r8(invsqrta+jnrA+0,invsqrta+jnrB+0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+             /* Compute parameters for interactions between i and j atoms */
+             qq00             = _fjsp_mul_v2r8(iq0,jq0);
+             /* GENERALIZED BORN AND COULOMB ELECTROSTATICS */
+             isaprod          = _fjsp_mul_v2r8(isai0,isaj0);
+             gbqqfactor       = _fjsp_neg_v2r8(_fjsp_mul_v2r8(qq00,_fjsp_mul_v2r8(isaprod,gbinvepsdiff)));
+             gbscale          = _fjsp_mul_v2r8(isaprod,gbtabscale);
+             /* Calculate generalized born table index - this is a separate table from the normal one,
+              * but we use the same procedure by multiplying r with scale and truncating to integer.
+              */
+             rt               = _fjsp_mul_v2r8(r00,gbscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             gbeps            = _fjsp_sub_v2r8(rt,_fjsp_xtod_v2r8(itab_tmp));
+             _fjsp_store_v2r8(&gbconv.simd,itab_tmp);
+             /* Four table values per point: load pairs for atoms A and B, then transpose */
+             Y                = _fjsp_load_v2r8( gbtab + 4*gbconv.i[0] );
+             F                = _fjsp_load_v2r8( gbtab + 4*gbconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( gbtab + 4*gbconv.i[0] +2);
+             H                = _fjsp_load_v2r8( gbtab + 4*gbconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(gbeps,_fjsp_madd_v2r8(gbeps,H,G),F);
+             VV               = _fjsp_madd_v2r8(gbeps,Fp,Y);
+             vgb              = _fjsp_mul_v2r8(gbqqfactor,VV);
+             twogbeps         = _fjsp_add_v2r8(gbeps,gbeps);
+             FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twogbeps,H,G),gbeps,Fp);
+             fgb              = _fjsp_mul_v2r8(gbqqfactor,_fjsp_mul_v2r8(FF,gbscale));
+             dvdatmp          = _fjsp_mul_v2r8(minushalf,_fjsp_madd_v2r8(fgb,r00,vgb));
+             dvdasum          = _fjsp_add_v2r8(dvdasum,dvdatmp);
+             gmx_fjsp_increment_2real_swizzle_v2r8(dvda+jnrA,dvda+jnrB,_fjsp_mul_v2r8(dvdatmp,_fjsp_mul_v2r8(isaj0,isaj0)));
+             velec            = _fjsp_mul_v2r8(qq00,rinv00);
+             felec            = _fjsp_mul_v2r8(_fjsp_msub_v2r8(velec,rinv00,fgb),rinv00);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             vgbsum           = _fjsp_add_v2r8(vgbsum,vgb);
+             fscal            = felec;
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             
+             gmx_fjsp_decrement_fma_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fscal,dx00,dy00,dz00);
+             /* Inner loop uses 61 flops */
+         }
+         /* Epilogue: one j atom left over when the neighbor count is odd.
+          * Only the lower SIMD element carries data; j parameters are loaded with a zeroed upper element.
+          */
+         if(jidx<j_index_end)
+         {
+             jnrA             = jjnr[jidx];
+             j_coord_offsetA  = DIM*jnrA;
+             /* load j atom coordinates */
+             gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                               &jx0,&jy0,&jz0);
+             /* Calculate displacement vector */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             /* Calculate squared distance and things based on it */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+             /* Load parameters for j particles */
+             jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
+             isaj0            = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),invsqrta+jnrA+0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+             /* Compute parameters for interactions between i and j atoms */
+             qq00             = _fjsp_mul_v2r8(iq0,jq0);
+             /* GENERALIZED BORN AND COULOMB ELECTROSTATICS */
+             isaprod          = _fjsp_mul_v2r8(isai0,isaj0);
+             gbqqfactor       = _fjsp_neg_v2r8(_fjsp_mul_v2r8(qq00,_fjsp_mul_v2r8(isaprod,gbinvepsdiff)));
+             gbscale          = _fjsp_mul_v2r8(isaprod,gbtabscale);
+             /* Calculate generalized born table index - this is a separate table from the normal one,
+              * but we use the same procedure by multiplying r with scale and truncating to integer.
+              */
+             rt               = _fjsp_mul_v2r8(r00,gbscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             gbeps            = _fjsp_sub_v2r8(rt,_fjsp_xtod_v2r8(itab_tmp));
+             _fjsp_store_v2r8(&gbconv.simd,itab_tmp);
+             /* Only index 0 is valid here; the second operand of each transpose is zero */
+             Y                = _fjsp_load_v2r8( gbtab + 4*gbconv.i[0] );
+             F                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( gbtab + 4*gbconv.i[0] +2);
+             H                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(gbeps,_fjsp_madd_v2r8(gbeps,H,G),F);
+             VV               = _fjsp_madd_v2r8(gbeps,Fp,Y);
+             vgb              = _fjsp_mul_v2r8(gbqqfactor,VV);
+             twogbeps         = _fjsp_add_v2r8(gbeps,gbeps);
+             FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twogbeps,H,G),gbeps,Fp);
+             fgb              = _fjsp_mul_v2r8(gbqqfactor,_fjsp_mul_v2r8(FF,gbscale));
+             dvdatmp          = _fjsp_mul_v2r8(minushalf,_fjsp_madd_v2r8(fgb,r00,vgb));
+             /* Clear the unused upper element (same masking as velec, vgb and fscal below) so that
+              * a non-finite value in the padding lane cannot leak into dvdasum and thus dvda[inr].
+              */
+             dvdatmp          = _fjsp_unpacklo_v2r8(dvdatmp,_fjsp_setzero_v2r8());
+             dvdasum          = _fjsp_add_v2r8(dvdasum,dvdatmp);
+             gmx_fjsp_increment_1real_v2r8(dvda+jnrA,_fjsp_mul_v2r8(dvdatmp,_fjsp_mul_v2r8(isaj0,isaj0)));
+             velec            = _fjsp_mul_v2r8(qq00,rinv00);
+             felec            = _fjsp_mul_v2r8(_fjsp_msub_v2r8(velec,rinv00,fgb),rinv00);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             vgb              = _fjsp_unpacklo_v2r8(vgb,_fjsp_setzero_v2r8());
+             vgbsum           = _fjsp_add_v2r8(vgbsum,vgb);
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             
+             gmx_fjsp_decrement_fma_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fscal,dx00,dy00,dz00);
+             /* Inner loop uses 61 flops */
+         }
+         /* End of innermost loop */
+         gmx_fjsp_update_iforce_1atom_swizzle_v2r8(fix0,fiy0,fiz0,
+                                               f+i_coord_offset,fshift+i_shift_offset);
+         ggid                        = gid[iidx];
+         /* Update potential energies */
+         gmx_fjsp_update_1pot_v2r8(velecsum,kernel_data->energygrp_elec+ggid);
+         gmx_fjsp_update_1pot_v2r8(vgbsum,kernel_data->energygrp_polarization+ggid);
+         /* Scale the accumulated dvda contribution by isai0^2 before adding it to the i atom */
+         dvdasum = _fjsp_mul_v2r8(dvdasum, _fjsp_mul_v2r8(isai0,isai0));
+         gmx_fjsp_update_1pot_v2r8(dvdasum,dvda+inr);
+         /* Increment number of inner iterations */
+         inneriter                  += j_index_end - j_index_start;
+         /* Outer loop uses 9 flops */
+     }
+     /* Increment number of outer iterations */
+     outeriter        += nri;
+     /* Update outer/inner flops */
+     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VF,outeriter*9 + inneriter*61);
+ }
+ /*
+  * Gromacs nonbonded kernel:   nb_kernel_ElecGB_VdwNone_GeomP1P1_F_sparc64_hpc_ace_double
+  * Electrostatics interaction: GeneralizedBorn
+  * VdW interaction:            None
+  * Geometry:                   Particle-Particle
+  * Calculate force/pot:        Force
+  *
+  * For each i entry of nlist the j neighbors are processed two at a time (one per
+  * SIMD element); an odd trailing neighbor is handled in a separate epilogue that
+  * only uses the lower SIMD element.
+  *
+  * Outputs written by this kernel:
+  *  - ff and fr->fshift are updated with the pair forces,
+  *  - fr->dvda is updated for both the i atom and every j atom,
+  *  - nrnb receives the flop count under eNR_NBKERNEL_ELEC_F.
+  * No energies are accumulated; kernel_data is not read.
+  */
+ void
+ nb_kernel_ElecGB_VdwNone_GeomP1P1_F_sparc64_hpc_ace_double
+                     (t_nblist * gmx_restrict                nlist,
+                      rvec * gmx_restrict                    xx,
+                      rvec * gmx_restrict                    ff,
+                      t_forcerec * gmx_restrict              fr,
+                      t_mdatoms * gmx_restrict               mdatoms,
+                      nb_kernel_data_t * gmx_restrict        kernel_data,
+                      t_nrnb * gmx_restrict                  nrnb)
+ {
+     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+      * just 0 for non-waters.
+      * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+      * jnr indices corresponding to data put in the two positions in the SIMD register.
+      */
+     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+     int              jnrA,jnrB;
+     int              j_coord_offsetA,j_coord_offsetB;
+     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+     real             rcutoff_scalar;
+     real             *shiftvec,*fshift,*x,*f;
+     _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+     int              vdwioffset0;
+     _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+     int              vdwjidx0A,vdwjidx0B;
+     _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+     _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+     _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+     real             *charge;
+     _fjsp_v2r8       vgb,fgb,vgbsum,dvdasum,gbscale,gbtabscale,isaprod,gbqqfactor,gbinvepsdiff,dvdaj,gbeps,twogbeps,dvdatmp;
+     _fjsp_v2r8       minushalf = gmx_fjsp_set1_v2r8(-0.5);
+     real             *invsqrta,*dvda,*gbtab;
+     _fjsp_v2r8       rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF,twovfeps;
+     real             *vftab;
+     _fjsp_v2r8       itab_tmp;
+     _fjsp_v2r8       dummy_mask,cutoff_mask;
+     _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+     _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+     /* gbconv is used to read the two integer table indices back out of a SIMD register */
+     union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+     x                = xx[0];
+     f                = ff[0];
+     nri              = nlist->nri;
+     iinr             = nlist->iinr;
+     jindex           = nlist->jindex;
+     jjnr             = nlist->jjnr;
+     shiftidx         = nlist->shift;
+     gid              = nlist->gid;
+     shiftvec         = fr->shift_vec[0];
+     fshift           = fr->fshift[0];
+     facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+     charge           = mdatoms->chargeA;
+     invsqrta         = fr->invsqrta;
+     dvda             = fr->dvda;
+     gbtabscale       = gmx_fjsp_set1_v2r8(fr->gbtab.scale);
+     gbtab            = fr->gbtab.data;
+     gbinvepsdiff     = gmx_fjsp_set1_v2r8((1.0/fr->epsilon_r) - (1.0/fr->gb_epsilon_solvent));
+     /* Avoid stupid compiler warnings */
+     jnrA = jnrB = 0;
+     j_coord_offsetA = 0;
+     j_coord_offsetB = 0;
+     outeriter        = 0;
+     inneriter        = 0;
+     /* Start outer loop over neighborlists */
+     for(iidx=0; iidx<nri; iidx++)
+     {
+         /* Load shift vector for this list */
+         i_shift_offset   = DIM*shiftidx[iidx];
+         /* Load limits for loop over neighbors */
+         j_index_start    = jindex[iidx];
+         j_index_end      = jindex[iidx+1];
+         /* Get outer coordinate index */
+         inr              = iinr[iidx];
+         i_coord_offset   = DIM*inr;
+         /* Load i particle coords and add shift vector */
+         gmx_fjsp_load_shift_and_1rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
+         fix0             = _fjsp_setzero_v2r8();
+         fiy0             = _fjsp_setzero_v2r8();
+         fiz0             = _fjsp_setzero_v2r8();
+         /* Load parameters for i particles */
+         iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_load1_v2r8(charge+inr+0));
+         isai0            = gmx_fjsp_load1_v2r8(invsqrta+inr+0);
+         dvdasum          = _fjsp_setzero_v2r8();
+         /* Start inner kernel loop: two j atoms (A,B) per iteration */
+         for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+         {
+             /* Get j neighbor index, and coordinate index */
+             jnrA             = jjnr[jidx];
+             jnrB             = jjnr[jidx+1];
+             j_coord_offsetA  = DIM*jnrA;
+             j_coord_offsetB  = DIM*jnrB;
+             /* load j atom coordinates */
+             gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                               &jx0,&jy0,&jz0);
+             /* Calculate displacement vector */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             /* Calculate squared distance and things based on it */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+             /* Load parameters for j particles */
+             jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+             isaj0            = gmx_fjsp_load_2real_swizzle_v2r8(invsqrta+jnrA+0,invsqrta+jnrB+0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+             /* Compute parameters for interactions between i and j atoms */
+             qq00             = _fjsp_mul_v2r8(iq0,jq0);
+             /* GENERALIZED BORN AND COULOMB ELECTROSTATICS */
+             isaprod          = _fjsp_mul_v2r8(isai0,isaj0);
+             gbqqfactor       = _fjsp_neg_v2r8(_fjsp_mul_v2r8(qq00,_fjsp_mul_v2r8(isaprod,gbinvepsdiff)));
+             gbscale          = _fjsp_mul_v2r8(isaprod,gbtabscale);
+             /* Calculate generalized born table index - this is a separate table from the normal one,
+              * but we use the same procedure by multiplying r with scale and truncating to integer.
+              */
+             rt               = _fjsp_mul_v2r8(r00,gbscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             gbeps            = _fjsp_sub_v2r8(rt,_fjsp_xtod_v2r8(itab_tmp));
+             _fjsp_store_v2r8(&gbconv.simd,itab_tmp);
+             /* Four table values per point: load pairs for atoms A and B, then transpose */
+             Y                = _fjsp_load_v2r8( gbtab + 4*gbconv.i[0] );
+             F                = _fjsp_load_v2r8( gbtab + 4*gbconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( gbtab + 4*gbconv.i[0] +2);
+             H                = _fjsp_load_v2r8( gbtab + 4*gbconv.i[1] +2);
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(gbeps,_fjsp_madd_v2r8(gbeps,H,G),F);
+             VV               = _fjsp_madd_v2r8(gbeps,Fp,Y);
+             /* vgb is still needed in the force-only kernel because dvdatmp depends on it */
+             vgb              = _fjsp_mul_v2r8(gbqqfactor,VV);
+             twogbeps         = _fjsp_add_v2r8(gbeps,gbeps);
+             FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twogbeps,H,G),gbeps,Fp);
+             fgb              = _fjsp_mul_v2r8(gbqqfactor,_fjsp_mul_v2r8(FF,gbscale));
+             dvdatmp          = _fjsp_mul_v2r8(minushalf,_fjsp_madd_v2r8(fgb,r00,vgb));
+             dvdasum          = _fjsp_add_v2r8(dvdasum,dvdatmp);
+             gmx_fjsp_increment_2real_swizzle_v2r8(dvda+jnrA,dvda+jnrB,_fjsp_mul_v2r8(dvdatmp,_fjsp_mul_v2r8(isaj0,isaj0)));
+             velec            = _fjsp_mul_v2r8(qq00,rinv00);
+             felec            = _fjsp_mul_v2r8(_fjsp_msub_v2r8(velec,rinv00,fgb),rinv00);
+             fscal            = felec;
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             
+             gmx_fjsp_decrement_fma_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fscal,dx00,dy00,dz00);
+             /* Inner loop uses 59 flops */
+         }
+         /* Epilogue: one j atom left over when the neighbor count is odd.
+          * Only the lower SIMD element carries data; j parameters are loaded with a zeroed upper element.
+          */
+         if(jidx<j_index_end)
+         {
+             jnrA             = jjnr[jidx];
+             j_coord_offsetA  = DIM*jnrA;
+             /* load j atom coordinates */
+             gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                               &jx0,&jy0,&jz0);
+             /* Calculate displacement vector */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             /* Calculate squared distance and things based on it */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+             /* Load parameters for j particles */
+             jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
+             isaj0            = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),invsqrta+jnrA+0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+             /* Compute parameters for interactions between i and j atoms */
+             qq00             = _fjsp_mul_v2r8(iq0,jq0);
+             /* GENERALIZED BORN AND COULOMB ELECTROSTATICS */
+             isaprod          = _fjsp_mul_v2r8(isai0,isaj0);
+             gbqqfactor       = _fjsp_neg_v2r8(_fjsp_mul_v2r8(qq00,_fjsp_mul_v2r8(isaprod,gbinvepsdiff)));
+             gbscale          = _fjsp_mul_v2r8(isaprod,gbtabscale);
+             /* Calculate generalized born table index - this is a separate table from the normal one,
+              * but we use the same procedure by multiplying r with scale and truncating to integer.
+              */
+             rt               = _fjsp_mul_v2r8(r00,gbscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             gbeps            = _fjsp_sub_v2r8(rt,_fjsp_xtod_v2r8(itab_tmp));
+             _fjsp_store_v2r8(&gbconv.simd,itab_tmp);
+             /* Only index 0 is valid here; the second operand of each transpose is zero */
+             Y                = _fjsp_load_v2r8( gbtab + 4*gbconv.i[0] );
+             F                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( gbtab + 4*gbconv.i[0] +2);
+             H                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(gbeps,_fjsp_madd_v2r8(gbeps,H,G),F);
+             VV               = _fjsp_madd_v2r8(gbeps,Fp,Y);
+             vgb              = _fjsp_mul_v2r8(gbqqfactor,VV);
+             twogbeps         = _fjsp_add_v2r8(gbeps,gbeps);
+             FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twogbeps,H,G),gbeps,Fp);
+             fgb              = _fjsp_mul_v2r8(gbqqfactor,_fjsp_mul_v2r8(FF,gbscale));
+             dvdatmp          = _fjsp_mul_v2r8(minushalf,_fjsp_madd_v2r8(fgb,r00,vgb));
+             /* Clear the unused upper element (same masking as fscal below) so that a non-finite
+              * value in the padding lane cannot leak into dvdasum and thus dvda[inr].
+              */
+             dvdatmp          = _fjsp_unpacklo_v2r8(dvdatmp,_fjsp_setzero_v2r8());
+             dvdasum          = _fjsp_add_v2r8(dvdasum,dvdatmp);
+             gmx_fjsp_increment_1real_v2r8(dvda+jnrA,_fjsp_mul_v2r8(dvdatmp,_fjsp_mul_v2r8(isaj0,isaj0)));
+             velec            = _fjsp_mul_v2r8(qq00,rinv00);
+             felec            = _fjsp_mul_v2r8(_fjsp_msub_v2r8(velec,rinv00,fgb),rinv00);
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             
+             gmx_fjsp_decrement_fma_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fscal,dx00,dy00,dz00);
+             /* Inner loop uses 59 flops */
+         }
+         /* End of innermost loop */
+         gmx_fjsp_update_iforce_1atom_swizzle_v2r8(fix0,fiy0,fiz0,
+                                               f+i_coord_offset,fshift+i_shift_offset);
+         /* Scale the accumulated dvda contribution by isai0^2 before adding it to the i atom */
+         dvdasum = _fjsp_mul_v2r8(dvdasum, _fjsp_mul_v2r8(isai0,isai0));
+         gmx_fjsp_update_1pot_v2r8(dvdasum,dvda+inr);
+         /* Increment number of inner iterations */
+         inneriter                  += j_index_end - j_index_start;
+         /* Outer loop uses 7 flops */
+     }
+     /* Increment number of outer iterations */
+     outeriter        += nri;
+     /* Update outer/inner flops */
+     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_F,outeriter*7 + inneriter*59);
+ }
index 0000000000000000000000000000000000000000,9a0b84afc68981dfcbfdf726dc0d2fbdcec99b62..9a0b84afc68981dfcbfdf726dc0d2fbdcec99b62
mode 000000,100644..100644
--- /dev/null
@@@ -1,0 -1,632 +1,632 @@@
+ /*
+  * This file is part of the GROMACS molecular simulation package.
+  *
+  * Copyright (c) 2012, by the GROMACS development team, led by
+  * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+  * others, as listed in the AUTHORS file in the top-level source
+  * directory and at http://www.gromacs.org.
+  *
+  * GROMACS is free software; you can redistribute it and/or
+  * modify it under the terms of the GNU Lesser General Public License
+  * as published by the Free Software Foundation; either version 2.1
+  * of the License, or (at your option) any later version.
+  *
+  * GROMACS is distributed in the hope that it will be useful,
+  * but WITHOUT ANY WARRANTY; without even the implied warranty of
+  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  * Lesser General Public License for more details.
+  *
+  * You should have received a copy of the GNU Lesser General Public
+  * License along with GROMACS; if not, see
+  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+  *
+  * If you want to redistribute modifications to GROMACS, please
+  * consider that scientific software is very special. Version
+  * control is crucial - bugs must be traceable. We will be happy to
+  * consider code for inclusion in the official distribution, but
+  * derived work must not be called official GROMACS. Details are found
+  * in the README & COPYING files - if they are missing, get the
+  * official version at http://www.gromacs.org.
+  *
+  * To help us fund GROMACS development, we humbly ask that you cite
+  * the research papers on the package. Check out http://www.gromacs.org.
+  */
+ /*
+  * Note: this file was generated by the GROMACS sparc64_hpc_ace_double kernel generator.
+  */
+ #ifdef HAVE_CONFIG_H
+ #include <config.h>
+ #endif
+ #include <math.h>
+ #include "../nb_kernel.h"
+ #include "types/simple.h"
+ #include "vec.h"
+ #include "nrnb.h"
+ #include "kernelutil_sparc64_hpc_ace_double.h"
+ /*
+  * Gromacs nonbonded kernel:   nb_kernel_ElecNone_VdwCSTab_GeomP1P1_VF_sparc64_hpc_ace_double
+  * Electrostatics interaction: None
+  * VdW interaction:            CubicSplineTable
+  * Geometry:                   Particle-Particle
+  * Calculate force/pot:        PotentialAndForce
+  *
+  * Walks the neighbor list in nlist: for each i particle, the j neighbors are
+  * processed two per iteration, with a single-j step for an odd leftover.
+  * Dispersion and repulsion are both interpolated from kernel_data->table_vdw
+  * and scaled by the c6/c12 pair parameters read from fr->nbfp, indexed via
+  * mdatoms->typeA.
+  *
+  * Inputs read:  xx (coordinates), fr->shift_vec, fr->ntype, fr->nbfp,
+  *               mdatoms->typeA, kernel_data->table_vdw (data and scale).
+  * Outputs:      ff and fr->fshift are passed to the gmx_fjsp_* force update
+  *               helpers; the per-i VdW potential sum is passed to
+  *               gmx_fjsp_update_1pot_v2r8 together with
+  *               kernel_data->energygrp_vdw+gid[iidx]; the flop estimate is
+  *               reported through inc_nrnb() as eNR_NBKERNEL_VDW_VF.
+  */
+ void
+ nb_kernel_ElecNone_VdwCSTab_GeomP1P1_VF_sparc64_hpc_ace_double
+                     (t_nblist * gmx_restrict                nlist,
+                      rvec * gmx_restrict                    xx,
+                      rvec * gmx_restrict                    ff,
+                      t_forcerec * gmx_restrict              fr,
+                      t_mdatoms * gmx_restrict               mdatoms,
+                      nb_kernel_data_t * gmx_restrict        kernel_data,
+                      t_nrnb * gmx_restrict                  nrnb)
+ {
+     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+      * just 0 for non-waters.
+      * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+      * jnr indices corresponding to data put in the two positions in the SIMD register.
+      *
+      * Note: several locals declared below (e.g. rcutoff_scalar, tx/ty/tz, rcutoff, one_sixth,
+      * one_twelfth, Heps, dummy_mask, cutoff_mask, one, two, gbconv, ewconv) are never referenced in
+      * this kernel; presumably the generator emits one common declaration set for all variants.
+      */
+     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+     int              jnrA,jnrB;
+     int              j_coord_offsetA,j_coord_offsetB;
+     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+     real             rcutoff_scalar;
+     real             *shiftvec,*fshift,*x,*f;
+     _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+     int              vdwioffset0;
+     _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+     int              vdwjidx0A,vdwjidx0B;
+     _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+     _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+     int              nvdwtype;
+     _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+     int              *vdwtype;
+     real             *vdwparam;
+     _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+     _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+     _fjsp_v2r8       rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF,twovfeps;
+     real             *vftab;
+     _fjsp_v2r8       itab_tmp;
+     _fjsp_v2r8       dummy_mask,cutoff_mask;
+     _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+     _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+     union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+     x                = xx[0];
+     f                = ff[0];
+     nri              = nlist->nri;
+     iinr             = nlist->iinr;
+     jindex           = nlist->jindex;
+     jjnr             = nlist->jjnr;
+     shiftidx         = nlist->shift;
+     gid              = nlist->gid;
+     shiftvec         = fr->shift_vec[0];
+     fshift           = fr->fshift[0];
+     nvdwtype         = fr->ntype;
+     vdwparam         = fr->nbfp;
+     vdwtype          = mdatoms->typeA;
+     vftab            = kernel_data->table_vdw->data;
+     vftabscale       = gmx_fjsp_set1_v2r8(kernel_data->table_vdw->scale);
+     /* Pre-initialize the j indices and offsets so that compilers do not warn
+      * about possibly uninitialized variables.
+      */
+     jnrA = jnrB = 0;
+     j_coord_offsetA = 0;
+     j_coord_offsetB = 0;
+     outeriter        = 0;
+     inneriter        = 0;
+     /* Start outer loop over neighborlists */
+     for(iidx=0; iidx<nri; iidx++)
+     {
+         /* Load shift vector for this list */
+         i_shift_offset   = DIM*shiftidx[iidx];
+         /* Load limits for loop over neighbors */
+         j_index_start    = jindex[iidx];
+         j_index_end      = jindex[iidx+1];
+         /* Get outer coordinate index */
+         inr              = iinr[iidx];
+         i_coord_offset   = DIM*inr;
+         /* Load i particle coords and add shift vector */
+         gmx_fjsp_load_shift_and_1rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
+         fix0             = _fjsp_setzero_v2r8();
+         fiy0             = _fjsp_setzero_v2r8();
+         fiz0             = _fjsp_setzero_v2r8();
+         /* Load parameters for i particles.
+          * vdwioffset0 is the start of the i type's row in vdwparam; every
+          * (i type, j type) entry takes 2 reals, loaded below as c6/c12.
+          */
+         vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+         /* Reset potential sums */
+         vvdwsum          = _fjsp_setzero_v2r8();
+         /* Start inner kernel loop.
+          * Handles j neighbors in pairs (A,B); the loop stops before an odd
+          * leftover entry, which the if-block after the loop processes alone.
+          */
+         for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+         {
+             /* Get j neighbor index, and coordinate index */
+             jnrA             = jjnr[jidx];
+             jnrB             = jjnr[jidx+1];
+             j_coord_offsetA  = DIM*jnrA;
+             j_coord_offsetB  = DIM*jnrB;
+             /* load j atom coordinates */
+             gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                               &jx0,&jy0,&jz0);
+             /* Calculate displacement vector */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             /* Calculate squared distance and things based on it */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+             /* Load parameters for j particles */
+             vdwjidx0A        = 2*vdwtype[jnrA+0];
+             vdwjidx0B        = 2*vdwtype[jnrB+0];
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+             /* Compute parameters for interactions between i and j atoms */
+             gmx_fjsp_load_2pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,
+                                          vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
+             /* Calculate table index by multiplying r with table scale and truncate to integer.
+              * vfeps is the remaining fractional part. The index is multiplied by 8 because each
+              * table point occupies 8 reals: dispersion is read at offsets 0 and 2, repulsion at
+              * offsets 4 and 6 (two reals per load, transposed into Y,F and G,H).
+              */
+             rt               = _fjsp_mul_v2r8(r00,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 8;
+             vfconv.i[1]     *= 8;
+             /* CUBIC SPLINE TABLE DISPERSION */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 2 );
+             H                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 2 );
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+             VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+             vvdw6            = _fjsp_mul_v2r8(c6_00,VV);
+             FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+             fvdw6            = _fjsp_mul_v2r8(c6_00,FF);
+             /* CUBIC SPLINE TABLE REPULSION */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 4 );
+             F                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 4 );
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 6 );
+             H                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 6 );
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+             VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+             vvdw12           = _fjsp_mul_v2r8(c12_00,VV);
+             FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+             fvdw12           = _fjsp_mul_v2r8(c12_00,FF);
+             vvdw             = _fjsp_add_v2r8(vvdw12,vvdw6);
+             fvdw             = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_add_v2r8(fvdw6,fvdw12),_fjsp_mul_v2r8(vftabscale,rinv00)));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+             fscal            = fvdw;
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             
+             gmx_fjsp_decrement_fma_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fscal,dx00,dy00,dz00);
+             /* Inner loop uses 59 flops */
+         }
+         if(jidx<j_index_end)
+         {
+             jnrA             = jjnr[jidx];
+             j_coord_offsetA  = DIM*jnrA;
+             /* load j atom coordinates (single leftover j particle; only the
+              * A slot carries real data in this block)
+              */
+             gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                               &jx0,&jy0,&jz0);
+             /* Calculate displacement vector */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             /* Calculate squared distance and things based on it */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+             /* Load parameters for j particles */
+             vdwjidx0A        = 2*vdwtype[jnrA+0];
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+             /* Compute parameters for interactions between i and j atoms */
+             gmx_fjsp_load_1pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
+             /* Calculate table index by multiplying r with table scale and truncate to integer.
+              * Both indices are scaled by 8 as in the paired loop, but only vfconv.i[0] is used
+              * for table loads below.
+              */
+             rt               = _fjsp_mul_v2r8(r00,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 8;
+             vfconv.i[1]     *= 8;
+             /* CUBIC SPLINE TABLE DISPERSION
+              * Zeros take the place of the second table row that the paired loop loads.
+              */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 2 );
+             H                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+             VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+             vvdw6            = _fjsp_mul_v2r8(c6_00,VV);
+             FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+             fvdw6            = _fjsp_mul_v2r8(c6_00,FF);
+             /* CUBIC SPLINE TABLE REPULSION */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 4 );
+             F                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 6 );
+             H                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+             VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+             vvdw12           = _fjsp_mul_v2r8(c12_00,VV);
+             FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+             fvdw12           = _fjsp_mul_v2r8(c12_00,FF);
+             vvdw             = _fjsp_add_v2r8(vvdw12,vvdw6);
+             fvdw             = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_add_v2r8(fvdw6,fvdw12),_fjsp_mul_v2r8(vftabscale,rinv00)));
+             /* Update potential sum for this i atom from the interaction with this j atom.
+              * NOTE(review): the unpacklo with a zero vector presumably clears the unused
+              * second slot so it adds nothing to the sums - confirm in the kernelutil header.
+              */
+             vvdw             = _fjsp_unpacklo_v2r8(vvdw,_fjsp_setzero_v2r8());
+             vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+             fscal            = fvdw;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             
+             gmx_fjsp_decrement_fma_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fscal,dx00,dy00,dz00);
+             /* Inner loop uses 59 flops */
+         }
+         /* End of innermost loop */
+         gmx_fjsp_update_iforce_1atom_swizzle_v2r8(fix0,fiy0,fiz0,
+                                               f+i_coord_offset,fshift+i_shift_offset);
+         ggid                        = gid[iidx];
+         /* Update potential energies */
+         gmx_fjsp_update_1pot_v2r8(vvdwsum,kernel_data->energygrp_vdw+ggid);
+         /* Increment number of inner iterations (counted per j entry, not per SIMD pair) */
+         inneriter                  += j_index_end - j_index_start;
+         /* Outer loop uses 7 flops */
+     }
+     /* Increment number of outer iterations */
+     outeriter        += nri;
+     /* Update outer/inner flops */
+     inc_nrnb(nrnb,eNR_NBKERNEL_VDW_VF,outeriter*7 + inneriter*59);
+ }
+ /*
+  * Gromacs nonbonded kernel:   nb_kernel_ElecNone_VdwCSTab_GeomP1P1_F_sparc64_hpc_ace_double
+  * Electrostatics interaction: None
+  * VdW interaction:            CubicSplineTable
+  * Geometry:                   Particle-Particle
+  * Calculate force/pot:        Force
+  *
+  * Force-only kernel: no potential is accumulated and
+  * kernel_data->energygrp_vdw is not touched. For each i particle in nlist
+  * the j neighbors are processed two per iteration, with a single-j step for
+  * an odd leftover. Dispersion and repulsion force terms are interpolated from
+  * kernel_data->table_vdw and scaled by the c6/c12 pair parameters read from
+  * fr->nbfp, indexed via mdatoms->typeA.
+  *
+  * Inputs read:  xx (coordinates), fr->shift_vec, fr->ntype, fr->nbfp,
+  *               mdatoms->typeA, kernel_data->table_vdw (data and scale).
+  * Outputs:      ff and fr->fshift are passed to the gmx_fjsp_* force update
+  *               helpers; the flop estimate is reported through inc_nrnb()
+  *               as eNR_NBKERNEL_VDW_F.
+  */
+ void
+ nb_kernel_ElecNone_VdwCSTab_GeomP1P1_F_sparc64_hpc_ace_double
+                     (t_nblist * gmx_restrict                nlist,
+                      rvec * gmx_restrict                    xx,
+                      rvec * gmx_restrict                    ff,
+                      t_forcerec * gmx_restrict              fr,
+                      t_mdatoms * gmx_restrict               mdatoms,
+                      nb_kernel_data_t * gmx_restrict        kernel_data,
+                      t_nrnb * gmx_restrict                  nrnb)
+ {
+     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+      * just 0 for non-waters.
+      * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+      * jnr indices corresponding to data put in the two positions in the SIMD register.
+      *
+      * Note: several locals declared below (e.g. ggid, rcutoff_scalar, tx/ty/tz, rcutoff, vvdw,
+      * vvdwsum, VV, one_sixth, one_twelfth, dummy_mask, cutoff_mask, one, two, gbconv, ewconv) are
+      * never referenced in this kernel; presumably the generator emits one common declaration set
+      * for all variants.
+      */
+     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+     int              jnrA,jnrB;
+     int              j_coord_offsetA,j_coord_offsetB;
+     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+     real             rcutoff_scalar;
+     real             *shiftvec,*fshift,*x,*f;
+     _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+     int              vdwioffset0;
+     _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+     int              vdwjidx0A,vdwjidx0B;
+     _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+     _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+     int              nvdwtype;
+     _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+     int              *vdwtype;
+     real             *vdwparam;
+     _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+     _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+     _fjsp_v2r8       rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF,twovfeps;
+     real             *vftab;
+     _fjsp_v2r8       itab_tmp;
+     _fjsp_v2r8       dummy_mask,cutoff_mask;
+     _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+     _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+     union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+     x                = xx[0];
+     f                = ff[0];
+     nri              = nlist->nri;
+     iinr             = nlist->iinr;
+     jindex           = nlist->jindex;
+     jjnr             = nlist->jjnr;
+     shiftidx         = nlist->shift;
+     gid              = nlist->gid;
+     shiftvec         = fr->shift_vec[0];
+     fshift           = fr->fshift[0];
+     nvdwtype         = fr->ntype;
+     vdwparam         = fr->nbfp;
+     vdwtype          = mdatoms->typeA;
+     vftab            = kernel_data->table_vdw->data;
+     vftabscale       = gmx_fjsp_set1_v2r8(kernel_data->table_vdw->scale);
+     /* Pre-initialize the j indices and offsets so that compilers do not warn
+      * about possibly uninitialized variables.
+      */
+     jnrA = jnrB = 0;
+     j_coord_offsetA = 0;
+     j_coord_offsetB = 0;
+     outeriter        = 0;
+     inneriter        = 0;
+     /* Start outer loop over neighborlists */
+     for(iidx=0; iidx<nri; iidx++)
+     {
+         /* Load shift vector for this list */
+         i_shift_offset   = DIM*shiftidx[iidx];
+         /* Load limits for loop over neighbors */
+         j_index_start    = jindex[iidx];
+         j_index_end      = jindex[iidx+1];
+         /* Get outer coordinate index */
+         inr              = iinr[iidx];
+         i_coord_offset   = DIM*inr;
+         /* Load i particle coords and add shift vector */
+         gmx_fjsp_load_shift_and_1rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
+         fix0             = _fjsp_setzero_v2r8();
+         fiy0             = _fjsp_setzero_v2r8();
+         fiz0             = _fjsp_setzero_v2r8();
+         /* Load parameters for i particles.
+          * vdwioffset0 is the start of the i type's row in vdwparam; every
+          * (i type, j type) entry takes 2 reals, loaded below as c6/c12.
+          */
+         vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+         /* Start inner kernel loop.
+          * Handles j neighbors in pairs (A,B); the loop stops before an odd
+          * leftover entry, which the if-block after the loop processes alone.
+          */
+         for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+         {
+             /* Get j neighbor index, and coordinate index */
+             jnrA             = jjnr[jidx];
+             jnrB             = jjnr[jidx+1];
+             j_coord_offsetA  = DIM*jnrA;
+             j_coord_offsetB  = DIM*jnrB;
+             /* load j atom coordinates */
+             gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                               &jx0,&jy0,&jz0);
+             /* Calculate displacement vector */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             /* Calculate squared distance and things based on it */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+             /* Load parameters for j particles */
+             vdwjidx0A        = 2*vdwtype[jnrA+0];
+             vdwjidx0B        = 2*vdwtype[jnrB+0];
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+             /* Compute parameters for interactions between i and j atoms */
+             gmx_fjsp_load_2pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,
+                                          vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
+             /* Calculate table index by multiplying r with table scale and truncate to integer.
+              * vfeps is the remaining fractional part. The index is multiplied by 8 because each
+              * table point occupies 8 reals: dispersion is read at offsets 0 and 2, repulsion at
+              * offsets 4 and 6 (two reals per load, transposed into Y,F and G,H).
+              */
+             rt               = _fjsp_mul_v2r8(r00,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 8;
+             vfconv.i[1]     *= 8;
+             /* CUBIC SPLINE TABLE DISPERSION
+              * Y is still loaded (it shares a load with F) but only the force term FF is used.
+              */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 2 );
+             H                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 2 );
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+             FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+             fvdw6            = _fjsp_mul_v2r8(c6_00,FF);
+             /* CUBIC SPLINE TABLE REPULSION */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 4 );
+             F                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 4 );
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 6 );
+             H                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 6 );
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+             FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+             fvdw12           = _fjsp_mul_v2r8(c12_00,FF);
+             fvdw             = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_add_v2r8(fvdw6,fvdw12),_fjsp_mul_v2r8(vftabscale,rinv00)));
+             fscal            = fvdw;
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             
+             gmx_fjsp_decrement_fma_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fscal,dx00,dy00,dz00);
+             /* Inner loop uses 51 flops */
+         }
+         if(jidx<j_index_end)
+         {
+             jnrA             = jjnr[jidx];
+             j_coord_offsetA  = DIM*jnrA;
+             /* load j atom coordinates (single leftover j particle; only the
+              * A slot carries real data in this block)
+              */
+             gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                               &jx0,&jy0,&jz0);
+             /* Calculate displacement vector */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             /* Calculate squared distance and things based on it */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+             /* Load parameters for j particles */
+             vdwjidx0A        = 2*vdwtype[jnrA+0];
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+             /* Compute parameters for interactions between i and j atoms */
+             gmx_fjsp_load_1pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
+             /* Calculate table index by multiplying r with table scale and truncate to integer.
+              * Both indices are scaled by 8 as in the paired loop, but only vfconv.i[0] is used
+              * for table loads below.
+              */
+             rt               = _fjsp_mul_v2r8(r00,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 8;
+             vfconv.i[1]     *= 8;
+             /* CUBIC SPLINE TABLE DISPERSION
+              * Zeros take the place of the second table row that the paired loop loads.
+              */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 2 );
+             H                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+             FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+             fvdw6            = _fjsp_mul_v2r8(c6_00,FF);
+             /* CUBIC SPLINE TABLE REPULSION */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 4 );
+             F                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 6 );
+             H                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+             FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+             fvdw12           = _fjsp_mul_v2r8(c12_00,FF);
+             fvdw             = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_add_v2r8(fvdw6,fvdw12),_fjsp_mul_v2r8(vftabscale,rinv00)));
+             fscal            = fvdw;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force.
+              * NOTE(review): the unpacklo with a zero vector above presumably clears the unused
+              * second slot of fscal so it adds no force - confirm in the kernelutil header.
+              */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             
+             gmx_fjsp_decrement_fma_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fscal,dx00,dy00,dz00);
+             /* Inner loop uses 51 flops */
+         }
+         /* End of innermost loop */
+         gmx_fjsp_update_iforce_1atom_swizzle_v2r8(fix0,fiy0,fiz0,
+                                               f+i_coord_offset,fshift+i_shift_offset);
+         /* Increment number of inner iterations (counted per j entry, not per SIMD pair) */
+         inneriter                  += j_index_end - j_index_start;
+         /* Outer loop uses 6 flops */
+     }
+     /* Increment number of outer iterations */
+     outeriter        += nri;
+     /* Update outer/inner flops */
+     inc_nrnb(nrnb,eNR_NBKERNEL_VDW_F,outeriter*6 + inneriter*51);
+ }
index 0000000000000000000000000000000000000000,7bd48ad434f8bb32616d33882160f8ff9cc301c8..7bd48ad434f8bb32616d33882160f8ff9cc301c8
mode 000000,100644..100644
--- /dev/null
@@@ -1,0 -1,552 +1,552 @@@
+ /*
+  * This file is part of the GROMACS molecular simulation package.
+  *
+  * Copyright (c) 2012, by the GROMACS development team, led by
+  * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+  * others, as listed in the AUTHORS file in the top-level source
+  * directory and at http://www.gromacs.org.
+  *
+  * GROMACS is free software; you can redistribute it and/or
+  * modify it under the terms of the GNU Lesser General Public License
+  * as published by the Free Software Foundation; either version 2.1
+  * of the License, or (at your option) any later version.
+  *
+  * GROMACS is distributed in the hope that it will be useful,
+  * but WITHOUT ANY WARRANTY; without even the implied warranty of
+  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  * Lesser General Public License for more details.
+  *
+  * You should have received a copy of the GNU Lesser General Public
+  * License along with GROMACS; if not, see
+  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+  *
+  * If you want to redistribute modifications to GROMACS, please
+  * consider that scientific software is very special. Version
+  * control is crucial - bugs must be traceable. We will be happy to
+  * consider code for inclusion in the official distribution, but
+  * derived work must not be called official GROMACS. Details are found
+  * in the README & COPYING files - if they are missing, get the
+  * official version at http://www.gromacs.org.
+  *
+  * To help us fund GROMACS development, we humbly ask that you cite
+  * the research papers on the package. Check out http://www.gromacs.org.
+  */
+ /*
+  * Note: this file was generated by the GROMACS sparc64_hpc_ace_double kernel generator.
+  */
+ #ifdef HAVE_CONFIG_H
+ #include <config.h>
+ #endif
+ #include <math.h>
+ #include "../nb_kernel.h"
+ #include "types/simple.h"
+ #include "vec.h"
+ #include "nrnb.h"
+ #include "kernelutil_sparc64_hpc_ace_double.h"
+ /*
+  * Gromacs nonbonded kernel:   nb_kernel_ElecNone_VdwLJSh_GeomP1P1_VF_sparc64_hpc_ace_double
+  * Electrostatics interaction: None
+  * VdW interaction:            LennardJones
+  * Geometry:                   Particle-Particle
+  * Calculate force/pot:        PotentialAndForce
+  *
+  * For every i particle listed in nlist, the j neighbors are processed two at a
+  * time (SIMD lanes A and B), followed by a single-neighbor remainder step when
+  * the neighbor count is odd. Energy and force are both combined with
+  * cutoff_mask, which is built by comparing rsq00 against the squared fr->rvdw.
+  * The energy expression includes constant terms built from fr->ic->sh_invrc6
+  * (presumably the potential shift implied by "LJSh" in the kernel name;
+  * confirm against the kernel generator).
+  *
+  * Outputs: forces are written through ff and fr->fshift by the gmx_fjsp_*
+  * helper routines, the VdW energy sum is passed to gmx_fjsp_update_1pot_v2r8
+  * together with kernel_data->energygrp_vdw+gid[iidx], and the flop estimate
+  * is registered in nrnb under eNR_NBKERNEL_VDW_VF.
+  */
+ void
+ nb_kernel_ElecNone_VdwLJSh_GeomP1P1_VF_sparc64_hpc_ace_double
+                     (t_nblist * gmx_restrict                nlist,
+                      rvec * gmx_restrict                    xx,
+                      rvec * gmx_restrict                    ff,
+                      t_forcerec * gmx_restrict              fr,
+                      t_mdatoms * gmx_restrict               mdatoms,
+                      nb_kernel_data_t * gmx_restrict        kernel_data,
+                      t_nrnb * gmx_restrict                  nrnb)
+ {
+     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+      * just 0 for non-waters.
+      * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+      * jnr indices corresponding to data put in the two positions in the SIMD register.
+      */
+     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+     int              jnrA,jnrB;
+     int              j_coord_offsetA,j_coord_offsetB;
+     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+     real             rcutoff_scalar;
+     real             *shiftvec,*fshift,*x,*f;
+     _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+     int              vdwioffset0;
+     _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+     int              vdwjidx0A,vdwjidx0B;
+     _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+     _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+     int              nvdwtype;
+     _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+     int              *vdwtype;
+     real             *vdwparam;
+     _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+     _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+     _fjsp_v2r8       itab_tmp;
+     _fjsp_v2r8       dummy_mask,cutoff_mask;
+     _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+     _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+     union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+     x                = xx[0];
+     f                = ff[0];
+     nri              = nlist->nri;
+     iinr             = nlist->iinr;
+     jindex           = nlist->jindex;
+     jjnr             = nlist->jjnr;
+     shiftidx         = nlist->shift;
+     gid              = nlist->gid;
+     shiftvec         = fr->shift_vec[0];
+     fshift           = fr->fshift[0];
+     nvdwtype         = fr->ntype;
+     vdwparam         = fr->nbfp;
+     vdwtype          = mdatoms->typeA;
+     rcutoff_scalar   = fr->rvdw;
+     rcutoff          = gmx_fjsp_set1_v2r8(rcutoff_scalar);
+     rcutoff2         = _fjsp_mul_v2r8(rcutoff,rcutoff);
+     sh_vdw_invrcut6  = gmx_fjsp_set1_v2r8(fr->ic->sh_invrc6);
+     rvdw             = gmx_fjsp_set1_v2r8(fr->rvdw); /* not referenced again in this kernel */
+     /* Give defined values to the j indices and offsets, which are otherwise
+      * first assigned inside the j loops, to avoid compiler warnings */
+     jnrA = jnrB = 0;
+     j_coord_offsetA = 0;
+     j_coord_offsetB = 0;
+     outeriter        = 0;
+     inneriter        = 0;
+     /* Start outer loop over neighborlists */
+     for(iidx=0; iidx<nri; iidx++)
+     {
+         /* Load shift vector for this list */
+         i_shift_offset   = DIM*shiftidx[iidx];
+         /* Load limits for loop over neighbors */
+         j_index_start    = jindex[iidx];
+         j_index_end      = jindex[iidx+1];
+         /* Get outer coordinate index */
+         inr              = iinr[iidx];
+         i_coord_offset   = DIM*inr;
+         /* Load i particle coords and add shift vector */
+         gmx_fjsp_load_shift_and_1rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
+         fix0             = _fjsp_setzero_v2r8();
+         fiy0             = _fjsp_setzero_v2r8();
+         fiz0             = _fjsp_setzero_v2r8();
+         /* Load parameters for i particles: offset of this i type's row in vdwparam,
+          * which holds two values (c6,c12) per type pair */
+         vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+         /* Reset potential sums */
+         vvdwsum          = _fjsp_setzero_v2r8();
+         /* Start inner kernel loop: two j neighbors (A and B) per iteration */
+         for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+         {
+             /* Get j neighbor index, and coordinate index */
+             jnrA             = jjnr[jidx];
+             jnrB             = jjnr[jidx+1];
+             j_coord_offsetA  = DIM*jnrA;
+             j_coord_offsetB  = DIM*jnrB;
+             /* load j atom coordinates */
+             gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                               &jx0,&jy0,&jz0);
+             /* Calculate displacement vector */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             /* Calculate squared distance and things based on it */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rinvsq00         = gmx_fjsp_inv_v2r8(rsq00);
+             /* Load parameters for j particles */
+             vdwjidx0A        = 2*vdwtype[jnrA+0];
+             vdwjidx0B        = 2*vdwtype[jnrB+0];
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+             {
+             /* Compute parameters for interactions between i and j atoms */
+             gmx_fjsp_load_2pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,
+                                          vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
+             /* LENNARD-JONES DISPERSION/REPULSION */
+             rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+             vvdw6            = _fjsp_mul_v2r8(c6_00,rinvsix);
+             vvdw12           = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
+             vvdw             = _fjsp_msub_v2r8(_fjsp_nmsub_v2r8(c12_00,_fjsp_mul_v2r8(sh_vdw_invrcut6,sh_vdw_invrcut6),vvdw12),one_twelfth,
+                                            _fjsp_mul_v2r8(_fjsp_nmsub_v2r8( c6_00,sh_vdw_invrcut6,vvdw6),one_sixth));
+             fvdw             = _fjsp_mul_v2r8(_fjsp_sub_v2r8(vvdw12,vvdw6),rinvsq00);
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom.
+              * The enclosing test only checks whether any lane passes, so each
+              * result is combined with cutoff_mask before it is accumulated. */
+             vvdw             = _fjsp_and_v2r8(vvdw,cutoff_mask);
+             vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+             fscal            = fvdw;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             
+             gmx_fjsp_decrement_fma_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fscal,dx00,dy00,dz00);
+             }
+             /* Inner loop uses 44 flops */
+         }
+         if(jidx<j_index_end) /* Remainder: one j neighbor is left when the neighbor count is odd */
+         {
+             jnrA             = jjnr[jidx];
+             j_coord_offsetA  = DIM*jnrA;
+             /* load j atom coordinates */
+             gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                               &jx0,&jy0,&jz0);
+             /* Calculate displacement vector */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             /* Calculate squared distance and things based on it */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rinvsq00         = gmx_fjsp_inv_v2r8(rsq00);
+             /* Load parameters for j particles */
+             vdwjidx0A        = 2*vdwtype[jnrA+0];
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+             {
+             /* Compute parameters for interactions between i and j atoms */
+             gmx_fjsp_load_1pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
+             /* LENNARD-JONES DISPERSION/REPULSION */
+             rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+             vvdw6            = _fjsp_mul_v2r8(c6_00,rinvsix);
+             vvdw12           = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
+             vvdw             = _fjsp_msub_v2r8(_fjsp_nmsub_v2r8(c12_00,_fjsp_mul_v2r8(sh_vdw_invrcut6,sh_vdw_invrcut6),vvdw12),one_twelfth,
+                                            _fjsp_mul_v2r8(_fjsp_nmsub_v2r8( c6_00,sh_vdw_invrcut6,vvdw6),one_sixth));
+             fvdw             = _fjsp_mul_v2r8(_fjsp_sub_v2r8(vvdw12,vvdw6),rinvsq00);
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom.
+              * The unpacklo with a zero vector presumably clears the lane that holds
+              * no j particle in this remainder step (confirm against the intrinsic). */
+             vvdw             = _fjsp_and_v2r8(vvdw,cutoff_mask);
+             vvdw             = _fjsp_unpacklo_v2r8(vvdw,_fjsp_setzero_v2r8());
+             vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+             fscal            = fvdw;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             
+             gmx_fjsp_decrement_fma_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fscal,dx00,dy00,dz00);
+             }
+             /* Inner loop uses 44 flops */
+         }
+         /* End of innermost loop */
+         gmx_fjsp_update_iforce_1atom_swizzle_v2r8(fix0,fiy0,fiz0,
+                                               f+i_coord_offset,fshift+i_shift_offset);
+         ggid                        = gid[iidx];
+         /* Update potential energies */
+         gmx_fjsp_update_1pot_v2r8(vvdwsum,kernel_data->energygrp_vdw+ggid);
+         /* Increment number of inner iterations (counted per j neighbor, not per SIMD step) */
+         inneriter                  += j_index_end - j_index_start;
+         /* Outer loop uses 7 flops */
+     }
+     /* Increment number of outer iterations */
+     outeriter        += nri;
+     /* Update outer/inner flops */
+     inc_nrnb(nrnb,eNR_NBKERNEL_VDW_VF,outeriter*7 + inneriter*44);
+ }
+ /*
+  * Gromacs nonbonded kernel:   nb_kernel_ElecNone_VdwLJSh_GeomP1P1_F_sparc64_hpc_ace_double
+  * Electrostatics interaction: None
+  * VdW interaction:            LennardJones
+  * Geometry:                   Particle-Particle
+  * Calculate force/pot:        Force
+  *
+  * Force-only kernel: no potential energy is accumulated and
+  * kernel_data is not read. For every i particle listed in nlist, the j
+  * neighbors are processed two at a time (SIMD lanes A and B), followed by a
+  * single-neighbor remainder step when the neighbor count is odd. The scalar
+  * force is combined with cutoff_mask, which is built by comparing rsq00
+  * against the squared fr->rvdw.
+  *
+  * Outputs: forces are written through ff and fr->fshift by the gmx_fjsp_*
+  * helper routines, and the flop estimate is registered in nrnb under
+  * eNR_NBKERNEL_VDW_F.
+  */
+ void
+ nb_kernel_ElecNone_VdwLJSh_GeomP1P1_F_sparc64_hpc_ace_double
+                     (t_nblist * gmx_restrict                nlist,
+                      rvec * gmx_restrict                    xx,
+                      rvec * gmx_restrict                    ff,
+                      t_forcerec * gmx_restrict              fr,
+                      t_mdatoms * gmx_restrict               mdatoms,
+                      nb_kernel_data_t * gmx_restrict        kernel_data,
+                      t_nrnb * gmx_restrict                  nrnb)
+ {
+     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+      * just 0 for non-waters.
+      * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+      * jnr indices corresponding to data put in the two positions in the SIMD register.
+      */
+     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+     int              jnrA,jnrB;
+     int              j_coord_offsetA,j_coord_offsetB;
+     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+     real             rcutoff_scalar;
+     real             *shiftvec,*fshift,*x,*f;
+     _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+     int              vdwioffset0;
+     _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+     int              vdwjidx0A,vdwjidx0B;
+     _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+     _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+     int              nvdwtype;
+     _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+     int              *vdwtype;
+     real             *vdwparam;
+     _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+     _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+     _fjsp_v2r8       itab_tmp;
+     _fjsp_v2r8       dummy_mask,cutoff_mask;
+     _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+     _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+     union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+     x                = xx[0];
+     f                = ff[0];
+     nri              = nlist->nri;
+     iinr             = nlist->iinr;
+     jindex           = nlist->jindex;
+     jjnr             = nlist->jjnr;
+     shiftidx         = nlist->shift;
+     gid              = nlist->gid; /* not referenced again in this force-only kernel */
+     shiftvec         = fr->shift_vec[0];
+     fshift           = fr->fshift[0];
+     nvdwtype         = fr->ntype;
+     vdwparam         = fr->nbfp;
+     vdwtype          = mdatoms->typeA;
+     rcutoff_scalar   = fr->rvdw;
+     rcutoff          = gmx_fjsp_set1_v2r8(rcutoff_scalar);
+     rcutoff2         = _fjsp_mul_v2r8(rcutoff,rcutoff);
+     sh_vdw_invrcut6  = gmx_fjsp_set1_v2r8(fr->ic->sh_invrc6); /* not referenced again in this force-only kernel */
+     rvdw             = gmx_fjsp_set1_v2r8(fr->rvdw); /* not referenced again in this kernel */
+     /* Give defined values to the j indices and offsets, which are otherwise
+      * first assigned inside the j loops, to avoid compiler warnings */
+     jnrA = jnrB = 0;
+     j_coord_offsetA = 0;
+     j_coord_offsetB = 0;
+     outeriter        = 0;
+     inneriter        = 0;
+     /* Start outer loop over neighborlists */
+     for(iidx=0; iidx<nri; iidx++)
+     {
+         /* Load shift vector for this list */
+         i_shift_offset   = DIM*shiftidx[iidx];
+         /* Load limits for loop over neighbors */
+         j_index_start    = jindex[iidx];
+         j_index_end      = jindex[iidx+1];
+         /* Get outer coordinate index */
+         inr              = iinr[iidx];
+         i_coord_offset   = DIM*inr;
+         /* Load i particle coords and add shift vector */
+         gmx_fjsp_load_shift_and_1rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
+         fix0             = _fjsp_setzero_v2r8();
+         fiy0             = _fjsp_setzero_v2r8();
+         fiz0             = _fjsp_setzero_v2r8();
+         /* Load parameters for i particles: offset of this i type's row in vdwparam,
+          * which holds two values (c6,c12) per type pair */
+         vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+         /* Start inner kernel loop: two j neighbors (A and B) per iteration */
+         for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+         {
+             /* Get j neighbor index, and coordinate index */
+             jnrA             = jjnr[jidx];
+             jnrB             = jjnr[jidx+1];
+             j_coord_offsetA  = DIM*jnrA;
+             j_coord_offsetB  = DIM*jnrB;
+             /* load j atom coordinates */
+             gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                               &jx0,&jy0,&jz0);
+             /* Calculate displacement vector */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             /* Calculate squared distance and things based on it */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rinvsq00         = gmx_fjsp_inv_v2r8(rsq00);
+             /* Load parameters for j particles */
+             vdwjidx0A        = 2*vdwtype[jnrA+0];
+             vdwjidx0B        = 2*vdwtype[jnrB+0];
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+             {
+             /* Compute parameters for interactions between i and j atoms */
+             gmx_fjsp_load_2pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,
+                                          vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
+             /* LENNARD-JONES DISPERSION/REPULSION */
+             rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+             fvdw             = _fjsp_mul_v2r8(_fjsp_msub_v2r8(c12_00,rinvsix,c6_00),_fjsp_mul_v2r8(rinvsix,rinvsq00));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+             fscal            = fvdw;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             
+             gmx_fjsp_decrement_fma_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fscal,dx00,dy00,dz00);
+             }
+             /* Inner loop uses 33 flops */
+         }
+         if(jidx<j_index_end) /* Remainder: one j neighbor is left when the neighbor count is odd */
+         {
+             jnrA             = jjnr[jidx];
+             j_coord_offsetA  = DIM*jnrA;
+             /* load j atom coordinates */
+             gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                               &jx0,&jy0,&jz0);
+             /* Calculate displacement vector */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             /* Calculate squared distance and things based on it */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rinvsq00         = gmx_fjsp_inv_v2r8(rsq00);
+             /* Load parameters for j particles */
+             vdwjidx0A        = 2*vdwtype[jnrA+0];
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+             {
+             /* Compute parameters for interactions between i and j atoms */
+             gmx_fjsp_load_1pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
+             /* LENNARD-JONES DISPERSION/REPULSION */
+             rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+             fvdw             = _fjsp_mul_v2r8(_fjsp_msub_v2r8(c12_00,rinvsix,c6_00),_fjsp_mul_v2r8(rinvsix,rinvsq00));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+             fscal            = fvdw;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force. The unpacklo with a zero vector above presumably
+              * clears the lane that holds no j particle in this remainder step
+              * (confirm against the intrinsic). */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             
+             gmx_fjsp_decrement_fma_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fscal,dx00,dy00,dz00);
+             }
+             /* Inner loop uses 33 flops */
+         }
+         /* End of innermost loop */
+         gmx_fjsp_update_iforce_1atom_swizzle_v2r8(fix0,fiy0,fiz0,
+                                               f+i_coord_offset,fshift+i_shift_offset);
+         /* Increment number of inner iterations (counted per j neighbor, not per SIMD step) */
+         inneriter                  += j_index_end - j_index_start;
+         /* Outer loop uses 6 flops */
+     }
+     /* Increment number of outer iterations */
+     outeriter        += nri;
+     /* Update outer/inner flops */
+     inc_nrnb(nrnb,eNR_NBKERNEL_VDW_F,outeriter*6 + inneriter*33);
+ }
index 0000000000000000000000000000000000000000,efe5597b6ef9c1c985df7222c57b5278479360b1..efe5597b6ef9c1c985df7222c57b5278479360b1
mode 000000,100644..100644
--- /dev/null
@@@ -1,0 -1,636 +1,636 @@@
+ /*
+  * This file is part of the GROMACS molecular simulation package.
+  *
+  * Copyright (c) 2012, by the GROMACS development team, led by
+  * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+  * others, as listed in the AUTHORS file in the top-level source
+  * directory and at http://www.gromacs.org.
+  *
+  * GROMACS is free software; you can redistribute it and/or
+  * modify it under the terms of the GNU Lesser General Public License
+  * as published by the Free Software Foundation; either version 2.1
+  * of the License, or (at your option) any later version.
+  *
+  * GROMACS is distributed in the hope that it will be useful,
+  * but WITHOUT ANY WARRANTY; without even the implied warranty of
+  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  * Lesser General Public License for more details.
+  *
+  * You should have received a copy of the GNU Lesser General Public
+  * License along with GROMACS; if not, see
+  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+  *
+  * If you want to redistribute modifications to GROMACS, please
+  * consider that scientific software is very special. Version
+  * control is crucial - bugs must be traceable. We will be happy to
+  * consider code for inclusion in the official distribution, but
+  * derived work must not be called official GROMACS. Details are found
+  * in the README & COPYING files - if they are missing, get the
+  * official version at http://www.gromacs.org.
+  *
+  * To help us fund GROMACS development, we humbly ask that you cite
+  * the research papers on the package. Check out http://www.gromacs.org.
+  */
+ /*
+  * Note: this file was generated by the GROMACS sparc64_hpc_ace_double kernel generator.
+  */
+ #ifdef HAVE_CONFIG_H
+ #include <config.h>
+ #endif
+ #include <math.h>
+ #include "../nb_kernel.h"
+ #include "types/simple.h"
+ #include "vec.h"
+ #include "nrnb.h"
+ #include "kernelutil_sparc64_hpc_ace_double.h"
+ /*
+  * Gromacs nonbonded kernel:   nb_kernel_ElecNone_VdwLJSw_GeomP1P1_VF_sparc64_hpc_ace_double
+  * Electrostatics interaction: None
+  * VdW interaction:            LennardJones
+  * Geometry:                   Particle-Particle
+  * Calculate force/pot:        PotentialAndForce
+  */
+ void
+ nb_kernel_ElecNone_VdwLJSw_GeomP1P1_VF_sparc64_hpc_ace_double
+                     (t_nblist * gmx_restrict                nlist,
+                      rvec * gmx_restrict                    xx,
+                      rvec * gmx_restrict                    ff,
+                      t_forcerec * gmx_restrict              fr,
+                      t_mdatoms * gmx_restrict               mdatoms,
+                      nb_kernel_data_t * gmx_restrict        kernel_data,
+                      t_nrnb * gmx_restrict                  nrnb)
+ {
+     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+      * just 0 for non-waters.
+      * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+      * jnr indices corresponding to data put in the four positions in the SIMD register.
+      */
+     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+     int              jnrA,jnrB;
+     int              j_coord_offsetA,j_coord_offsetB;
+     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+     real             rcutoff_scalar;
+     real             *shiftvec,*fshift,*x,*f;
+     _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+     int              vdwioffset0;
+     _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+     int              vdwjidx0A,vdwjidx0B;
+     _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+     _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+     int              nvdwtype;
+     _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+     int              *vdwtype;
+     real             *vdwparam;
+     _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+     _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+     _fjsp_v2r8       rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
+     real             rswitch_scalar,d_scalar;
+     _fjsp_v2r8       itab_tmp;
+     _fjsp_v2r8       dummy_mask,cutoff_mask;
+     _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+     _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+     union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+     x                = xx[0];
+     f                = ff[0];
+     nri              = nlist->nri;
+     iinr             = nlist->iinr;
+     jindex           = nlist->jindex;
+     jjnr             = nlist->jjnr;
+     shiftidx         = nlist->shift;
+     gid              = nlist->gid;
+     shiftvec         = fr->shift_vec[0];
+     fshift           = fr->fshift[0];
+     nvdwtype         = fr->ntype;
+     vdwparam         = fr->nbfp;
+     vdwtype          = mdatoms->typeA;
+     rcutoff_scalar   = fr->rvdw;
+     rcutoff          = gmx_fjsp_set1_v2r8(rcutoff_scalar);
+     rcutoff2         = _fjsp_mul_v2r8(rcutoff,rcutoff);
+     rswitch_scalar   = fr->rvdw_switch;
+     rswitch          = gmx_fjsp_set1_v2r8(rswitch_scalar);
+     /* Setup switch parameters */
+     d_scalar         = rcutoff_scalar-rswitch_scalar;
+     d                = gmx_fjsp_set1_v2r8(d_scalar);
+     swV3             = gmx_fjsp_set1_v2r8(-10.0/(d_scalar*d_scalar*d_scalar));
+     swV4             = gmx_fjsp_set1_v2r8( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
+     swV5             = gmx_fjsp_set1_v2r8( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
+     swF2             = gmx_fjsp_set1_v2r8(-30.0/(d_scalar*d_scalar*d_scalar));
+     swF3             = gmx_fjsp_set1_v2r8( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
+     swF4             = gmx_fjsp_set1_v2r8(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
+     /* Avoid stupid compiler warnings */
+     jnrA = jnrB = 0;
+     j_coord_offsetA = 0;
+     j_coord_offsetB = 0;
+     outeriter        = 0;
+     inneriter        = 0;
+     /* Start outer loop over neighborlists */
+     for(iidx=0; iidx<nri; iidx++)
+     {
+         /* Load shift vector for this list */
+         i_shift_offset   = DIM*shiftidx[iidx];
+         /* Load limits for loop over neighbors */
+         j_index_start    = jindex[iidx];
+         j_index_end      = jindex[iidx+1];
+         /* Get outer coordinate index */
+         inr              = iinr[iidx];
+         i_coord_offset   = DIM*inr;
+         /* Load i particle coords and add shift vector */
+         gmx_fjsp_load_shift_and_1rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
+         fix0             = _fjsp_setzero_v2r8();
+         fiy0             = _fjsp_setzero_v2r8();
+         fiz0             = _fjsp_setzero_v2r8();
+         /* Load parameters for i particles */
+         vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+         /* Reset potential sums */
+         vvdwsum          = _fjsp_setzero_v2r8();
+         /* Start inner kernel loop */
+         for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+         {
+             /* Get j neighbor index, and coordinate index */
+             jnrA             = jjnr[jidx];
+             jnrB             = jjnr[jidx+1];
+             j_coord_offsetA  = DIM*jnrA;
+             j_coord_offsetB  = DIM*jnrB;
+             /* load j atom coordinates */
+             gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                               &jx0,&jy0,&jz0);
+             /* Calculate displacement vector */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             /* Calculate squared distance and things based on it */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+             rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+             /* Load parameters for j particles */
+             vdwjidx0A        = 2*vdwtype[jnrA+0];
+             vdwjidx0B        = 2*vdwtype[jnrB+0];
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+             {
+             r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+             /* Compute parameters for interactions between i and j atoms */
+             gmx_fjsp_load_2pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,
+                                          vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
+             /* LENNARD-JONES DISPERSION/REPULSION */
+             rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+             vvdw6            = _fjsp_mul_v2r8(c6_00,rinvsix);
+             vvdw12           = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
+             vvdw             = _fjsp_msub_v2r8( vvdw12,one_twelfth, _fjsp_mul_v2r8(vvdw6,one_sixth) );
+             fvdw             = _fjsp_mul_v2r8(_fjsp_sub_v2r8(vvdw12,vvdw6),rinvsq00);
+             d                = _fjsp_sub_v2r8(r00,rswitch);
+             d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+             d2               = _fjsp_mul_v2r8(d,d);
+             sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+             dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+             /* Evaluate switch function */
+             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+             fvdw             = _fjsp_msub_v2r8( fvdw,sw , _fjsp_mul_v2r8(rinv00,_fjsp_mul_v2r8(vvdw,dsw)) );
+             vvdw             = _fjsp_mul_v2r8(vvdw,sw);
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             vvdw             = _fjsp_and_v2r8(vvdw,cutoff_mask);
+             vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+             fscal            = fvdw;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             
+             gmx_fjsp_decrement_fma_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fscal,dx00,dy00,dz00);
+             }
+             /* Inner loop uses 62 flops */
+         }
+         if(jidx<j_index_end)
+         {
+             jnrA             = jjnr[jidx];
+             j_coord_offsetA  = DIM*jnrA;
+             /* load j atom coordinates */
+             gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                               &jx0,&jy0,&jz0);
+             /* Calculate displacement vector */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             /* Calculate squared distance and things based on it */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+             rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+             /* Load parameters for j particles */
+             vdwjidx0A        = 2*vdwtype[jnrA+0];
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+             {
+             r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+             /* Compute parameters for interactions between i and j atoms */
+             gmx_fjsp_load_1pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
+             /* LENNARD-JONES DISPERSION/REPULSION */
+             rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+             vvdw6            = _fjsp_mul_v2r8(c6_00,rinvsix);
+             vvdw12           = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
+             vvdw             = _fjsp_msub_v2r8( vvdw12,one_twelfth, _fjsp_mul_v2r8(vvdw6,one_sixth) );
+             fvdw             = _fjsp_mul_v2r8(_fjsp_sub_v2r8(vvdw12,vvdw6),rinvsq00);
+             d                = _fjsp_sub_v2r8(r00,rswitch);
+             d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+             d2               = _fjsp_mul_v2r8(d,d);
+             sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+             dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+             /* Evaluate switch function */
+             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+             fvdw             = _fjsp_msub_v2r8( fvdw,sw , _fjsp_mul_v2r8(rinv00,_fjsp_mul_v2r8(vvdw,dsw)) );
+             vvdw             = _fjsp_mul_v2r8(vvdw,sw);
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             vvdw             = _fjsp_and_v2r8(vvdw,cutoff_mask);
+             vvdw             = _fjsp_unpacklo_v2r8(vvdw,_fjsp_setzero_v2r8());
+             vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+             fscal            = fvdw;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             
+             gmx_fjsp_decrement_fma_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fscal,dx00,dy00,dz00);
+             }
+             /* Inner loop uses 62 flops */
+         }
+         /* End of innermost loop */
+         gmx_fjsp_update_iforce_1atom_swizzle_v2r8(fix0,fiy0,fiz0,
+                                               f+i_coord_offset,fshift+i_shift_offset);
+         ggid                        = gid[iidx];
+         /* Update potential energies */
+         gmx_fjsp_update_1pot_v2r8(vvdwsum,kernel_data->energygrp_vdw+ggid);
+         /* Increment number of inner iterations */
+         inneriter                  += j_index_end - j_index_start;
+         /* Outer loop uses 7 flops */
+     }
+     /* Increment number of outer iterations */
+     outeriter        += nri;
+     /* Update outer/inner flops */
+     inc_nrnb(nrnb,eNR_NBKERNEL_VDW_VF,outeriter*7 + inneriter*62);
+ }
+ /*
+  * Gromacs nonbonded kernel:   nb_kernel_ElecNone_VdwLJSw_GeomP1P1_F_sparc64_hpc_ace_double
+  * Electrostatics interaction: None
+  * VdW interaction:            LennardJones, scaled by a switch function between
+  *                             fr->rvdw_switch and fr->rvdw and masked beyond fr->rvdw
+  * Geometry:                   Particle-Particle
+  * Calculate force/pot:        Force
+  *
+  * For every i entry of the neighborlist the kernel walks the j entries two at a
+  * time, accumulates the switched Lennard-Jones force on the i atom, and hands
+  * the per-pair force to the gmx_fjsp_decrement_fma_* helpers to update the j
+  * atoms. The accumulated i force is passed to
+  * gmx_fjsp_update_iforce_1atom_swizzle_v2r8() together with the i-atom slot in
+  * ff and the matching slot in fr->fshift. No energies are written.
+  *
+  * nlist        neighborlist; nri, iinr, jindex, jjnr and shift are read
+  * xx           coordinates (read only here)
+  * ff           forces, updated in place
+  * fr           force record; shift_vec, fshift, ntype, nbfp, rvdw and
+  *              rvdw_switch are read
+  * mdatoms      atom data; typeA selects the VdW parameters
+  * kernel_data  not referenced by this force-only kernel
+  * nrnb         flop counters, incremented under eNR_NBKERNEL_VDW_F
+  */
+ void
+ nb_kernel_ElecNone_VdwLJSw_GeomP1P1_F_sparc64_hpc_ace_double
+                     (t_nblist * gmx_restrict                nlist,
+                      rvec * gmx_restrict                    xx,
+                      rvec * gmx_restrict                    ff,
+                      t_forcerec * gmx_restrict              fr,
+                      t_mdatoms * gmx_restrict               mdatoms,
+                      nb_kernel_data_t * gmx_restrict        kernel_data,
+                      t_nrnb * gmx_restrict                  nrnb)
+ {
+     /* Suffix 0 is the particle index within the i/j group (always 0 for this
+      * particle-particle geometry); the doubled 00 marks the i0-j0 pair.
+      * Suffixes A,B refer to the two j atoms that are processed together in one
+      * double precision SIMD register by the unrolled j loop.
+      */
+     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+     int              j_index_start,j_index_end,jidx,nri,inr,iidx;
+     int              jnrA,jnrB;
+     int              j_coord_offsetA,j_coord_offsetB;
+     int              *iinr,*jindex,*jjnr,*shiftidx;
+     real             rcutoff_scalar;
+     real             *shiftvec,*fshift,*x,*f;
+     _fjsp_v2r8       fscal,rcutoff,rcutoff2;
+     int              vdwioffset0;
+     _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0;
+     int              vdwjidx0A,vdwjidx0B;
+     _fjsp_v2r8       jx0,jy0,jz0;
+     _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,c6_00,c12_00;
+     int              nvdwtype;
+     _fjsp_v2r8       rinvsix,vvdw,vvdw6,vvdw12,fvdw;
+     int              *vdwtype;
+     real             *vdwparam;
+     _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+     _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+     _fjsp_v2r8       rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
+     real             rswitch_scalar,d_scalar;
+     _fjsp_v2r8       cutoff_mask;
+     _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+     /* Flat views of the coordinate and force arrays */
+     x                = xx[0];
+     f                = ff[0];
+     /* Neighborlist data */
+     nri              = nlist->nri;
+     iinr             = nlist->iinr;
+     jindex           = nlist->jindex;
+     jjnr             = nlist->jjnr;
+     shiftidx         = nlist->shift;
+     /* Shift vectors and shift forces */
+     shiftvec         = fr->shift_vec[0];
+     fshift           = fr->fshift[0];
+     /* VdW parameter table and per-atom type indices */
+     nvdwtype         = fr->ntype;
+     vdwparam         = fr->nbfp;
+     vdwtype          = mdatoms->typeA;
+     /* Cut-off radius (squared, for comparison against rsq) and switch start */
+     rcutoff_scalar   = fr->rvdw;
+     rcutoff          = gmx_fjsp_set1_v2r8(rcutoff_scalar);
+     rcutoff2         = _fjsp_mul_v2r8(rcutoff,rcutoff);
+     rswitch_scalar   = fr->rvdw_switch;
+     rswitch          = gmx_fjsp_set1_v2r8(rswitch_scalar);
+     /* Setup switch parameters; d_scalar is the width of the switching region.
+      * swV3..swV5 are the polynomial coefficients of the switch, swF2..swF4
+      * those of its derivative.
+      */
+     d_scalar         = rcutoff_scalar-rswitch_scalar;
+     swV3             = gmx_fjsp_set1_v2r8(-10.0/(d_scalar*d_scalar*d_scalar));
+     swV4             = gmx_fjsp_set1_v2r8( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
+     swV5             = gmx_fjsp_set1_v2r8( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
+     swF2             = gmx_fjsp_set1_v2r8(-30.0/(d_scalar*d_scalar*d_scalar));
+     swF3             = gmx_fjsp_set1_v2r8( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
+     swF4             = gmx_fjsp_set1_v2r8(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
+     /* Pre-set the j bookkeeping so compilers do not warn about possibly
+      * uninitialized use; every value is overwritten before it is read.
+      */
+     jnrA = jnrB = 0;
+     j_coord_offsetA = 0;
+     j_coord_offsetB = 0;
+     outeriter        = 0;
+     inneriter        = 0;
+     /* Start outer loop over neighborlists */
+     for(iidx=0; iidx<nri; iidx++)
+     {
+         /* Load shift vector for this list */
+         i_shift_offset   = DIM*shiftidx[iidx];
+         /* Load limits for loop over neighbors */
+         j_index_start    = jindex[iidx];
+         j_index_end      = jindex[iidx+1];
+         /* Get outer coordinate index */
+         inr              = iinr[iidx];
+         i_coord_offset   = DIM*inr;
+         /* Load i particle coords and add shift vector */
+         gmx_fjsp_load_shift_and_1rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
+         fix0             = _fjsp_setzero_v2r8();
+         fiy0             = _fjsp_setzero_v2r8();
+         fiz0             = _fjsp_setzero_v2r8();
+         /* Load parameters for i particles: start of the row of (c6,c12) pairs
+          * for the i atom type in vdwparam.
+          */
+         vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+         /* Start inner kernel loop: two j atoms per iteration */
+         for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+         {
+             /* Get j neighbor index, and coordinate index */
+             jnrA             = jjnr[jidx];
+             jnrB             = jjnr[jidx+1];
+             j_coord_offsetA  = DIM*jnrA;
+             j_coord_offsetB  = DIM*jnrB;
+             /* load j atom coordinates */
+             gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                               &jx0,&jy0,&jz0);
+             /* Calculate displacement vector */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             /* Calculate squared distance and things based on it */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+             rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+             /* Load parameters for j particles */
+             vdwjidx0A        = 2*vdwtype[jnrA+0];
+             vdwjidx0B        = 2*vdwtype[jnrB+0];
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* Skip the pair of j atoms when neither is inside the cut-off */
+             if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+             {
+                 r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+                 /* Compute parameters for interactions between i and j atoms */
+                 gmx_fjsp_load_2pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,
+                                              vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
+                 /* LENNARD-JONES DISPERSION/REPULSION */
+                 rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+                 vvdw6            = _fjsp_mul_v2r8(c6_00,rinvsix);
+                 vvdw12           = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
+                 /* The unswitched potential is still needed here: it enters the
+                  * force through the derivative of the switch function.
+                  */
+                 vvdw             = _fjsp_msub_v2r8( vvdw12,one_twelfth, _fjsp_mul_v2r8(vvdw6,one_sixth) );
+                 fvdw             = _fjsp_mul_v2r8(_fjsp_sub_v2r8(vvdw12,vvdw6),rinvsq00);
+                 /* Distance past the switch start, clamped at zero so that
+                  * d2 (and with it dsw) vanishes for r below rswitch.
+                  */
+                 d                = _fjsp_sub_v2r8(r00,rswitch);
+                 d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+                 d2               = _fjsp_mul_v2r8(d,d);
+                 sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+                 dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+                 /* Evaluate switch function */
+                 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+                 fvdw             = _fjsp_msub_v2r8( fvdw,sw , _fjsp_mul_v2r8(rinv00,_fjsp_mul_v2r8(vvdw,dsw)) );
+                 /* Zero the force of any element that lies beyond the cut-off */
+                 cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+                 fscal            = fvdw;
+                 fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+                 /* Update vectorial force */
+                 fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+                 fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+                 fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+                 /* Apply the pair force to both j atoms */
+                 gmx_fjsp_decrement_fma_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fscal,dx00,dy00,dz00);
+             }
+             /* Inner loop uses 59 flops */
+         }
+         /* Epilogue: one j atom is left over when the j count is odd */
+         if(jidx<j_index_end)
+         {
+             jnrA             = jjnr[jidx];
+             j_coord_offsetA  = DIM*jnrA;
+             /* load j atom coordinates */
+             gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                               &jx0,&jy0,&jz0);
+             /* Calculate displacement vector */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             /* Calculate squared distance and things based on it */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+             rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+             /* Load parameters for j particles */
+             vdwjidx0A        = 2*vdwtype[jnrA+0];
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+             {
+                 r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+                 /* Compute parameters for interactions between i and j atoms */
+                 gmx_fjsp_load_1pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
+                 /* LENNARD-JONES DISPERSION/REPULSION */
+                 rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+                 vvdw6            = _fjsp_mul_v2r8(c6_00,rinvsix);
+                 vvdw12           = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
+                 vvdw             = _fjsp_msub_v2r8( vvdw12,one_twelfth, _fjsp_mul_v2r8(vvdw6,one_sixth) );
+                 fvdw             = _fjsp_mul_v2r8(_fjsp_sub_v2r8(vvdw12,vvdw6),rinvsq00);
+                 d                = _fjsp_sub_v2r8(r00,rswitch);
+                 d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+                 d2               = _fjsp_mul_v2r8(d,d);
+                 sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+                 dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+                 /* Evaluate switch function */
+                 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+                 fvdw             = _fjsp_msub_v2r8( fvdw,sw , _fjsp_mul_v2r8(rinv00,_fjsp_mul_v2r8(vvdw,dsw)) );
+                 cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+                 fscal            = fvdw;
+                 fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+                 /* Only one j atom is valid here; the unpack with a zero vector
+                  * presumably keeps the first element and clears the other one
+                  * (confirm against the _fjsp_unpacklo_v2r8 definition).
+                  */
+                 fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+                 /* Update vectorial force */
+                 fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+                 fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+                 fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+                 /* Apply the pair force to the single j atom */
+                 gmx_fjsp_decrement_fma_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fscal,dx00,dy00,dz00);
+             }
+             /* Inner loop uses 59 flops */
+         }
+         /* End of innermost loop */
+         gmx_fjsp_update_iforce_1atom_swizzle_v2r8(fix0,fiy0,fiz0,
+                                               f+i_coord_offset,fshift+i_shift_offset);
+         /* Increment number of inner iterations */
+         inneriter                  += j_index_end - j_index_start;
+         /* Outer loop uses 6 flops */
+     }
+     /* Increment number of outer iterations */
+     outeriter        += nri;
+     /* Update outer/inner flops */
+     inc_nrnb(nrnb,eNR_NBKERNEL_VDW_F,outeriter*6 + inneriter*59);
+ }
index 0000000000000000000000000000000000000000,8a22af6b77d4d9ad24cf97b42a1a9a7af11b5474..8a22af6b77d4d9ad24cf97b42a1a9a7af11b5474
mode 000000,100644..100644
--- /dev/null
@@@ -1,0 -1,498 +1,498 @@@
+ /*
+  * This file is part of the GROMACS molecular simulation package.
+  *
+  * Copyright (c) 2012, by the GROMACS development team, led by
+  * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+  * others, as listed in the AUTHORS file in the top-level source
+  * directory and at http://www.gromacs.org.
+  *
+  * GROMACS is free software; you can redistribute it and/or
+  * modify it under the terms of the GNU Lesser General Public License
+  * as published by the Free Software Foundation; either version 2.1
+  * of the License, or (at your option) any later version.
+  *
+  * GROMACS is distributed in the hope that it will be useful,
+  * but WITHOUT ANY WARRANTY; without even the implied warranty of
+  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  * Lesser General Public License for more details.
+  *
+  * You should have received a copy of the GNU Lesser General Public
+  * License along with GROMACS; if not, see
+  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+  *
+  * If you want to redistribute modifications to GROMACS, please
+  * consider that scientific software is very special. Version
+  * control is crucial - bugs must be traceable. We will be happy to
+  * consider code for inclusion in the official distribution, but
+  * derived work must not be called official GROMACS. Details are found
+  * in the README & COPYING files - if they are missing, get the
+  * official version at http://www.gromacs.org.
+  *
+  * To help us fund GROMACS development, we humbly ask that you cite
+  * the research papers on the package. Check out http://www.gromacs.org.
+  */
+ /*
+  * Note: this file was generated by the GROMACS sparc64_hpc_ace_double kernel generator.
+  */
+ #ifdef HAVE_CONFIG_H
+ #include <config.h>
+ #endif
+ #include <math.h>
+ #include "../nb_kernel.h"
+ #include "types/simple.h"
+ #include "vec.h"
+ #include "nrnb.h"
+ #include "kernelutil_sparc64_hpc_ace_double.h"
+ /*
+  * Gromacs nonbonded kernel:   nb_kernel_ElecNone_VdwLJ_GeomP1P1_VF_sparc64_hpc_ace_double
+  * Electrostatics interaction: None
+  * VdW interaction:            LennardJones
+  * Geometry:                   Particle-Particle
+  * Calculate force/pot:        PotentialAndForce
+  *
+  * For every i entry of the neighborlist the kernel walks the j entries two at a
+  * time and evaluates plain Lennard-Jones for each pair; no cut-off test or mask
+  * is applied inside this kernel. The per-pair force is handed to the
+  * gmx_fjsp_decrement_fma_* helpers to update the j atoms, the accumulated i
+  * force is passed to gmx_fjsp_update_iforce_1atom_swizzle_v2r8() together with
+  * the i-atom slot in ff and the matching slot in fr->fshift, and the summed
+  * potential is passed to gmx_fjsp_update_1pot_v2r8() with the energy-group
+  * slot kernel_data->energygrp_vdw[gid[iidx]].
+  *
+  * nlist        neighborlist; nri, iinr, jindex, jjnr, shift and gid are read
+  * xx           coordinates (read only here)
+  * ff           forces, updated in place
+  * fr           force record; shift_vec, fshift, ntype and nbfp are read
+  * mdatoms      atom data; typeA selects the VdW parameters
+  * kernel_data  energygrp_vdw receives the VdW energies
+  * nrnb         flop counters, incremented under eNR_NBKERNEL_VDW_VF
+  */
+ void
+ nb_kernel_ElecNone_VdwLJ_GeomP1P1_VF_sparc64_hpc_ace_double
+                     (t_nblist * gmx_restrict                nlist,
+                      rvec * gmx_restrict                    xx,
+                      rvec * gmx_restrict                    ff,
+                      t_forcerec * gmx_restrict              fr,
+                      t_mdatoms * gmx_restrict               mdatoms,
+                      nb_kernel_data_t * gmx_restrict        kernel_data,
+                      t_nrnb * gmx_restrict                  nrnb)
+ {
+     /* Suffix 0 is the particle index within the i/j group (always 0 for this
+      * particle-particle geometry); the doubled 00 marks the i0-j0 pair.
+      * Suffixes A,B refer to the two j atoms that are processed together in one
+      * double precision SIMD register by the unrolled j loop.
+      */
+     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+     int              jnrA,jnrB;
+     int              j_coord_offsetA,j_coord_offsetB;
+     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+     real             *shiftvec,*fshift,*x,*f;
+     _fjsp_v2r8       fscal;
+     int              vdwioffset0;
+     _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0;
+     int              vdwjidx0A,vdwjidx0B;
+     _fjsp_v2r8       jx0,jy0,jz0;
+     _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinvsq00,c6_00,c12_00;
+     int              nvdwtype;
+     _fjsp_v2r8       rinvsix,vvdw,vvdw6,vvdw12,fvdw,vvdwsum;
+     int              *vdwtype;
+     real             *vdwparam;
+     _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+     _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+     /* Flat views of the coordinate and force arrays */
+     x                = xx[0];
+     f                = ff[0];
+     /* Neighborlist data */
+     nri              = nlist->nri;
+     iinr             = nlist->iinr;
+     jindex           = nlist->jindex;
+     jjnr             = nlist->jjnr;
+     shiftidx         = nlist->shift;
+     gid              = nlist->gid;
+     /* Shift vectors and shift forces */
+     shiftvec         = fr->shift_vec[0];
+     fshift           = fr->fshift[0];
+     /* VdW parameter table and per-atom type indices */
+     nvdwtype         = fr->ntype;
+     vdwparam         = fr->nbfp;
+     vdwtype          = mdatoms->typeA;
+     /* Pre-set the j bookkeeping so compilers do not warn about possibly
+      * uninitialized use; every value is overwritten before it is read.
+      */
+     jnrA = jnrB = 0;
+     j_coord_offsetA = 0;
+     j_coord_offsetB = 0;
+     outeriter        = 0;
+     inneriter        = 0;
+     /* Start outer loop over neighborlists */
+     for(iidx=0; iidx<nri; iidx++)
+     {
+         /* Load shift vector for this list */
+         i_shift_offset   = DIM*shiftidx[iidx];
+         /* Load limits for loop over neighbors */
+         j_index_start    = jindex[iidx];
+         j_index_end      = jindex[iidx+1];
+         /* Get outer coordinate index */
+         inr              = iinr[iidx];
+         i_coord_offset   = DIM*inr;
+         /* Load i particle coords and add shift vector */
+         gmx_fjsp_load_shift_and_1rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
+         fix0             = _fjsp_setzero_v2r8();
+         fiy0             = _fjsp_setzero_v2r8();
+         fiz0             = _fjsp_setzero_v2r8();
+         /* Load parameters for i particles: start of the row of (c6,c12) pairs
+          * for the i atom type in vdwparam.
+          */
+         vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+         /* Reset potential sums */
+         vvdwsum          = _fjsp_setzero_v2r8();
+         /* Start inner kernel loop: two j atoms per iteration */
+         for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+         {
+             /* Get j neighbor index, and coordinate index */
+             jnrA             = jjnr[jidx];
+             jnrB             = jjnr[jidx+1];
+             j_coord_offsetA  = DIM*jnrA;
+             j_coord_offsetB  = DIM*jnrB;
+             /* load j atom coordinates */
+             gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                               &jx0,&jy0,&jz0);
+             /* Calculate displacement vector */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             /* Calculate squared distance and things based on it; only 1/r^2
+              * is needed, so no inverse square root is taken.
+              */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rinvsq00         = gmx_fjsp_inv_v2r8(rsq00);
+             /* Load parameters for j particles */
+             vdwjidx0A        = 2*vdwtype[jnrA+0];
+             vdwjidx0B        = 2*vdwtype[jnrB+0];
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* Compute parameters for interactions between i and j atoms */
+             gmx_fjsp_load_2pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,
+                                          vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
+             /* LENNARD-JONES DISPERSION/REPULSION */
+             rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+             vvdw6            = _fjsp_mul_v2r8(c6_00,rinvsix);
+             vvdw12           = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
+             vvdw             = _fjsp_msub_v2r8( vvdw12,one_twelfth, _fjsp_mul_v2r8(vvdw6,one_sixth) );
+             fvdw             = _fjsp_mul_v2r8(_fjsp_sub_v2r8(vvdw12,vvdw6),rinvsq00);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+             fscal            = fvdw;
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             /* Apply the pair force to both j atoms */
+             gmx_fjsp_decrement_fma_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fscal,dx00,dy00,dz00);
+             /* Inner loop uses 35 flops */
+         }
+         /* Epilogue: one j atom is left over when the j count is odd */
+         if(jidx<j_index_end)
+         {
+             jnrA             = jjnr[jidx];
+             j_coord_offsetA  = DIM*jnrA;
+             /* load j atom coordinates */
+             gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                               &jx0,&jy0,&jz0);
+             /* Calculate displacement vector */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             /* Calculate squared distance and things based on it */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rinvsq00         = gmx_fjsp_inv_v2r8(rsq00);
+             /* Load parameters for j particles */
+             vdwjidx0A        = 2*vdwtype[jnrA+0];
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* Compute parameters for interactions between i and j atoms */
+             gmx_fjsp_load_1pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
+             /* LENNARD-JONES DISPERSION/REPULSION */
+             rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+             vvdw6            = _fjsp_mul_v2r8(c6_00,rinvsix);
+             vvdw12           = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
+             vvdw             = _fjsp_msub_v2r8( vvdw12,one_twelfth, _fjsp_mul_v2r8(vvdw6,one_sixth) );
+             fvdw             = _fjsp_mul_v2r8(_fjsp_sub_v2r8(vvdw12,vvdw6),rinvsq00);
+             /* Update potential sum for this i atom from the interaction with this j atom.
+              * Only one j atom is valid here; the unpack with a zero vector
+              * presumably keeps the first element and clears the other one
+              * (confirm against the _fjsp_unpacklo_v2r8 definition).
+              */
+             vvdw             = _fjsp_unpacklo_v2r8(vvdw,_fjsp_setzero_v2r8());
+             vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+             fscal            = fvdw;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             /* Apply the pair force to the single j atom */
+             gmx_fjsp_decrement_fma_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fscal,dx00,dy00,dz00);
+             /* Inner loop uses 35 flops */
+         }
+         /* End of innermost loop */
+         gmx_fjsp_update_iforce_1atom_swizzle_v2r8(fix0,fiy0,fiz0,
+                                               f+i_coord_offset,fshift+i_shift_offset);
+         /* Energy group index for this i entry */
+         ggid                        = gid[iidx];
+         /* Update potential energies */
+         gmx_fjsp_update_1pot_v2r8(vvdwsum,kernel_data->energygrp_vdw+ggid);
+         /* Increment number of inner iterations */
+         inneriter                  += j_index_end - j_index_start;
+         /* Outer loop uses 7 flops */
+     }
+     /* Increment number of outer iterations */
+     outeriter        += nri;
+     /* Update outer/inner flops */
+     inc_nrnb(nrnb,eNR_NBKERNEL_VDW_VF,outeriter*7 + inneriter*35);
+ }
+ /*
+  * Gromacs nonbonded kernel:   nb_kernel_ElecNone_VdwLJ_GeomP1P1_F_sparc64_hpc_ace_double
+  * Electrostatics interaction: None
+  * VdW interaction:            LennardJones
+  * Geometry:                   Particle-Particle
+  * Calculate force/pot:        Force
+  *
+  * Walks the nri entries of the neighborlist. For each i particle the j
+  * neighbors jjnr[jindex[iidx]] .. jjnr[jindex[iidx+1]-1] are processed two
+  * at a time (SIMD lanes A and B), followed by a single-particle epilogue
+  * when the neighbor count is odd. Only forces are produced: no potential
+  * sums are kept in this kernel.
+  *
+  * Parameters, as used here:
+  *   nlist       - neighborlist; nri, iinr, jindex, jjnr and shift are read
+  *                 (gid is fetched but not used by this force-only kernel).
+  *   xx          - coordinates, accessed as a flat real array with DIM reals per particle.
+  *   ff          - forces, same flat layout; passed to the force-update helpers
+  *                 for both the i particle and the j particles.
+  *   fr          - supplies shift_vec, fshift, ntype and the nbfp parameter table.
+  *   mdatoms     - supplies the per-particle VdW type index (typeA).
+  *   kernel_data - not referenced in this kernel.
+  *   nrnb        - flop accounting, incremented once at the end via inc_nrnb().
+  */
+ void
+ nb_kernel_ElecNone_VdwLJ_GeomP1P1_F_sparc64_hpc_ace_double
+                     (t_nblist * gmx_restrict                nlist,
+                      rvec * gmx_restrict                    xx,
+                      rvec * gmx_restrict                    ff,
+                      t_forcerec * gmx_restrict              fr,
+                      t_mdatoms * gmx_restrict               mdatoms,
+                      nb_kernel_data_t * gmx_restrict        kernel_data,
+                      t_nrnb * gmx_restrict                  nrnb)
+ {
+     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+      * just 0 for non-waters.
+      * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+      * jnr indices corresponding to data put in the two positions in the SIMD register.
+      *
+      * The declaration list below is generator boilerplate: several of these locals
+      * (for example tx, ty, tz, rcutoff, rcutoff2, jidxall, vfconv, gbconv, ewconv)
+      * are never referenced in this particular kernel.
+      */
+     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+     int              jnrA,jnrB;
+     int              j_coord_offsetA,j_coord_offsetB;
+     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+     real             rcutoff_scalar;
+     real             *shiftvec,*fshift,*x,*f;
+     _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+     int              vdwioffset0;
+     _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+     int              vdwjidx0A,vdwjidx0B;
+     _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+     _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+     int              nvdwtype;
+     _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+     int              *vdwtype;
+     real             *vdwparam;
+     _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+     _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+     _fjsp_v2r8       itab_tmp;
+     _fjsp_v2r8       dummy_mask,cutoff_mask;
+     _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+     _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+     union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+     x                = xx[0]; /* flat view of the coordinates: particle p starts at x+DIM*p */
+     f                = ff[0]; /* flat view of the forces, indexed the same way */
+     nri              = nlist->nri;
+     iinr             = nlist->iinr;
+     jindex           = nlist->jindex;
+     jjnr             = nlist->jjnr;
+     shiftidx         = nlist->shift;
+     gid              = nlist->gid;
+     shiftvec         = fr->shift_vec[0];
+     fshift           = fr->fshift[0];
+     nvdwtype         = fr->ntype;
+     vdwparam         = fr->nbfp;
+     vdwtype          = mdatoms->typeA;
+     /* Avoid compiler warnings: these are otherwise only assigned inside the j loops */
+     jnrA = jnrB = 0;
+     j_coord_offsetA = 0;
+     j_coord_offsetB = 0;
+     outeriter        = 0;
+     inneriter        = 0;
+     /* Start outer loop over neighborlists */
+     for(iidx=0; iidx<nri; iidx++)
+     {
+         /* Load shift vector for this list */
+         i_shift_offset   = DIM*shiftidx[iidx];
+         /* Load limits for loop over neighbors */
+         j_index_start    = jindex[iidx];
+         j_index_end      = jindex[iidx+1];
+         /* Get outer coordinate index */
+         inr              = iinr[iidx];
+         i_coord_offset   = DIM*inr;
+         /* Load i particle coords and add shift vector */
+         gmx_fjsp_load_shift_and_1rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
+         fix0             = _fjsp_setzero_v2r8();
+         fiy0             = _fjsp_setzero_v2r8();
+         fiz0             = _fjsp_setzero_v2r8();
+         /* Load parameters for i particles: offset of the i type's row in vdwparam
+          * (2 reals per type pair, read below as c6 and c12) */
+         vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+         /* Start inner kernel loop: two j particles (A,B) per iteration;
+          * an odd leftover particle is handled after the loop */
+         for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+         {
+             /* Get j neighbor index, and coordinate index */
+             jnrA             = jjnr[jidx];
+             jnrB             = jjnr[jidx+1];
+             j_coord_offsetA  = DIM*jnrA;
+             j_coord_offsetB  = DIM*jnrB;
+             /* load j atom coordinates */
+             gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                               &jx0,&jy0,&jz0);
+             /* Calculate displacement vector */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             /* Calculate squared distance and things based on it */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rinvsq00         = gmx_fjsp_inv_v2r8(rsq00);
+             /* Load parameters for j particles: offsets of the j types within the i type's row */
+             vdwjidx0A        = 2*vdwtype[jnrA+0];
+             vdwjidx0B        = 2*vdwtype[jnrB+0];
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* Compute parameters for interactions between i and j atoms */
+             gmx_fjsp_load_2pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,
+                                          vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
+             /* LENNARD-JONES DISPERSION/REPULSION
+              * rinvsix is rinvsq00 cubed; fvdw = msub(c12,rinvsix,c6)*rinvsix*rinvsq00.
+              * NOTE(review): presumably msub(a,b,c) is a*b-c, giving (c12*r^-6 - c6)*r^-8 - confirm
+              * against the intrinsic documentation. */
+             rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+             fvdw             = _fjsp_mul_v2r8(_fjsp_msub_v2r8(c12_00,rinvsix,c6_00),_fjsp_mul_v2r8(rinvsix,rinvsq00));
+             fscal            = fvdw;
+             /* Update vectorial force: accumulate on the i particle, then hand the same
+              * scalar and displacement to the helper that updates the two j particles */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             
+             gmx_fjsp_decrement_fma_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fscal,dx00,dy00,dz00);
+             /* Inner loop uses 30 flops */
+         }
+         if(jidx<j_index_end) /* odd neighbor count: one j particle is left, handled via the 1ptr helpers */
+         {
+             jnrA             = jjnr[jidx];
+             j_coord_offsetA  = DIM*jnrA;
+             /* load j atom coordinates */
+             gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                               &jx0,&jy0,&jz0);
+             /* Calculate displacement vector */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             /* Calculate squared distance and things based on it */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rinvsq00         = gmx_fjsp_inv_v2r8(rsq00);
+             /* Load parameters for j particles */
+             vdwjidx0A        = 2*vdwtype[jnrA+0];
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* Compute parameters for interactions between i and j atoms */
+             gmx_fjsp_load_1pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
+             /* LENNARD-JONES DISPERSION/REPULSION (same expressions as in the unrolled loop) */
+             rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+             fvdw             = _fjsp_mul_v2r8(_fjsp_msub_v2r8(c12_00,rinvsix,c6_00),_fjsp_mul_v2r8(rinvsix,rinvsq00));
+             fscal            = fvdw;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8()); /* presumably keeps lane A and zeroes lane B so the absent second particle adds no force - TODO confirm */
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             
+             gmx_fjsp_decrement_fma_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fscal,dx00,dy00,dz00);
+             /* Inner loop uses 30 flops */
+         }
+         /* End of innermost loop: pass the accumulated i force to the helper together
+          * with the i particle's force slot and the shift-force slot of this list entry */
+         gmx_fjsp_update_iforce_1atom_swizzle_v2r8(fix0,fiy0,fiz0,
+                                               f+i_coord_offset,fshift+i_shift_offset);
+         /* Increment number of inner iterations (counted per j particle, not per unrolled step) */
+         inneriter                  += j_index_end - j_index_start;
+         /* Outer loop uses 6 flops */
+     }
+     /* Increment number of outer iterations */
+     outeriter        += nri;
+     /* Update outer/inner flops: 6 per i entry plus 30 per j particle, booked under eNR_NBKERNEL_VDW_F */
+     inc_nrnb(nrnb,eNR_NBKERNEL_VDW_F,outeriter*6 + inneriter*30);
+ }
index 0000000000000000000000000000000000000000,d1895e97e3dc67680fcb00e709bced18a7f56b84..d1895e97e3dc67680fcb00e709bced18a7f56b84
mode 000000,100644..100644
--- /dev/null
@@@ -1,0 -1,733 +1,733 @@@
+ /*
+  * This file is part of the GROMACS molecular simulation package.
+  *
+  * Copyright (c) 2012, by the GROMACS development team, led by
+  * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+  * others, as listed in the AUTHORS file in the top-level source
+  * directory and at http://www.gromacs.org.
+  *
+  * GROMACS is free software; you can redistribute it and/or
+  * modify it under the terms of the GNU Lesser General Public License
+  * as published by the Free Software Foundation; either version 2.1
+  * of the License, or (at your option) any later version.
+  *
+  * GROMACS is distributed in the hope that it will be useful,
+  * but WITHOUT ANY WARRANTY; without even the implied warranty of
+  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  * Lesser General Public License for more details.
+  *
+  * You should have received a copy of the GNU Lesser General Public
+  * License along with GROMACS; if not, see
+  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+  *
+  * If you want to redistribute modifications to GROMACS, please
+  * consider that scientific software is very special. Version
+  * control is crucial - bugs must be traceable. We will be happy to
+  * consider code for inclusion in the official distribution, but
+  * derived work must not be called official GROMACS. Details are found
+  * in the README & COPYING files - if they are missing, get the
+  * official version at http://www.gromacs.org.
+  *
+  * To help us fund GROMACS development, we humbly ask that you cite
+  * the research papers on the package. Check out http://www.gromacs.org.
+  */
+ /*
+  * Note: this file was generated by the GROMACS sparc64_hpc_ace_double kernel generator.
+  */
+ #ifdef HAVE_CONFIG_H
+ #include <config.h>
+ #endif
+ #include <math.h>
+ #include "../nb_kernel.h"
+ #include "types/simple.h"
+ #include "vec.h"
+ #include "nrnb.h"
+ #include "kernelutil_sparc64_hpc_ace_double.h"
+ /*
+  * Gromacs nonbonded kernel:   nb_kernel_ElecRFCut_VdwCSTab_GeomP1P1_VF_sparc64_hpc_ace_double
+  * Electrostatics interaction: ReactionField
+  * VdW interaction:            CubicSplineTable
+  * Geometry:                   Particle-Particle
+  * Calculate force/pot:        PotentialAndForce
+  *
+  * Walks the nri entries of the neighborlist. For each i particle the j
+  * neighbors jjnr[jindex[iidx]] .. jjnr[jindex[iidx+1]-1] are processed two
+  * at a time (SIMD lanes A and B), followed by a single-particle epilogue
+  * when the neighbor count is odd. Per pair it evaluates reaction-field
+  * electrostatics and table-interpolated dispersion/repulsion, masks the
+  * results with the cutoff test rsq < rcoulomb^2, and accumulates both
+  * forces and the two potential sums.
+  *
+  * Parameters, as used here:
+  *   nlist       - neighborlist; nri, iinr, jindex, jjnr, shift and gid are read.
+  *   xx          - coordinates, accessed as a flat real array with DIM reals per particle.
+  *   ff          - forces, same flat layout; passed to the force-update helpers
+  *                 for both the i particle and the j particles.
+  *   fr          - supplies shift_vec, fshift, epsfac, ic->k_rf, ic->c_rf, ntype,
+  *                 nbfp and rcoulomb.
+  *   mdatoms     - supplies per-particle charges (chargeA) and VdW type indices (typeA).
+  *   kernel_data - supplies table_vdw (data and scale) and the energygrp_elec /
+  *                 energygrp_vdw arrays that receive the potential sums.
+  *   nrnb        - flop accounting, incremented once at the end via inc_nrnb().
+  */
+ void
+ nb_kernel_ElecRFCut_VdwCSTab_GeomP1P1_VF_sparc64_hpc_ace_double
+                     (t_nblist * gmx_restrict                nlist,
+                      rvec * gmx_restrict                    xx,
+                      rvec * gmx_restrict                    ff,
+                      t_forcerec * gmx_restrict              fr,
+                      t_mdatoms * gmx_restrict               mdatoms,
+                      nb_kernel_data_t * gmx_restrict        kernel_data,
+                      t_nrnb * gmx_restrict                  nrnb)
+ {
+     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+      * just 0 for non-waters.
+      * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+      * jnr indices corresponding to data put in the two positions in the SIMD register.
+      *
+      * The declaration list below is generator boilerplate: several of these locals
+      * (for example tx, ty, tz, jidxall, gbconv, ewconv) are never referenced in
+      * this particular kernel.
+      */
+     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+     int              jnrA,jnrB;
+     int              j_coord_offsetA,j_coord_offsetB;
+     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+     real             rcutoff_scalar;
+     real             *shiftvec,*fshift,*x,*f;
+     _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+     int              vdwioffset0;
+     _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+     int              vdwjidx0A,vdwjidx0B;
+     _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+     _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+     _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+     real             *charge;
+     int              nvdwtype;
+     _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+     int              *vdwtype;
+     real             *vdwparam;
+     _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+     _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+     _fjsp_v2r8       rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF,twovfeps;
+     real             *vftab;
+     _fjsp_v2r8       itab_tmp;
+     _fjsp_v2r8       dummy_mask,cutoff_mask;
+     _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+     _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+     union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+     x                = xx[0]; /* flat view of the coordinates: particle p starts at x+DIM*p */
+     f                = ff[0]; /* flat view of the forces, indexed the same way */
+     nri              = nlist->nri;
+     iinr             = nlist->iinr;
+     jindex           = nlist->jindex;
+     jjnr             = nlist->jjnr;
+     shiftidx         = nlist->shift;
+     gid              = nlist->gid;
+     shiftvec         = fr->shift_vec[0];
+     fshift           = fr->fshift[0];
+     facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+     charge           = mdatoms->chargeA;
+     krf              = gmx_fjsp_set1_v2r8(fr->ic->k_rf);
+     krf2             = gmx_fjsp_set1_v2r8(fr->ic->k_rf*2.0);
+     crf              = gmx_fjsp_set1_v2r8(fr->ic->c_rf);
+     nvdwtype         = fr->ntype;
+     vdwparam         = fr->nbfp;
+     vdwtype          = mdatoms->typeA;
+     vftab            = kernel_data->table_vdw->data;
+     vftabscale       = gmx_fjsp_set1_v2r8(kernel_data->table_vdw->scale);
+     /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+     rcutoff_scalar   = fr->rcoulomb;
+     rcutoff          = gmx_fjsp_set1_v2r8(rcutoff_scalar);
+     rcutoff2         = _fjsp_mul_v2r8(rcutoff,rcutoff);
+     /* Avoid compiler warnings: these are otherwise only assigned inside the j loops */
+     jnrA = jnrB = 0;
+     j_coord_offsetA = 0;
+     j_coord_offsetB = 0;
+     outeriter        = 0;
+     inneriter        = 0;
+     /* Start outer loop over neighborlists */
+     for(iidx=0; iidx<nri; iidx++)
+     {
+         /* Load shift vector for this list */
+         i_shift_offset   = DIM*shiftidx[iidx];
+         /* Load limits for loop over neighbors */
+         j_index_start    = jindex[iidx];
+         j_index_end      = jindex[iidx+1];
+         /* Get outer coordinate index */
+         inr              = iinr[iidx];
+         i_coord_offset   = DIM*inr;
+         /* Load i particle coords and add shift vector */
+         gmx_fjsp_load_shift_and_1rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
+         fix0             = _fjsp_setzero_v2r8();
+         fiy0             = _fjsp_setzero_v2r8();
+         fiz0             = _fjsp_setzero_v2r8();
+         /* Load parameters for i particles: the i charge pre-multiplied by facel, and the
+          * offset of the i type's row in vdwparam (2 reals per type pair, read below as c6 and c12) */
+         iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_load1_v2r8(charge+inr+0));
+         vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+         /* Reset potential sums */
+         velecsum         = _fjsp_setzero_v2r8();
+         vvdwsum          = _fjsp_setzero_v2r8();
+         /* Start inner kernel loop: two j particles (A,B) per iteration;
+          * an odd leftover particle is handled after the loop */
+         for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+         {
+             /* Get j neighbor index, and coordinate index */
+             jnrA             = jjnr[jidx];
+             jnrB             = jjnr[jidx+1];
+             j_coord_offsetA  = DIM*jnrA;
+             j_coord_offsetB  = DIM*jnrB;
+             /* load j atom coordinates */
+             gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                               &jx0,&jy0,&jz0);
+             /* Calculate displacement vector */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             /* Calculate squared distance and things based on it */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+             rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+             /* Load parameters for j particles */
+             jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+             vdwjidx0A        = 2*vdwtype[jnrA+0];
+             vdwjidx0B        = 2*vdwtype[jnrB+0];
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2)) /* judging by the helper name, skips the work only when no lane is inside the cutoff; per-lane masking follows below */
+             {
+             r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+             /* Compute parameters for interactions between i and j atoms */
+             qq00             = _fjsp_mul_v2r8(iq0,jq0);
+             gmx_fjsp_load_2pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,
+                                          vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
+             /* Calculate table index by multiplying r with table scale and truncate to integer;
+              * vfeps is the remaining fractional part used for the interpolation */
+             rt               = _fjsp_mul_v2r8(r00,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 8; /* 8 reals per table point: offsets 0..3 are read for dispersion, 4..7 for repulsion */
+             vfconv.i[1]     *= 8;
+             /* REACTION-FIELD ELECTROSTATICS
+              * NOTE(review): presumably madd(a,b,c) is a*b+c and msub(a,b,c) is a*b-c, so that
+              * velec = qq*(krf*rsq + rinv - crf) and felec = qq*(rinv*rinvsq - 2*krf) - confirm
+              * against the intrinsic documentation. */
+             velec            = _fjsp_mul_v2r8(qq00,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq00,rinv00),crf));
+             felec            = _fjsp_mul_v2r8(qq00,_fjsp_msub_v2r8(rinv00,rinvsq00,krf2));
+             /* CUBIC SPLINE TABLE DISPERSION
+              * Each load fetches a pair of adjacent table reals for one lane; the transpose
+              * presumably regroups them so Y,F (then G,H) each hold the value for lanes A and B.
+              * With the madd reading above: Fp = F + eps*(G + eps*H), VV = Y + eps*Fp,
+              * FF = Fp + eps*(G + 2*eps*H). */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 2 );
+             H                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 2 );
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+             VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+             vvdw6            = _fjsp_mul_v2r8(c6_00,VV);
+             FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+             fvdw6            = _fjsp_mul_v2r8(c6_00,FF);
+             /* CUBIC SPLINE TABLE REPULSION (same interpolation on table offsets 4..7, scaled by c12) */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 4 );
+             F                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 4 );
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 6 );
+             H                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 6 );
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+             VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+             vvdw12           = _fjsp_mul_v2r8(c12_00,VV);
+             FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+             fvdw12           = _fjsp_mul_v2r8(c12_00,FF);
+             vvdw             = _fjsp_add_v2r8(vvdw12,vvdw6);
+             fvdw             = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_add_v2r8(fvdw6,fvdw12),_fjsp_mul_v2r8(vftabscale,rinv00)));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2); /* per-lane rsq < rcutoff^2 test, AND-ed into the energies and force below */
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             vvdw             = _fjsp_and_v2r8(vvdw,cutoff_mask);
+             vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+             fscal            = _fjsp_add_v2r8(felec,fvdw);
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force: accumulate on the i particle, then hand the same
+              * scalar and displacement to the helper that updates the two j particles */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             
+             gmx_fjsp_decrement_fma_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fscal,dx00,dy00,dz00);
+             }
+             /* Inner loop uses 75 flops */
+         }
+         if(jidx<j_index_end) /* odd neighbor count: one j particle is left, handled via the 1ptr helpers */
+         {
+             jnrA             = jjnr[jidx];
+             j_coord_offsetA  = DIM*jnrA;
+             /* load j atom coordinates */
+             gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                               &jx0,&jy0,&jz0);
+             /* Calculate displacement vector */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             /* Calculate squared distance and things based on it */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+             rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+             /* Load parameters for j particles */
+             jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
+             vdwjidx0A        = 2*vdwtype[jnrA+0];
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+             {
+             r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+             /* Compute parameters for interactions between i and j atoms */
+             qq00             = _fjsp_mul_v2r8(iq0,jq0);
+             gmx_fjsp_load_1pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
+             /* Calculate table index by multiplying r with table scale and truncate to integer */
+             rt               = _fjsp_mul_v2r8(r00,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 8;
+             vfconv.i[1]     *= 8; /* scaled for symmetry with the unrolled loop; only vfconv.i[0] is used for table loads in this branch */
+             /* REACTION-FIELD ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq00,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq00,rinv00),crf));
+             felec            = _fjsp_mul_v2r8(qq00,_fjsp_msub_v2r8(rinv00,rinvsq00,krf2));
+             /* CUBIC SPLINE TABLE DISPERSION
+              * Only lane A has a table entry; zero vectors stand in for the second lane's loads. */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 2 );
+             H                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+             VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+             vvdw6            = _fjsp_mul_v2r8(c6_00,VV);
+             FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+             fvdw6            = _fjsp_mul_v2r8(c6_00,FF);
+             /* CUBIC SPLINE TABLE REPULSION */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 4 );
+             F                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 6 );
+             H                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+             VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+             vvdw12           = _fjsp_mul_v2r8(c12_00,VV);
+             FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+             fvdw12           = _fjsp_mul_v2r8(c12_00,FF);
+             vvdw             = _fjsp_add_v2r8(vvdw12,vvdw6);
+             fvdw             = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_add_v2r8(fvdw6,fvdw12),_fjsp_mul_v2r8(vftabscale,rinv00)));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom.
+              * The unpacklo calls with a zero vector presumably keep lane A and clear lane B,
+              * so the absent second particle contributes nothing - TODO confirm. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             vvdw             = _fjsp_and_v2r8(vvdw,cutoff_mask);
+             vvdw             = _fjsp_unpacklo_v2r8(vvdw,_fjsp_setzero_v2r8());
+             vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+             fscal            = _fjsp_add_v2r8(felec,fvdw);
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             
+             gmx_fjsp_decrement_fma_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fscal,dx00,dy00,dz00);
+             }
+             /* Inner loop uses 75 flops */
+         }
+         /* End of innermost loop: pass the accumulated i force to the helper together
+          * with the i particle's force slot and the shift-force slot of this list entry */
+         gmx_fjsp_update_iforce_1atom_swizzle_v2r8(fix0,fiy0,fiz0,
+                                               f+i_coord_offset,fshift+i_shift_offset);
+         ggid                        = gid[iidx];
+         /* Update potential energies: hand this i entry's sums to the helper for energy-group slot ggid */
+         gmx_fjsp_update_1pot_v2r8(velecsum,kernel_data->energygrp_elec+ggid);
+         gmx_fjsp_update_1pot_v2r8(vvdwsum,kernel_data->energygrp_vdw+ggid);
+         /* Increment number of inner iterations (counted per j particle, not per unrolled step) */
+         inneriter                  += j_index_end - j_index_start;
+         /* Outer loop uses 9 flops */
+     }
+     /* Increment number of outer iterations */
+     outeriter        += nri;
+     /* Update outer/inner flops: 9 per i entry plus 75 per j particle, booked under eNR_NBKERNEL_ELEC_VDW_VF */
+     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_VF,outeriter*9 + inneriter*75);
+ }
+ /*
+  * Gromacs nonbonded kernel:   nb_kernel_ElecRFCut_VdwCSTab_GeomP1P1_F_sparc64_hpc_ace_double
+  * Electrostatics interaction: ReactionField
+  * VdW interaction:            CubicSplineTable
+  * Geometry:                   Particle-Particle
+  * Calculate force/pot:        Force
+  */
+ void
+ nb_kernel_ElecRFCut_VdwCSTab_GeomP1P1_F_sparc64_hpc_ace_double
+                     (t_nblist * gmx_restrict                nlist,
+                      rvec * gmx_restrict                    xx,
+                      rvec * gmx_restrict                    ff,
+                      t_forcerec * gmx_restrict              fr,
+                      t_mdatoms * gmx_restrict               mdatoms,
+                      nb_kernel_data_t * gmx_restrict        kernel_data,
+                      t_nrnb * gmx_restrict                  nrnb)
+ {
+     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+      * just 0 for non-waters.
+      * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+      * jnr indices corresponding to data put in the four positions in the SIMD register.
+      */
+     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+     int              jnrA,jnrB;
+     int              j_coord_offsetA,j_coord_offsetB;
+     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+     real             rcutoff_scalar;
+     real             *shiftvec,*fshift,*x,*f;
+     _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+     int              vdwioffset0;
+     _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+     int              vdwjidx0A,vdwjidx0B;
+     _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+     _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+     _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+     real             *charge;
+     int              nvdwtype;
+     _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+     int              *vdwtype;
+     real             *vdwparam;
+     _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+     _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+     _fjsp_v2r8       rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF,twovfeps;
+     real             *vftab;
+     _fjsp_v2r8       itab_tmp;
+     _fjsp_v2r8       dummy_mask,cutoff_mask;
+     _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+     _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+     union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+     x                = xx[0];
+     f                = ff[0];
+     nri              = nlist->nri;
+     iinr             = nlist->iinr;
+     jindex           = nlist->jindex;
+     jjnr             = nlist->jjnr;
+     shiftidx         = nlist->shift;
+     gid              = nlist->gid;
+     shiftvec         = fr->shift_vec[0];
+     fshift           = fr->fshift[0];
+     facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+     charge           = mdatoms->chargeA;
+     krf              = gmx_fjsp_set1_v2r8(fr->ic->k_rf);
+     krf2             = gmx_fjsp_set1_v2r8(fr->ic->k_rf*2.0);
+     crf              = gmx_fjsp_set1_v2r8(fr->ic->c_rf);
+     nvdwtype         = fr->ntype;
+     vdwparam         = fr->nbfp;
+     vdwtype          = mdatoms->typeA;
+     vftab            = kernel_data->table_vdw->data;
+     vftabscale       = gmx_fjsp_set1_v2r8(kernel_data->table_vdw->scale);
+     /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+     rcutoff_scalar   = fr->rcoulomb;
+     rcutoff          = gmx_fjsp_set1_v2r8(rcutoff_scalar);
+     rcutoff2         = _fjsp_mul_v2r8(rcutoff,rcutoff);
+     /* Avoid stupid compiler warnings */
+     jnrA = jnrB = 0;
+     j_coord_offsetA = 0;
+     j_coord_offsetB = 0;
+     outeriter        = 0;
+     inneriter        = 0;
+     /* Start outer loop over neighborlists */
+     for(iidx=0; iidx<nri; iidx++)
+     {
+         /* Load shift vector for this list */
+         i_shift_offset   = DIM*shiftidx[iidx];
+         /* Load limits for loop over neighbors */
+         j_index_start    = jindex[iidx];
+         j_index_end      = jindex[iidx+1];
+         /* Get outer coordinate index */
+         inr              = iinr[iidx];
+         i_coord_offset   = DIM*inr;
+         /* Load i particle coords and add shift vector */
+         gmx_fjsp_load_shift_and_1rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
+         fix0             = _fjsp_setzero_v2r8();
+         fiy0             = _fjsp_setzero_v2r8();
+         fiz0             = _fjsp_setzero_v2r8();
+         /* Load parameters for i particles */
+         iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_load1_v2r8(charge+inr+0));
+         vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+         /* Start inner kernel loop */
+         for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+         {
+             /* Get j neighbor index, and coordinate index */
+             jnrA             = jjnr[jidx];
+             jnrB             = jjnr[jidx+1];
+             j_coord_offsetA  = DIM*jnrA;
+             j_coord_offsetB  = DIM*jnrB;
+             /* load j atom coordinates */
+             gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                               &jx0,&jy0,&jz0);
+             /* Calculate displacement vector */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             /* Calculate squared distance and things based on it */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+             rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+             /* Load parameters for j particles */
+             jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+             vdwjidx0A        = 2*vdwtype[jnrA+0];
+             vdwjidx0B        = 2*vdwtype[jnrB+0];
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+             {
+             r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+             /* Compute parameters for interactions between i and j atoms */
+             qq00             = _fjsp_mul_v2r8(iq0,jq0);
+             gmx_fjsp_load_2pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,
+                                          vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
+             /* Calculate table index by multiplying r with table scale and truncate to integer */
+             rt               = _fjsp_mul_v2r8(r00,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 8;
+             vfconv.i[1]     *= 8;
+             /* REACTION-FIELD ELECTROSTATICS */
+             felec            = _fjsp_mul_v2r8(qq00,_fjsp_msub_v2r8(rinv00,rinvsq00,krf2));
+             /* CUBIC SPLINE TABLE DISPERSION */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 2 );
+             H                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 2 );
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+             FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+             fvdw6            = _fjsp_mul_v2r8(c6_00,FF);
+             /* CUBIC SPLINE TABLE REPULSION */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 4 );
+             F                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 4 );
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 6 );
+             H                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 6 );
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+             FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+             fvdw12           = _fjsp_mul_v2r8(c12_00,FF);
+             fvdw             = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_add_v2r8(fvdw6,fvdw12),_fjsp_mul_v2r8(vftabscale,rinv00)));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+             fscal            = _fjsp_add_v2r8(felec,fvdw);
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             
+             gmx_fjsp_decrement_fma_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fscal,dx00,dy00,dz00);
+             }
+             /* Inner loop uses 60 flops */
+         }
+         if(jidx<j_index_end)
+         {
+             jnrA             = jjnr[jidx];
+             j_coord_offsetA  = DIM*jnrA;
+             /* load j atom coordinates */
+             gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                               &jx0,&jy0,&jz0);
+             /* Calculate displacement vector */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             /* Calculate squared distance and things based on it */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+             rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+             /* Load parameters for j particles */
+             jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
+             vdwjidx0A        = 2*vdwtype[jnrA+0];
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+             {
+             r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+             /* Compute parameters for interactions between i and j atoms */
+             qq00             = _fjsp_mul_v2r8(iq0,jq0);
+             gmx_fjsp_load_1pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
+             /* Calculate table index by multiplying r with table scale and truncate to integer */
+             rt               = _fjsp_mul_v2r8(r00,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 8;
+             vfconv.i[1]     *= 8;
+             /* REACTION-FIELD ELECTROSTATICS */
+             felec            = _fjsp_mul_v2r8(qq00,_fjsp_msub_v2r8(rinv00,rinvsq00,krf2));
+             /* CUBIC SPLINE TABLE DISPERSION */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 2 );
+             H                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+             FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+             fvdw6            = _fjsp_mul_v2r8(c6_00,FF);
+             /* CUBIC SPLINE TABLE REPULSION */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 4 );
+             F                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 6 );
+             H                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+             FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+             fvdw12           = _fjsp_mul_v2r8(c12_00,FF);
+             fvdw             = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_add_v2r8(fvdw6,fvdw12),_fjsp_mul_v2r8(vftabscale,rinv00)));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+             fscal            = _fjsp_add_v2r8(felec,fvdw);
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             
+             gmx_fjsp_decrement_fma_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fscal,dx00,dy00,dz00);
+             }
+             /* Inner loop uses 60 flops */
+         }
+         /* End of innermost loop */
+         gmx_fjsp_update_iforce_1atom_swizzle_v2r8(fix0,fiy0,fiz0,
+                                               f+i_coord_offset,fshift+i_shift_offset);
+         /* Increment number of inner iterations */
+         inneriter                  += j_index_end - j_index_start;
+         /* Outer loop uses 7 flops */
+     }
+     /* Increment number of outer iterations */
+     outeriter        += nri;
+     /* Update outer/inner flops */
+     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_F,outeriter*7 + inneriter*60);
+ }
index 0000000000000000000000000000000000000000,2aa77dadeef1ca1d0653a3fe9b2a81eab46484ab..2aa77dadeef1ca1d0653a3fe9b2a81eab46484ab
mode 000000,100644..100644
--- /dev/null
@@@ -1,0 -1,1115 +1,1115 @@@
+ /*
+  * This file is part of the GROMACS molecular simulation package.
+  *
+  * Copyright (c) 2012, by the GROMACS development team, led by
+  * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+  * others, as listed in the AUTHORS file in the top-level source
+  * directory and at http://www.gromacs.org.
+  *
+  * GROMACS is free software; you can redistribute it and/or
+  * modify it under the terms of the GNU Lesser General Public License
+  * as published by the Free Software Foundation; either version 2.1
+  * of the License, or (at your option) any later version.
+  *
+  * GROMACS is distributed in the hope that it will be useful,
+  * but WITHOUT ANY WARRANTY; without even the implied warranty of
+  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  * Lesser General Public License for more details.
+  *
+  * You should have received a copy of the GNU Lesser General Public
+  * License along with GROMACS; if not, see
+  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+  *
+  * If you want to redistribute modifications to GROMACS, please
+  * consider that scientific software is very special. Version
+  * control is crucial - bugs must be traceable. We will be happy to
+  * consider code for inclusion in the official distribution, but
+  * derived work must not be called official GROMACS. Details are found
+  * in the README & COPYING files - if they are missing, get the
+  * official version at http://www.gromacs.org.
+  *
+  * To help us fund GROMACS development, we humbly ask that you cite
+  * the research papers on the package. Check out http://www.gromacs.org.
+  */
+ /*
+  * Note: this file was generated by the GROMACS sparc64_hpc_ace_double kernel generator.
+  */
+ #ifdef HAVE_CONFIG_H
+ #include <config.h>
+ #endif
+ #include <math.h>
+ #include "../nb_kernel.h"
+ #include "types/simple.h"
+ #include "vec.h"
+ #include "nrnb.h"
+ #include "kernelutil_sparc64_hpc_ace_double.h"
+ /*
+  * Gromacs nonbonded kernel:   nb_kernel_ElecRFCut_VdwCSTab_GeomW3P1_VF_sparc64_hpc_ace_double
+  * Electrostatics interaction: ReactionField
+  * VdW interaction:            CubicSplineTable
+  * Geometry:                   Water3-Particle
+  * Calculate force/pot:        PotentialAndForce
+  */
+ void
+ nb_kernel_ElecRFCut_VdwCSTab_GeomW3P1_VF_sparc64_hpc_ace_double
+                     (t_nblist * gmx_restrict                nlist,
+                      rvec * gmx_restrict                    xx,
+                      rvec * gmx_restrict                    ff,
+                      t_forcerec * gmx_restrict              fr,
+                      t_mdatoms * gmx_restrict               mdatoms,
+                      nb_kernel_data_t * gmx_restrict        kernel_data,
+                      t_nrnb * gmx_restrict                  nrnb)
+ {
+     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+      * just 0 for non-waters.
+      * Suffixes A,B refer to j loop unrolling done with double precision SIMD, i.e. the two different
+      * jnr indices corresponding to data put in the two positions in the SIMD register.
+      */
+     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+     int              jnrA,jnrB;
+     int              j_coord_offsetA,j_coord_offsetB;
+     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+     real             rcutoff_scalar;
+     real             *shiftvec,*fshift,*x,*f;
+     _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+     int              vdwioffset0;
+     _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+     int              vdwioffset1;
+     _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+     int              vdwioffset2;
+     _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+     int              vdwjidx0A,vdwjidx0B;
+     _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+     _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+     _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+     _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+     _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+     real             *charge;
+     int              nvdwtype;
+     _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+     int              *vdwtype;
+     real             *vdwparam;
+     _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+     _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+     _fjsp_v2r8       rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF,twovfeps;
+     real             *vftab;
+     _fjsp_v2r8       itab_tmp;
+     _fjsp_v2r8       dummy_mask,cutoff_mask;
+     _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+     _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+     union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+     x                = xx[0];
+     f                = ff[0];
+     nri              = nlist->nri;
+     iinr             = nlist->iinr;
+     jindex           = nlist->jindex;
+     jjnr             = nlist->jjnr;
+     shiftidx         = nlist->shift;
+     gid              = nlist->gid;
+     shiftvec         = fr->shift_vec[0];
+     fshift           = fr->fshift[0];
+     facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+     charge           = mdatoms->chargeA;
+     krf              = gmx_fjsp_set1_v2r8(fr->ic->k_rf);
+     krf2             = gmx_fjsp_set1_v2r8(fr->ic->k_rf*2.0);
+     crf              = gmx_fjsp_set1_v2r8(fr->ic->c_rf);
+     nvdwtype         = fr->ntype;
+     vdwparam         = fr->nbfp;
+     vdwtype          = mdatoms->typeA;
+     vftab            = kernel_data->table_vdw->data;
+     vftabscale       = gmx_fjsp_set1_v2r8(kernel_data->table_vdw->scale);
+     /* Setup water-specific parameters */
+     inr              = nlist->iinr[0];
+     iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+0]));
+     iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+     iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+     vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+     /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+     rcutoff_scalar   = fr->rcoulomb;
+     rcutoff          = gmx_fjsp_set1_v2r8(rcutoff_scalar);
+     rcutoff2         = _fjsp_mul_v2r8(rcutoff,rcutoff);
+     /* Avoid stupid compiler warnings */
+     jnrA = jnrB = 0;
+     j_coord_offsetA = 0;
+     j_coord_offsetB = 0;
+     outeriter        = 0;
+     inneriter        = 0;
+     /* Start outer loop over neighborlists */
+     for(iidx=0; iidx<nri; iidx++)
+     {
+         /* Load shift vector for this list */
+         i_shift_offset   = DIM*shiftidx[iidx];
+         /* Load limits for loop over neighbors */
+         j_index_start    = jindex[iidx];
+         j_index_end      = jindex[iidx+1];
+         /* Get outer coordinate index */
+         inr              = iinr[iidx];
+         i_coord_offset   = DIM*inr;
+         /* Load i particle coords and add shift vector */
+         gmx_fjsp_load_shift_and_3rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                  &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+         fix0             = _fjsp_setzero_v2r8();
+         fiy0             = _fjsp_setzero_v2r8();
+         fiz0             = _fjsp_setzero_v2r8();
+         fix1             = _fjsp_setzero_v2r8();
+         fiy1             = _fjsp_setzero_v2r8();
+         fiz1             = _fjsp_setzero_v2r8();
+         fix2             = _fjsp_setzero_v2r8();
+         fiy2             = _fjsp_setzero_v2r8();
+         fiz2             = _fjsp_setzero_v2r8();
+         /* Reset potential sums */
+         velecsum         = _fjsp_setzero_v2r8();
+         vvdwsum          = _fjsp_setzero_v2r8();
+         /* Start inner kernel loop */
+         for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+         {
+             /* Get j neighbor index, and coordinate index */
+             jnrA             = jjnr[jidx];
+             jnrB             = jjnr[jidx+1];
+             j_coord_offsetA  = DIM*jnrA;
+             j_coord_offsetB  = DIM*jnrB;
+             /* load j atom coordinates */
+             gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                               &jx0,&jy0,&jz0);
+             /* Calculate displacement vector */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             dx10             = _fjsp_sub_v2r8(ix1,jx0);
+             dy10             = _fjsp_sub_v2r8(iy1,jy0);
+             dz10             = _fjsp_sub_v2r8(iz1,jz0);
+             dx20             = _fjsp_sub_v2r8(ix2,jx0);
+             dy20             = _fjsp_sub_v2r8(iy2,jy0);
+             dz20             = _fjsp_sub_v2r8(iz2,jz0);
+             /* Calculate squared distance and things based on it */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+             rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+             rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+             rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+             rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+             rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+             rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+             rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+             /* Load parameters for j particles */
+             jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+             vdwjidx0A        = 2*vdwtype[jnrA+0];
+             vdwjidx0B        = 2*vdwtype[jnrB+0];
+             fjx0             = _fjsp_setzero_v2r8();
+             fjy0             = _fjsp_setzero_v2r8();
+             fjz0             = _fjsp_setzero_v2r8();
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+             {
+             r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+             /* Compute parameters for interactions between i and j atoms */
+             qq00             = _fjsp_mul_v2r8(iq0,jq0);
+             gmx_fjsp_load_2pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,
+                                          vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
+             /* Calculate table index by multiplying r with table scale and truncate to integer */
+             rt               = _fjsp_mul_v2r8(r00,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 8;
+             vfconv.i[1]     *= 8;
+             /* REACTION-FIELD ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq00,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq00,rinv00),crf));
+             felec            = _fjsp_mul_v2r8(qq00,_fjsp_msub_v2r8(rinv00,rinvsq00,krf2));
+             /* CUBIC SPLINE TABLE DISPERSION */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 2 );
+             H                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 2 );
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+             VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+             vvdw6            = _fjsp_mul_v2r8(c6_00,VV);
+             FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+             fvdw6            = _fjsp_mul_v2r8(c6_00,FF);
+             /* CUBIC SPLINE TABLE REPULSION */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 4 );
+             F                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 4 );
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 6 );
+             H                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 6 );
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+             VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+             vvdw12           = _fjsp_mul_v2r8(c12_00,VV);
+             FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+             fvdw12           = _fjsp_mul_v2r8(c12_00,FF);
+             vvdw             = _fjsp_add_v2r8(vvdw12,vvdw6);
+             fvdw             = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_add_v2r8(fvdw6,fvdw12),_fjsp_mul_v2r8(vftabscale,rinv00)));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             vvdw             = _fjsp_and_v2r8(vvdw,cutoff_mask);
+             vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+             fscal            = _fjsp_add_v2r8(felec,fvdw);
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             
+             fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq10,rcutoff2))
+             {
+             /* Compute parameters for interactions between i and j atoms */
+             qq10             = _fjsp_mul_v2r8(iq1,jq0);
+             /* REACTION-FIELD ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq10,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq10,rinv10),crf));
+             felec            = _fjsp_mul_v2r8(qq10,_fjsp_msub_v2r8(rinv10,rinvsq10,krf2));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq10,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+             
+             fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq20,rcutoff2))
+             {
+             /* Compute parameters for interactions between i and j atoms */
+             qq20             = _fjsp_mul_v2r8(iq2,jq0);
+             /* REACTION-FIELD ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq20,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq20,rinv20),crf));
+             felec            = _fjsp_mul_v2r8(qq20,_fjsp_msub_v2r8(rinv20,rinvsq20,krf2));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq20,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+             
+             fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+             }
+             gmx_fjsp_decrement_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0);
+             /* Inner loop uses 156 flops */
+         }
+         if(jidx<j_index_end)
+         {
+             jnrA             = jjnr[jidx];
+             j_coord_offsetA  = DIM*jnrA;
+             /* load j atom coordinates */
+             gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                               &jx0,&jy0,&jz0);
+             /* Calculate displacement vector */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             dx10             = _fjsp_sub_v2r8(ix1,jx0);
+             dy10             = _fjsp_sub_v2r8(iy1,jy0);
+             dz10             = _fjsp_sub_v2r8(iz1,jz0);
+             dx20             = _fjsp_sub_v2r8(ix2,jx0);
+             dy20             = _fjsp_sub_v2r8(iy2,jy0);
+             dz20             = _fjsp_sub_v2r8(iz2,jz0);
+             /* Calculate squared distance and things based on it */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+             rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+             rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+             rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+             rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+             rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+             rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+             rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+             /* Load parameters for j particles */
+             jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
+             vdwjidx0A        = 2*vdwtype[jnrA+0];
+             fjx0             = _fjsp_setzero_v2r8();
+             fjy0             = _fjsp_setzero_v2r8();
+             fjz0             = _fjsp_setzero_v2r8();
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+             {
+             r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+             /* Compute parameters for interactions between i and j atoms */
+             qq00             = _fjsp_mul_v2r8(iq0,jq0);
+             gmx_fjsp_load_1pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
+             /* Calculate table index by multiplying r with table scale and truncate to integer */
+             rt               = _fjsp_mul_v2r8(r00,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 8;
+             vfconv.i[1]     *= 8;
+             /* REACTION-FIELD ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq00,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq00,rinv00),crf));
+             felec            = _fjsp_mul_v2r8(qq00,_fjsp_msub_v2r8(rinv00,rinvsq00,krf2));
+             /* CUBIC SPLINE TABLE DISPERSION */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 2 );
+             H                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+             VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+             vvdw6            = _fjsp_mul_v2r8(c6_00,VV);
+             FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+             fvdw6            = _fjsp_mul_v2r8(c6_00,FF);
+             /* CUBIC SPLINE TABLE REPULSION */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 4 );
+             F                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 6 );
+             H                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+             VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+             vvdw12           = _fjsp_mul_v2r8(c12_00,VV);
+             FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+             fvdw12           = _fjsp_mul_v2r8(c12_00,FF);
+             vvdw             = _fjsp_add_v2r8(vvdw12,vvdw6);
+             fvdw             = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_add_v2r8(fvdw6,fvdw12),_fjsp_mul_v2r8(vftabscale,rinv00)));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             vvdw             = _fjsp_and_v2r8(vvdw,cutoff_mask);
+             vvdw             = _fjsp_unpacklo_v2r8(vvdw,_fjsp_setzero_v2r8());
+             vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+             fscal            = _fjsp_add_v2r8(felec,fvdw);
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             
+             fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq10,rcutoff2))
+             {
+             /* Compute parameters for interactions between i and j atoms */
+             qq10             = _fjsp_mul_v2r8(iq1,jq0);
+             /* REACTION-FIELD ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq10,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq10,rinv10),crf));
+             felec            = _fjsp_mul_v2r8(qq10,_fjsp_msub_v2r8(rinv10,rinvsq10,krf2));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq10,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+             
+             fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq20,rcutoff2))
+             {
+             /* Compute parameters for interactions between i and j atoms */
+             qq20             = _fjsp_mul_v2r8(iq2,jq0);
+             /* REACTION-FIELD ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq20,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq20,rinv20),crf));
+             felec            = _fjsp_mul_v2r8(qq20,_fjsp_msub_v2r8(rinv20,rinvsq20,krf2));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq20,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+             
+             fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+             }
+             gmx_fjsp_decrement_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0);
+             /* Inner loop uses 156 flops */
+         }
+         /* End of innermost loop */
+         gmx_fjsp_update_iforce_3atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                               f+i_coord_offset,fshift+i_shift_offset);
+         ggid                        = gid[iidx];
+         /* Update potential energies */
+         gmx_fjsp_update_1pot_v2r8(velecsum,kernel_data->energygrp_elec+ggid);
+         gmx_fjsp_update_1pot_v2r8(vvdwsum,kernel_data->energygrp_vdw+ggid);
+         /* Increment number of inner iterations */
+         inneriter                  += j_index_end - j_index_start;
+         /* Outer loop uses 20 flops */
+     }
+     /* Increment number of outer iterations */
+     outeriter        += nri;
+     /* Update outer/inner flops */
+     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3_VF,outeriter*20 + inneriter*156);
+ }
+ /*
+  * Gromacs nonbonded kernel:   nb_kernel_ElecRFCut_VdwCSTab_GeomW3P1_F_sparc64_hpc_ace_double
+  * Electrostatics interaction: ReactionField
+  * VdW interaction:            CubicSplineTable
+  * Geometry:                   Water3-Particle
+  * Calculate force/pot:        Force
+  *
+  * Force-only kernel. For every entry of the neighbor list the three atoms
+  * starting at iinr[iidx] are used as the i particles; the j particles
+  * jjnr[jindex[iidx]] .. jjnr[jindex[iidx+1]-1] are processed two per
+  * iteration, with a separate tail block for a single left-over j particle.
+  * i atom 0 interacts through reaction-field electrostatics plus tabulated
+  * dispersion/repulsion; i atoms 1 and 2 through reaction-field only.
+  * Every pair contribution is masked with rsq < rcutoff^2, where the cutoff
+  * is taken from fr->rcoulomb. No potential energies are accumulated here.
+  *
+  * Parameters:
+  *   nlist       - neighbor list; nri, iinr, jindex, jjnr, shift and gid are read
+  *   xx          - coordinates, accessed as a flat real array through xx[0]
+  *   ff          - forces, updated as a flat real array through ff[0]
+  *   fr          - supplies shift_vec, fshift, epsfac, ic->k_rf, ic->c_rf,
+  *                 ntype, nbfp and rcoulomb
+  *   mdatoms     - supplies chargeA and typeA
+  *   kernel_data - supplies table_vdw (data and scale)
+  *   nrnb        - flop accounting, updated once at the end through inc_nrnb()
+  */
+ void
+ nb_kernel_ElecRFCut_VdwCSTab_GeomW3P1_F_sparc64_hpc_ace_double
+                     (t_nblist * gmx_restrict                nlist,
+                      rvec * gmx_restrict                    xx,
+                      rvec * gmx_restrict                    ff,
+                      t_forcerec * gmx_restrict              fr,
+                      t_mdatoms * gmx_restrict               mdatoms,
+                      nb_kernel_data_t * gmx_restrict        kernel_data,
+                      t_nrnb * gmx_restrict                  nrnb)
+ {
+     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+      * just 0 for non-waters.
+      * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+      * jnr indices corresponding to data put in the two positions in the SIMD register.
+      *
+      * Several of the declarations below are never referenced in this function
+      * (for example tx/ty/tz, jidxall, isai0-2, isaj0, dummy_mask, gbconv, ewconv);
+      * the file header states that the code is generated, so they appear to be
+      * generator boilerplate.
+      */
+     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+     int              jnrA,jnrB;
+     int              j_coord_offsetA,j_coord_offsetB;
+     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+     real             rcutoff_scalar;
+     real             *shiftvec,*fshift,*x,*f;
+     _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+     int              vdwioffset0;
+     _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+     int              vdwioffset1;
+     _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+     int              vdwioffset2;
+     _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+     int              vdwjidx0A,vdwjidx0B;
+     _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+     _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+     _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+     _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+     _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+     real             *charge;
+     int              nvdwtype;
+     _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+     int              *vdwtype;
+     real             *vdwparam;
+     _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+     _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+     _fjsp_v2r8       rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF,twovfeps;
+     real             *vftab;
+     _fjsp_v2r8       itab_tmp;
+     _fjsp_v2r8       dummy_mask,cutoff_mask;
+     _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+     _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+     union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv; /* vfconv: integer view of the two table indices */
+     x                = xx[0];
+     f                = ff[0];
+     nri              = nlist->nri;
+     iinr             = nlist->iinr;
+     jindex           = nlist->jindex;
+     jjnr             = nlist->jjnr;
+     shiftidx         = nlist->shift;
+     gid              = nlist->gid;
+     shiftvec         = fr->shift_vec[0];
+     fshift           = fr->fshift[0];
+     facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+     charge           = mdatoms->chargeA;
+     krf              = gmx_fjsp_set1_v2r8(fr->ic->k_rf);
+     krf2             = gmx_fjsp_set1_v2r8(fr->ic->k_rf*2.0); /* 2*k_rf, used in the force expression */
+     crf              = gmx_fjsp_set1_v2r8(fr->ic->c_rf); /* set but not read again in this force-only kernel */
+     nvdwtype         = fr->ntype;
+     vdwparam         = fr->nbfp;
+     vdwtype          = mdatoms->typeA;
+     vftab            = kernel_data->table_vdw->data;
+     vftabscale       = gmx_fjsp_set1_v2r8(kernel_data->table_vdw->scale);
+     /* Setup water-specific parameters.
+      * The charges of the three i atoms (pre-multiplied by facel) and the VdW
+      * offset of i atom 0 are read once, from the first list entry iinr[0],
+      * and reused for every entry of the outer loop.
+      * NOTE(review): this relies on all i entries having the same charges and
+      * atom-0 type, and on nri > 0 - confirm against the caller.
+      */
+     inr              = nlist->iinr[0];
+     iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+0]));
+     iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+     iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+     vdwioffset0      = 2*nvdwtype*vdwtype[inr+0]; /* offset into vdwparam for i atom 0; presumably 2 reals (c6,c12) per type pair */
+     /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+     rcutoff_scalar   = fr->rcoulomb;
+     rcutoff          = gmx_fjsp_set1_v2r8(rcutoff_scalar);
+     rcutoff2         = _fjsp_mul_v2r8(rcutoff,rcutoff);
+     /* Avoid stupid compiler warnings */
+     jnrA = jnrB = 0;
+     j_coord_offsetA = 0;
+     j_coord_offsetB = 0;
+     outeriter        = 0;
+     inneriter        = 0;
+     /* Start outer loop over neighborlists */
+     for(iidx=0; iidx<nri; iidx++)
+     {
+         /* Load shift vector for this list */
+         i_shift_offset   = DIM*shiftidx[iidx];
+         /* Load limits for loop over neighbors */
+         j_index_start    = jindex[iidx];
+         j_index_end      = jindex[iidx+1];
+         /* Get outer coordinate index */
+         inr              = iinr[iidx];
+         i_coord_offset   = DIM*inr;
+         /* Load i particle coords and add shift vector */
+         gmx_fjsp_load_shift_and_3rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                  &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+         /* Per-entry force accumulators for the three i atoms start at zero */
+         fix0             = _fjsp_setzero_v2r8();
+         fiy0             = _fjsp_setzero_v2r8();
+         fiz0             = _fjsp_setzero_v2r8();
+         fix1             = _fjsp_setzero_v2r8();
+         fiy1             = _fjsp_setzero_v2r8();
+         fiz1             = _fjsp_setzero_v2r8();
+         fix2             = _fjsp_setzero_v2r8();
+         fiy2             = _fjsp_setzero_v2r8();
+         fiz2             = _fjsp_setzero_v2r8();
+         /* Start inner kernel loop.
+          * Two j particles (A and B) are handled per iteration; the loop stops
+          * before a possible single left-over particle, which is treated in
+          * the block following the loop.
+          */
+         for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+         {
+             /* Get j neighbor index, and coordinate index */
+             jnrA             = jjnr[jidx];
+             jnrB             = jjnr[jidx+1];
+             j_coord_offsetA  = DIM*jnrA;
+             j_coord_offsetB  = DIM*jnrB;
+             /* load j atom coordinates */
+             gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                               &jx0,&jy0,&jz0);
+             /* Calculate displacement vector */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             dx10             = _fjsp_sub_v2r8(ix1,jx0);
+             dy10             = _fjsp_sub_v2r8(iy1,jy0);
+             dz10             = _fjsp_sub_v2r8(iz1,jz0);
+             dx20             = _fjsp_sub_v2r8(ix2,jx0);
+             dy20             = _fjsp_sub_v2r8(iy2,jy0);
+             dz20             = _fjsp_sub_v2r8(iz2,jz0);
+             /* Calculate squared distance and things based on it */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+             rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+             rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+             rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+             rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+             rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+             rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+             rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+             /* Load parameters for j particles */
+             jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+             vdwjidx0A        = 2*vdwtype[jnrA+0];
+             vdwjidx0B        = 2*vdwtype[jnrB+0];
+             fjx0             = _fjsp_setzero_v2r8(); /* j force accumulator, shared by the three pair blocks below */
+             fjy0             = _fjsp_setzero_v2r8();
+             fjz0             = _fjsp_setzero_v2r8();
+             /**************************
+              * CALCULATE INTERACTIONS *
+              *
+              * Pair (i atom 0, j): reaction-field electrostatics plus
+              * tabulated VdW. The block is entered based on
+              * gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2); lanes outside the
+              * cutoff are removed later through cutoff_mask.
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+             {
+             r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+             /* Compute parameters for interactions between i and j atoms */
+             qq00             = _fjsp_mul_v2r8(iq0,jq0);
+             gmx_fjsp_load_2pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,
+                                          vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
+             /* Calculate table index by multiplying r with table scale and truncate to integer */
+             rt               = _fjsp_mul_v2r8(r00,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp)); /* fractional part of rt */
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 8; /* 8 reals per table point: +0..+3 are read for dispersion, +4..+7 for repulsion */
+             vfconv.i[1]     *= 8;
+             /* REACTION-FIELD ELECTROSTATICS
+              * felec = qq*(rinv*rinvsq - 2*k_rf), assuming
+              * _fjsp_msub_v2r8(a,b,c) computes a*b-c.
+              */
+             felec            = _fjsp_mul_v2r8(qq00,_fjsp_msub_v2r8(rinv00,rinvsq00,krf2));
+             /* CUBIC SPLINE TABLE DISPERSION
+              * Two table reals are loaded per j particle into Y/F and G/H;
+              * GMX_FJSP_TRANSPOSE2_V2R8 presumably regroups them so that each
+              * register holds one coefficient for both j particles.
+              * Assuming _fjsp_madd_v2r8(a,b,c) computes a*b+c:
+              *   Fp = F + eps*(G + eps*H)
+              *   FF = Fp + eps*(G + 2*eps*H)
+              */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 2 );
+             H                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 2 );
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+             FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+             fvdw6            = _fjsp_mul_v2r8(c6_00,FF);
+             /* CUBIC SPLINE TABLE REPULSION
+              * Same interpolation with the table entries at offsets +4..+7.
+              * The combined VdW scalar force is then
+              *   fvdw = -(fvdw6 + fvdw12) * vftabscale * rinv00
+              */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 4 );
+             F                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 4 );
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 6 );
+             H                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 6 );
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+             FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+             fvdw12           = _fjsp_mul_v2r8(c12_00,FF);
+             fvdw             = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_add_v2r8(fvdw6,fvdw12),_fjsp_mul_v2r8(vftabscale,rinv00)));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+             fscal            = _fjsp_add_v2r8(felec,fvdw);
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask); /* drop lanes with rsq >= rcutoff^2 */
+             /* Update vectorial force.
+              * The same dx*fscal term is accumulated for the i atom and for
+              * the j particles; fjx0/fjy0/fjz0 are handed to the
+              * "decrement" helper after all three pair blocks.
+              */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             
+             fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              *
+              * Pair (i atom 1, j): reaction-field electrostatics only.
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq10,rcutoff2))
+             {
+             /* Compute parameters for interactions between i and j atoms */
+             qq10             = _fjsp_mul_v2r8(iq1,jq0);
+             /* REACTION-FIELD ELECTROSTATICS */
+             felec            = _fjsp_mul_v2r8(qq10,_fjsp_msub_v2r8(rinv10,rinvsq10,krf2));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq10,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+             
+             fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              *
+              * Pair (i atom 2, j): reaction-field electrostatics only.
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq20,rcutoff2))
+             {
+             /* Compute parameters for interactions between i and j atoms */
+             qq20             = _fjsp_mul_v2r8(iq2,jq0);
+             /* REACTION-FIELD ELECTROSTATICS */
+             felec            = _fjsp_mul_v2r8(qq20,_fjsp_msub_v2r8(rinv20,rinvsq20,krf2));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq20,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+             
+             fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+             }
+             gmx_fjsp_decrement_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0);
+             /* Inner loop uses 129 flops */
+         }
+         if(jidx<j_index_end) /* one j particle left over: only the first SIMD position carries real data */
+         {
+             jnrA             = jjnr[jidx];
+             j_coord_offsetA  = DIM*jnrA;
+             /* load j atom coordinates */
+             gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                               &jx0,&jy0,&jz0);
+             /* Calculate displacement vector */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             dx10             = _fjsp_sub_v2r8(ix1,jx0);
+             dy10             = _fjsp_sub_v2r8(iy1,jy0);
+             dz10             = _fjsp_sub_v2r8(iz1,jz0);
+             dx20             = _fjsp_sub_v2r8(ix2,jx0);
+             dy20             = _fjsp_sub_v2r8(iy2,jy0);
+             dz20             = _fjsp_sub_v2r8(iz2,jz0);
+             /* Calculate squared distance and things based on it */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+             rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+             rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+             rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+             rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+             rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+             rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+             rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+             /* Load parameters for j particles */
+             jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
+             vdwjidx0A        = 2*vdwtype[jnrA+0];
+             fjx0             = _fjsp_setzero_v2r8();
+             fjy0             = _fjsp_setzero_v2r8();
+             fjz0             = _fjsp_setzero_v2r8();
+             /**************************
+              * CALCULATE INTERACTIONS *
+              *
+              * Pair (i atom 0, single j): reaction-field electrostatics
+              * plus tabulated VdW, as in the unrolled loop above.
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+             {
+             r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+             /* Compute parameters for interactions between i and j atoms */
+             qq00             = _fjsp_mul_v2r8(iq0,jq0);
+             gmx_fjsp_load_1pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
+             /* Calculate table index by multiplying r with table scale and truncate to integer */
+             rt               = _fjsp_mul_v2r8(r00,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 8;
+             vfconv.i[1]     *= 8; /* scaled like i[0], but only vfconv.i[0] is used for the table loads below */
+             /* REACTION-FIELD ELECTROSTATICS */
+             felec            = _fjsp_mul_v2r8(qq00,_fjsp_msub_v2r8(rinv00,rinvsq00,krf2));
+             /* CUBIC SPLINE TABLE DISPERSION
+              * Only the table point of the single j particle is loaded;
+              * the registers that held the second particle's data in the
+              * unrolled loop (F and H before the transpose) are zero here.
+              */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 2 );
+             H                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+             FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+             fvdw6            = _fjsp_mul_v2r8(c6_00,FF);
+             /* CUBIC SPLINE TABLE REPULSION */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 4 );
+             F                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 6 );
+             H                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+             FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+             fvdw12           = _fjsp_mul_v2r8(c12_00,FF);
+             fvdw             = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_add_v2r8(fvdw6,fvdw12),_fjsp_mul_v2r8(vftabscale,rinv00)));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+             fscal            = _fjsp_add_v2r8(felec,fvdw);
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8()); /* presumably keeps the first element and zeroes the second (no second j particle) */
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             
+             fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              *
+              * Pair (i atom 1, single j): reaction-field only.
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq10,rcutoff2))
+             {
+             /* Compute parameters for interactions between i and j atoms */
+             qq10             = _fjsp_mul_v2r8(iq1,jq0);
+             /* REACTION-FIELD ELECTROSTATICS */
+             felec            = _fjsp_mul_v2r8(qq10,_fjsp_msub_v2r8(rinv10,rinvsq10,krf2));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq10,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+             
+             fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              *
+              * Pair (i atom 2, single j): reaction-field only.
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq20,rcutoff2))
+             {
+             /* Compute parameters for interactions between i and j atoms */
+             qq20             = _fjsp_mul_v2r8(iq2,jq0);
+             /* REACTION-FIELD ELECTROSTATICS */
+             felec            = _fjsp_mul_v2r8(qq20,_fjsp_msub_v2r8(rinv20,rinvsq20,krf2));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq20,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+             
+             fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+             }
+             gmx_fjsp_decrement_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0);
+             /* Inner loop uses 129 flops */
+         }
+         /* End of innermost loop.
+          * The accumulated forces of the three i atoms are passed on together
+          * with the force and shift-force destinations of this list entry.
+          */
+         gmx_fjsp_update_iforce_3atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                               f+i_coord_offset,fshift+i_shift_offset);
+         /* Increment number of inner iterations */
+         inneriter                  += j_index_end - j_index_start;
+         /* Outer loop uses 18 flops */
+     }
+     /* Increment number of outer iterations */
+     outeriter        += nri;
+     /* Update outer/inner flops */
+     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3_F,outeriter*18 + inneriter*129);
+ }
index 0000000000000000000000000000000000000000,58ab23f822c4dc60a6812b5f9615eb4e76177c20..58ab23f822c4dc60a6812b5f9615eb4e76177c20
mode 000000,100644..100644
--- /dev/null
@@@ -1,0 -1,2013 +1,2013 @@@
+ /*
+  * This file is part of the GROMACS molecular simulation package.
+  *
+  * Copyright (c) 2012, by the GROMACS development team, led by
+  * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+  * others, as listed in the AUTHORS file in the top-level source
+  * directory and at http://www.gromacs.org.
+  *
+  * GROMACS is free software; you can redistribute it and/or
+  * modify it under the terms of the GNU Lesser General Public License
+  * as published by the Free Software Foundation; either version 2.1
+  * of the License, or (at your option) any later version.
+  *
+  * GROMACS is distributed in the hope that it will be useful,
+  * but WITHOUT ANY WARRANTY; without even the implied warranty of
+  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  * Lesser General Public License for more details.
+  *
+  * You should have received a copy of the GNU Lesser General Public
+  * License along with GROMACS; if not, see
+  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+  *
+  * If you want to redistribute modifications to GROMACS, please
+  * consider that scientific software is very special. Version
+  * control is crucial - bugs must be traceable. We will be happy to
+  * consider code for inclusion in the official distribution, but
+  * derived work must not be called official GROMACS. Details are found
+  * in the README & COPYING files - if they are missing, get the
+  * official version at http://www.gromacs.org.
+  *
+  * To help us fund GROMACS development, we humbly ask that you cite
+  * the research papers on the package. Check out http://www.gromacs.org.
+  */
+ /*
+  * Note: this file was generated by the GROMACS sparc64_hpc_ace_double kernel generator.
+  */
+ #ifdef HAVE_CONFIG_H
+ #include <config.h>
+ #endif
+ #include <math.h>
+ #include "../nb_kernel.h"
+ #include "types/simple.h"
+ #include "vec.h"
+ #include "nrnb.h"
+ #include "kernelutil_sparc64_hpc_ace_double.h"
+ /*
+  * Gromacs nonbonded kernel:   nb_kernel_ElecRFCut_VdwCSTab_GeomW3W3_VF_sparc64_hpc_ace_double
+  * Electrostatics interaction: ReactionField
+  * VdW interaction:            CubicSplineTable
+  * Geometry:                   Water3-Water3
+  * Calculate force/pot:        PotentialAndForce
+  */
+ void
+ nb_kernel_ElecRFCut_VdwCSTab_GeomW3W3_VF_sparc64_hpc_ace_double
+                     (t_nblist * gmx_restrict                nlist,
+                      rvec * gmx_restrict                    xx,
+                      rvec * gmx_restrict                    ff,
+                      t_forcerec * gmx_restrict              fr,
+                      t_mdatoms * gmx_restrict               mdatoms,
+                      nb_kernel_data_t * gmx_restrict        kernel_data,
+                      t_nrnb * gmx_restrict                  nrnb)
+ {
+     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+      * just 0 for non-waters.
+      * Suffixes A,B refer to j loop unrolling done with double precision SIMD, i.e. for the two different
+      * jnr indices corresponding to data put in the two positions in the SIMD register.
+      */
+     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+     int              jnrA,jnrB;
+     int              j_coord_offsetA,j_coord_offsetB;
+     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+     real             rcutoff_scalar;
+     real             *shiftvec,*fshift,*x,*f;
+     _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+     int              vdwioffset0;
+     _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+     int              vdwioffset1;
+     _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+     int              vdwioffset2;
+     _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+     int              vdwjidx0A,vdwjidx0B;
+     _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+     int              vdwjidx1A,vdwjidx1B;
+     _fjsp_v2r8       jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+     int              vdwjidx2A,vdwjidx2B;
+     _fjsp_v2r8       jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+     _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+     _fjsp_v2r8       dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
+     _fjsp_v2r8       dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
+     _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+     _fjsp_v2r8       dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+     _fjsp_v2r8       dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+     _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+     _fjsp_v2r8       dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+     _fjsp_v2r8       dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+     _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+     real             *charge;
+     int              nvdwtype;
+     _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+     int              *vdwtype;
+     real             *vdwparam;
+     _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+     _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+     _fjsp_v2r8       rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF,twovfeps;
+     real             *vftab;
+     _fjsp_v2r8       itab_tmp;
+     _fjsp_v2r8       dummy_mask,cutoff_mask;
+     _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+     _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+     union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+     x                = xx[0];
+     f                = ff[0];
+     nri              = nlist->nri;
+     iinr             = nlist->iinr;
+     jindex           = nlist->jindex;
+     jjnr             = nlist->jjnr;
+     shiftidx         = nlist->shift;
+     gid              = nlist->gid;
+     shiftvec         = fr->shift_vec[0];
+     fshift           = fr->fshift[0];
+     facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+     charge           = mdatoms->chargeA;
+     krf              = gmx_fjsp_set1_v2r8(fr->ic->k_rf);
+     krf2             = gmx_fjsp_set1_v2r8(fr->ic->k_rf*2.0);
+     crf              = gmx_fjsp_set1_v2r8(fr->ic->c_rf);
+     nvdwtype         = fr->ntype;
+     vdwparam         = fr->nbfp;
+     vdwtype          = mdatoms->typeA;
+     vftab            = kernel_data->table_vdw->data;
+     vftabscale       = gmx_fjsp_set1_v2r8(kernel_data->table_vdw->scale);
+     /* Setup water-specific parameters */
+     inr              = nlist->iinr[0];
+     iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+0]));
+     iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+     iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+     vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+     jq0              = gmx_fjsp_set1_v2r8(charge[inr+0]);
+     jq1              = gmx_fjsp_set1_v2r8(charge[inr+1]);
+     jq2              = gmx_fjsp_set1_v2r8(charge[inr+2]);
+     vdwjidx0A        = 2*vdwtype[inr+0];
+     qq00             = _fjsp_mul_v2r8(iq0,jq0);
+     c6_00            = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset0+vdwjidx0A]);
+     c12_00           = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset0+vdwjidx0A+1]);
+     qq01             = _fjsp_mul_v2r8(iq0,jq1);
+     qq02             = _fjsp_mul_v2r8(iq0,jq2);
+     qq10             = _fjsp_mul_v2r8(iq1,jq0);
+     qq11             = _fjsp_mul_v2r8(iq1,jq1);
+     qq12             = _fjsp_mul_v2r8(iq1,jq2);
+     qq20             = _fjsp_mul_v2r8(iq2,jq0);
+     qq21             = _fjsp_mul_v2r8(iq2,jq1);
+     qq22             = _fjsp_mul_v2r8(iq2,jq2);
+     /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+     rcutoff_scalar   = fr->rcoulomb;
+     rcutoff          = gmx_fjsp_set1_v2r8(rcutoff_scalar);
+     rcutoff2         = _fjsp_mul_v2r8(rcutoff,rcutoff);
+     /* Avoid stupid compiler warnings */
+     jnrA = jnrB = 0;
+     j_coord_offsetA = 0;
+     j_coord_offsetB = 0;
+     outeriter        = 0;
+     inneriter        = 0;
+     /* Start outer loop over neighborlists */
+     for(iidx=0; iidx<nri; iidx++)
+     {
+         /* Load shift vector for this list */
+         i_shift_offset   = DIM*shiftidx[iidx];
+         /* Load limits for loop over neighbors */
+         j_index_start    = jindex[iidx];
+         j_index_end      = jindex[iidx+1];
+         /* Get outer coordinate index */
+         inr              = iinr[iidx];
+         i_coord_offset   = DIM*inr;
+         /* Load i particle coords and add shift vector */
+         gmx_fjsp_load_shift_and_3rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                  &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+         fix0             = _fjsp_setzero_v2r8();
+         fiy0             = _fjsp_setzero_v2r8();
+         fiz0             = _fjsp_setzero_v2r8();
+         fix1             = _fjsp_setzero_v2r8();
+         fiy1             = _fjsp_setzero_v2r8();
+         fiz1             = _fjsp_setzero_v2r8();
+         fix2             = _fjsp_setzero_v2r8();
+         fiy2             = _fjsp_setzero_v2r8();
+         fiz2             = _fjsp_setzero_v2r8();
+         /* Reset potential sums */
+         velecsum         = _fjsp_setzero_v2r8();
+         vvdwsum          = _fjsp_setzero_v2r8();
+         /* Start inner kernel loop */
+         for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+         {
+             /* Get j neighbor index, and coordinate index */
+             jnrA             = jjnr[jidx];
+             jnrB             = jjnr[jidx+1];
+             j_coord_offsetA  = DIM*jnrA;
+             j_coord_offsetB  = DIM*jnrB;
+             /* load j atom coordinates */
+             gmx_fjsp_load_3rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+             /* Calculate displacement vector */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             dx01             = _fjsp_sub_v2r8(ix0,jx1);
+             dy01             = _fjsp_sub_v2r8(iy0,jy1);
+             dz01             = _fjsp_sub_v2r8(iz0,jz1);
+             dx02             = _fjsp_sub_v2r8(ix0,jx2);
+             dy02             = _fjsp_sub_v2r8(iy0,jy2);
+             dz02             = _fjsp_sub_v2r8(iz0,jz2);
+             dx10             = _fjsp_sub_v2r8(ix1,jx0);
+             dy10             = _fjsp_sub_v2r8(iy1,jy0);
+             dz10             = _fjsp_sub_v2r8(iz1,jz0);
+             dx11             = _fjsp_sub_v2r8(ix1,jx1);
+             dy11             = _fjsp_sub_v2r8(iy1,jy1);
+             dz11             = _fjsp_sub_v2r8(iz1,jz1);
+             dx12             = _fjsp_sub_v2r8(ix1,jx2);
+             dy12             = _fjsp_sub_v2r8(iy1,jy2);
+             dz12             = _fjsp_sub_v2r8(iz1,jz2);
+             dx20             = _fjsp_sub_v2r8(ix2,jx0);
+             dy20             = _fjsp_sub_v2r8(iy2,jy0);
+             dz20             = _fjsp_sub_v2r8(iz2,jz0);
+             dx21             = _fjsp_sub_v2r8(ix2,jx1);
+             dy21             = _fjsp_sub_v2r8(iy2,jy1);
+             dz21             = _fjsp_sub_v2r8(iz2,jz1);
+             dx22             = _fjsp_sub_v2r8(ix2,jx2);
+             dy22             = _fjsp_sub_v2r8(iy2,jy2);
+             dz22             = _fjsp_sub_v2r8(iz2,jz2);
+             /* Calculate squared distance and things based on it */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rsq01            = gmx_fjsp_calc_rsq_v2r8(dx01,dy01,dz01);
+             rsq02            = gmx_fjsp_calc_rsq_v2r8(dx02,dy02,dz02);
+             rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+             rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+             rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+             rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+             rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+             rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+             rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+             rinv01           = gmx_fjsp_invsqrt_v2r8(rsq01);
+             rinv02           = gmx_fjsp_invsqrt_v2r8(rsq02);
+             rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+             rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+             rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+             rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+             rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+             rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+             rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+             rinvsq01         = _fjsp_mul_v2r8(rinv01,rinv01);
+             rinvsq02         = _fjsp_mul_v2r8(rinv02,rinv02);
+             rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+             rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+             rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+             rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+             rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+             rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+             fjx0             = _fjsp_setzero_v2r8();
+             fjy0             = _fjsp_setzero_v2r8();
+             fjz0             = _fjsp_setzero_v2r8();
+             fjx1             = _fjsp_setzero_v2r8();
+             fjy1             = _fjsp_setzero_v2r8();
+             fjz1             = _fjsp_setzero_v2r8();
+             fjx2             = _fjsp_setzero_v2r8();
+             fjy2             = _fjsp_setzero_v2r8();
+             fjz2             = _fjsp_setzero_v2r8();
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+             {
+             r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+             /* Calculate table index by multiplying r with table scale and truncate to integer */
+             rt               = _fjsp_mul_v2r8(r00,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 8;
+             vfconv.i[1]     *= 8;
+             /* REACTION-FIELD ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq00,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq00,rinv00),crf));
+             felec            = _fjsp_mul_v2r8(qq00,_fjsp_msub_v2r8(rinv00,rinvsq00,krf2));
+             /* CUBIC SPLINE TABLE DISPERSION */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 2 );
+             H                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 2 );
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+             VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+             vvdw6            = _fjsp_mul_v2r8(c6_00,VV);
+             FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+             fvdw6            = _fjsp_mul_v2r8(c6_00,FF);
+             /* CUBIC SPLINE TABLE REPULSION */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 4 );
+             F                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 4 );
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 6 );
+             H                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 6 );
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+             VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+             vvdw12           = _fjsp_mul_v2r8(c12_00,VV);
+             FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+             fvdw12           = _fjsp_mul_v2r8(c12_00,FF);
+             vvdw             = _fjsp_add_v2r8(vvdw12,vvdw6);
+             fvdw             = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_add_v2r8(fvdw6,fvdw12),_fjsp_mul_v2r8(vftabscale,rinv00)));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             vvdw             = _fjsp_and_v2r8(vvdw,cutoff_mask);
+             vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+             fscal            = _fjsp_add_v2r8(felec,fvdw);
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             
+             fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq01,rcutoff2))
+             {
+             /* REACTION-FIELD ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq01,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq01,rinv01),crf));
+             felec            = _fjsp_mul_v2r8(qq01,_fjsp_msub_v2r8(rinv01,rinvsq01,krf2));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq01,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx01,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy01,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz01,fscal,fiz0);
+             
+             fjx1             = _fjsp_madd_v2r8(dx01,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy01,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz01,fscal,fjz1);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq02,rcutoff2))
+             {
+             /* REACTION-FIELD ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq02,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq02,rinv02),crf));
+             felec            = _fjsp_mul_v2r8(qq02,_fjsp_msub_v2r8(rinv02,rinvsq02,krf2));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq02,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx02,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy02,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz02,fscal,fiz0);
+             
+             fjx2             = _fjsp_madd_v2r8(dx02,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy02,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz02,fscal,fjz2);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq10,rcutoff2))
+             {
+             /* REACTION-FIELD ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq10,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq10,rinv10),crf));
+             felec            = _fjsp_mul_v2r8(qq10,_fjsp_msub_v2r8(rinv10,rinvsq10,krf2));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq10,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+             
+             fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq11,rcutoff2))
+             {
+             /* REACTION-FIELD ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq11,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq11,rinv11),crf));
+             felec            = _fjsp_mul_v2r8(qq11,_fjsp_msub_v2r8(rinv11,rinvsq11,krf2));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq11,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+             
+             fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq12,rcutoff2))
+             {
+             /* REACTION-FIELD ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq12,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq12,rinv12),crf));
+             felec            = _fjsp_mul_v2r8(qq12,_fjsp_msub_v2r8(rinv12,rinvsq12,krf2));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq12,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+             
+             fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq20,rcutoff2))
+             {
+             /* REACTION-FIELD ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq20,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq20,rinv20),crf));
+             felec            = _fjsp_mul_v2r8(qq20,_fjsp_msub_v2r8(rinv20,rinvsq20,krf2));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq20,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+             
+             fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq21,rcutoff2))
+             {
+             /* REACTION-FIELD ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq21,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq21,rinv21),crf));
+             felec            = _fjsp_mul_v2r8(qq21,_fjsp_msub_v2r8(rinv21,rinvsq21,krf2));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq21,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+             
+             fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq22,rcutoff2))
+             {
+             /* REACTION-FIELD ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq22,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq22,rinv22),crf));
+             felec            = _fjsp_mul_v2r8(qq22,_fjsp_msub_v2r8(rinv22,rinvsq22,krf2));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq22,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+             
+             fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+             }
+             gmx_fjsp_decrement_3rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+             /* Inner loop uses 387 flops */
+         }
+         if(jidx<j_index_end)
+         {
+             jnrA             = jjnr[jidx];
+             j_coord_offsetA  = DIM*jnrA;
+             /* load j atom coordinates */
+             gmx_fjsp_load_3rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+             /* Calculate displacement vector */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             dx01             = _fjsp_sub_v2r8(ix0,jx1);
+             dy01             = _fjsp_sub_v2r8(iy0,jy1);
+             dz01             = _fjsp_sub_v2r8(iz0,jz1);
+             dx02             = _fjsp_sub_v2r8(ix0,jx2);
+             dy02             = _fjsp_sub_v2r8(iy0,jy2);
+             dz02             = _fjsp_sub_v2r8(iz0,jz2);
+             dx10             = _fjsp_sub_v2r8(ix1,jx0);
+             dy10             = _fjsp_sub_v2r8(iy1,jy0);
+             dz10             = _fjsp_sub_v2r8(iz1,jz0);
+             dx11             = _fjsp_sub_v2r8(ix1,jx1);
+             dy11             = _fjsp_sub_v2r8(iy1,jy1);
+             dz11             = _fjsp_sub_v2r8(iz1,jz1);
+             dx12             = _fjsp_sub_v2r8(ix1,jx2);
+             dy12             = _fjsp_sub_v2r8(iy1,jy2);
+             dz12             = _fjsp_sub_v2r8(iz1,jz2);
+             dx20             = _fjsp_sub_v2r8(ix2,jx0);
+             dy20             = _fjsp_sub_v2r8(iy2,jy0);
+             dz20             = _fjsp_sub_v2r8(iz2,jz0);
+             dx21             = _fjsp_sub_v2r8(ix2,jx1);
+             dy21             = _fjsp_sub_v2r8(iy2,jy1);
+             dz21             = _fjsp_sub_v2r8(iz2,jz1);
+             dx22             = _fjsp_sub_v2r8(ix2,jx2);
+             dy22             = _fjsp_sub_v2r8(iy2,jy2);
+             dz22             = _fjsp_sub_v2r8(iz2,jz2);
+             /* Calculate squared distance and things based on it */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rsq01            = gmx_fjsp_calc_rsq_v2r8(dx01,dy01,dz01);
+             rsq02            = gmx_fjsp_calc_rsq_v2r8(dx02,dy02,dz02);
+             rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+             rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+             rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+             rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+             rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+             rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+             rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+             rinv01           = gmx_fjsp_invsqrt_v2r8(rsq01);
+             rinv02           = gmx_fjsp_invsqrt_v2r8(rsq02);
+             rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+             rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+             rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+             rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+             rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+             rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+             rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+             rinvsq01         = _fjsp_mul_v2r8(rinv01,rinv01);
+             rinvsq02         = _fjsp_mul_v2r8(rinv02,rinv02);
+             rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+             rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+             rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+             rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+             rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+             rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+             fjx0             = _fjsp_setzero_v2r8();
+             fjy0             = _fjsp_setzero_v2r8();
+             fjz0             = _fjsp_setzero_v2r8();
+             fjx1             = _fjsp_setzero_v2r8();
+             fjy1             = _fjsp_setzero_v2r8();
+             fjz1             = _fjsp_setzero_v2r8();
+             fjx2             = _fjsp_setzero_v2r8();
+             fjy2             = _fjsp_setzero_v2r8();
+             fjz2             = _fjsp_setzero_v2r8();
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+             {
+             r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+             /* Calculate table index by multiplying r with table scale and truncate to integer */
+             rt               = _fjsp_mul_v2r8(r00,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 8;
+             vfconv.i[1]     *= 8;
+             /* REACTION-FIELD ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq00,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq00,rinv00),crf));
+             felec            = _fjsp_mul_v2r8(qq00,_fjsp_msub_v2r8(rinv00,rinvsq00,krf2));
+             /* CUBIC SPLINE TABLE DISPERSION */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 2 );
+             H                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+             VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+             vvdw6            = _fjsp_mul_v2r8(c6_00,VV);
+             FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+             fvdw6            = _fjsp_mul_v2r8(c6_00,FF);
+             /* CUBIC SPLINE TABLE REPULSION */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 4 );
+             F                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 6 );
+             H                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+             VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+             vvdw12           = _fjsp_mul_v2r8(c12_00,VV);
+             FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+             fvdw12           = _fjsp_mul_v2r8(c12_00,FF);
+             vvdw             = _fjsp_add_v2r8(vvdw12,vvdw6);
+             fvdw             = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_add_v2r8(fvdw6,fvdw12),_fjsp_mul_v2r8(vftabscale,rinv00)));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             vvdw             = _fjsp_and_v2r8(vvdw,cutoff_mask);
+             vvdw             = _fjsp_unpacklo_v2r8(vvdw,_fjsp_setzero_v2r8());
+             vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+             fscal            = _fjsp_add_v2r8(felec,fvdw);
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             
+             fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq01,rcutoff2))
+             {
+             /* REACTION-FIELD ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq01,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq01,rinv01),crf));
+             felec            = _fjsp_mul_v2r8(qq01,_fjsp_msub_v2r8(rinv01,rinvsq01,krf2));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq01,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx01,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy01,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz01,fscal,fiz0);
+             
+             fjx1             = _fjsp_madd_v2r8(dx01,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy01,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz01,fscal,fjz1);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq02,rcutoff2))
+             {
+             /* REACTION-FIELD ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq02,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq02,rinv02),crf));
+             felec            = _fjsp_mul_v2r8(qq02,_fjsp_msub_v2r8(rinv02,rinvsq02,krf2));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq02,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx02,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy02,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz02,fscal,fiz0);
+             
+             fjx2             = _fjsp_madd_v2r8(dx02,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy02,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz02,fscal,fjz2);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq10,rcutoff2))
+             {
+             /* REACTION-FIELD ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq10,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq10,rinv10),crf));
+             felec            = _fjsp_mul_v2r8(qq10,_fjsp_msub_v2r8(rinv10,rinvsq10,krf2));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq10,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+             
+             fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq11,rcutoff2))
+             {
+             /* REACTION-FIELD ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq11,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq11,rinv11),crf));
+             felec            = _fjsp_mul_v2r8(qq11,_fjsp_msub_v2r8(rinv11,rinvsq11,krf2));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq11,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+             
+             fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq12,rcutoff2))
+             {
+             /* REACTION-FIELD ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq12,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq12,rinv12),crf));
+             felec            = _fjsp_mul_v2r8(qq12,_fjsp_msub_v2r8(rinv12,rinvsq12,krf2));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq12,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+             
+             fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq20,rcutoff2))
+             {
+             /* REACTION-FIELD ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq20,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq20,rinv20),crf));
+             felec            = _fjsp_mul_v2r8(qq20,_fjsp_msub_v2r8(rinv20,rinvsq20,krf2));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq20,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+             
+             fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq21,rcutoff2))
+             {
+             /* REACTION-FIELD ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq21,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq21,rinv21),crf));
+             felec            = _fjsp_mul_v2r8(qq21,_fjsp_msub_v2r8(rinv21,rinvsq21,krf2));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq21,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+             
+             fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq22,rcutoff2))
+             {
+             /* REACTION-FIELD ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq22,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq22,rinv22),crf));
+             felec            = _fjsp_mul_v2r8(qq22,_fjsp_msub_v2r8(rinv22,rinvsq22,krf2));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq22,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+             
+             fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+             }
+             gmx_fjsp_decrement_3rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+             /* Inner loop uses 387 flops */
+         }
+         /* End of innermost loop */
+         gmx_fjsp_update_iforce_3atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                               f+i_coord_offset,fshift+i_shift_offset);
+         ggid                        = gid[iidx];
+         /* Update potential energies */
+         gmx_fjsp_update_1pot_v2r8(velecsum,kernel_data->energygrp_elec+ggid);
+         gmx_fjsp_update_1pot_v2r8(vvdwsum,kernel_data->energygrp_vdw+ggid);
+         /* Increment number of inner iterations */
+         inneriter                  += j_index_end - j_index_start;
+         /* Outer loop uses 20 flops */
+     }
+     /* Increment number of outer iterations */
+     outeriter        += nri;
+     /* Update outer/inner flops */
+     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_VF,outeriter*20 + inneriter*387);
+ }
+ /*
+  * Gromacs nonbonded kernel:   nb_kernel_ElecRFCut_VdwCSTab_GeomW3W3_F_sparc64_hpc_ace_double
+  * Electrostatics interaction: ReactionField
+  * VdW interaction:            CubicSplineTable
+  * Geometry:                   Water3-Water3
+  * Calculate force/pot:        Force
+  */
+ void
+ nb_kernel_ElecRFCut_VdwCSTab_GeomW3W3_F_sparc64_hpc_ace_double
+                     (t_nblist * gmx_restrict                nlist,
+                      rvec * gmx_restrict                    xx,
+                      rvec * gmx_restrict                    ff,
+                      t_forcerec * gmx_restrict              fr,
+                      t_mdatoms * gmx_restrict               mdatoms,
+                      nb_kernel_data_t * gmx_restrict        kernel_data,
+                      t_nrnb * gmx_restrict                  nrnb)
+ {
+     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+      * just 0 for non-waters.
+      * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+      * jnr indices corresponding to data put in the two positions in the SIMD register.
+      */
+     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+     int              jnrA,jnrB;
+     int              j_coord_offsetA,j_coord_offsetB;
+     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+     real             rcutoff_scalar;
+     real             *shiftvec,*fshift,*x,*f;
+     _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+     int              vdwioffset0;
+     _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+     int              vdwioffset1;
+     _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+     int              vdwioffset2;
+     _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+     int              vdwjidx0A,vdwjidx0B;
+     _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+     int              vdwjidx1A,vdwjidx1B;
+     _fjsp_v2r8       jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+     int              vdwjidx2A,vdwjidx2B;
+     _fjsp_v2r8       jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+     _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+     _fjsp_v2r8       dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
+     _fjsp_v2r8       dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
+     _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+     _fjsp_v2r8       dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+     _fjsp_v2r8       dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+     _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+     _fjsp_v2r8       dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+     _fjsp_v2r8       dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+     _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+     real             *charge;
+     int              nvdwtype;
+     _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+     int              *vdwtype;
+     real             *vdwparam;
+     _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+     _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+     _fjsp_v2r8       rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF,twovfeps;
+     real             *vftab;
+     _fjsp_v2r8       itab_tmp;
+     _fjsp_v2r8       dummy_mask,cutoff_mask;
+     _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+     _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+     union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+     x                = xx[0];
+     f                = ff[0];
+     nri              = nlist->nri;
+     iinr             = nlist->iinr;
+     jindex           = nlist->jindex;
+     jjnr             = nlist->jjnr;
+     shiftidx         = nlist->shift;
+     gid              = nlist->gid;
+     shiftvec         = fr->shift_vec[0];
+     fshift           = fr->fshift[0];
+     facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+     charge           = mdatoms->chargeA;
+     krf              = gmx_fjsp_set1_v2r8(fr->ic->k_rf);
+     krf2             = gmx_fjsp_set1_v2r8(fr->ic->k_rf*2.0);
+     crf              = gmx_fjsp_set1_v2r8(fr->ic->c_rf);
+     nvdwtype         = fr->ntype;
+     vdwparam         = fr->nbfp;
+     vdwtype          = mdatoms->typeA;
+     vftab            = kernel_data->table_vdw->data;
+     vftabscale       = gmx_fjsp_set1_v2r8(kernel_data->table_vdw->scale);
+     /* Setup water-specific parameters */
+     inr              = nlist->iinr[0];
+     iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+0]));
+     iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+     iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+     vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+     jq0              = gmx_fjsp_set1_v2r8(charge[inr+0]);
+     jq1              = gmx_fjsp_set1_v2r8(charge[inr+1]);
+     jq2              = gmx_fjsp_set1_v2r8(charge[inr+2]);
+     vdwjidx0A        = 2*vdwtype[inr+0];
+     qq00             = _fjsp_mul_v2r8(iq0,jq0);
+     c6_00            = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset0+vdwjidx0A]);
+     c12_00           = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset0+vdwjidx0A+1]);
+     qq01             = _fjsp_mul_v2r8(iq0,jq1);
+     qq02             = _fjsp_mul_v2r8(iq0,jq2);
+     qq10             = _fjsp_mul_v2r8(iq1,jq0);
+     qq11             = _fjsp_mul_v2r8(iq1,jq1);
+     qq12             = _fjsp_mul_v2r8(iq1,jq2);
+     qq20             = _fjsp_mul_v2r8(iq2,jq0);
+     qq21             = _fjsp_mul_v2r8(iq2,jq1);
+     qq22             = _fjsp_mul_v2r8(iq2,jq2);
+     /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+     rcutoff_scalar   = fr->rcoulomb;
+     rcutoff          = gmx_fjsp_set1_v2r8(rcutoff_scalar);
+     rcutoff2         = _fjsp_mul_v2r8(rcutoff,rcutoff);
+     /* Avoid stupid compiler warnings */
+     jnrA = jnrB = 0;
+     j_coord_offsetA = 0;
+     j_coord_offsetB = 0;
+     outeriter        = 0;
+     inneriter        = 0;
+     /* Start outer loop over neighborlists */
+     for(iidx=0; iidx<nri; iidx++)
+     {
+         /* Load shift vector for this list */
+         i_shift_offset   = DIM*shiftidx[iidx];
+         /* Load limits for loop over neighbors */
+         j_index_start    = jindex[iidx];
+         j_index_end      = jindex[iidx+1];
+         /* Get outer coordinate index */
+         inr              = iinr[iidx];
+         i_coord_offset   = DIM*inr;
+         /* Load i particle coords and add shift vector */
+         gmx_fjsp_load_shift_and_3rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                  &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+         fix0             = _fjsp_setzero_v2r8();
+         fiy0             = _fjsp_setzero_v2r8();
+         fiz0             = _fjsp_setzero_v2r8();
+         fix1             = _fjsp_setzero_v2r8();
+         fiy1             = _fjsp_setzero_v2r8();
+         fiz1             = _fjsp_setzero_v2r8();
+         fix2             = _fjsp_setzero_v2r8();
+         fiy2             = _fjsp_setzero_v2r8();
+         fiz2             = _fjsp_setzero_v2r8();
+         /* Start inner kernel loop */
+         for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+         {
+             /* Get j neighbor index, and coordinate index */
+             jnrA             = jjnr[jidx];
+             jnrB             = jjnr[jidx+1];
+             j_coord_offsetA  = DIM*jnrA;
+             j_coord_offsetB  = DIM*jnrB;
+             /* load j atom coordinates */
+             gmx_fjsp_load_3rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+             /* Calculate displacement vector */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             dx01             = _fjsp_sub_v2r8(ix0,jx1);
+             dy01             = _fjsp_sub_v2r8(iy0,jy1);
+             dz01             = _fjsp_sub_v2r8(iz0,jz1);
+             dx02             = _fjsp_sub_v2r8(ix0,jx2);
+             dy02             = _fjsp_sub_v2r8(iy0,jy2);
+             dz02             = _fjsp_sub_v2r8(iz0,jz2);
+             dx10             = _fjsp_sub_v2r8(ix1,jx0);
+             dy10             = _fjsp_sub_v2r8(iy1,jy0);
+             dz10             = _fjsp_sub_v2r8(iz1,jz0);
+             dx11             = _fjsp_sub_v2r8(ix1,jx1);
+             dy11             = _fjsp_sub_v2r8(iy1,jy1);
+             dz11             = _fjsp_sub_v2r8(iz1,jz1);
+             dx12             = _fjsp_sub_v2r8(ix1,jx2);
+             dy12             = _fjsp_sub_v2r8(iy1,jy2);
+             dz12             = _fjsp_sub_v2r8(iz1,jz2);
+             dx20             = _fjsp_sub_v2r8(ix2,jx0);
+             dy20             = _fjsp_sub_v2r8(iy2,jy0);
+             dz20             = _fjsp_sub_v2r8(iz2,jz0);
+             dx21             = _fjsp_sub_v2r8(ix2,jx1);
+             dy21             = _fjsp_sub_v2r8(iy2,jy1);
+             dz21             = _fjsp_sub_v2r8(iz2,jz1);
+             dx22             = _fjsp_sub_v2r8(ix2,jx2);
+             dy22             = _fjsp_sub_v2r8(iy2,jy2);
+             dz22             = _fjsp_sub_v2r8(iz2,jz2);
+             /* Calculate squared distance and things based on it */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rsq01            = gmx_fjsp_calc_rsq_v2r8(dx01,dy01,dz01);
+             rsq02            = gmx_fjsp_calc_rsq_v2r8(dx02,dy02,dz02);
+             rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+             rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+             rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+             rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+             rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+             rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+             rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+             rinv01           = gmx_fjsp_invsqrt_v2r8(rsq01);
+             rinv02           = gmx_fjsp_invsqrt_v2r8(rsq02);
+             rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+             rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+             rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+             rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+             rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+             rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+             rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+             rinvsq01         = _fjsp_mul_v2r8(rinv01,rinv01);
+             rinvsq02         = _fjsp_mul_v2r8(rinv02,rinv02);
+             rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+             rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+             rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+             rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+             rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+             rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+             fjx0             = _fjsp_setzero_v2r8();
+             fjy0             = _fjsp_setzero_v2r8();
+             fjz0             = _fjsp_setzero_v2r8();
+             fjx1             = _fjsp_setzero_v2r8();
+             fjy1             = _fjsp_setzero_v2r8();
+             fjz1             = _fjsp_setzero_v2r8();
+             fjx2             = _fjsp_setzero_v2r8();
+             fjy2             = _fjsp_setzero_v2r8();
+             fjz2             = _fjsp_setzero_v2r8();
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+             {
+             r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+             /* Calculate table index by multiplying r with table scale and truncate to integer */
+             rt               = _fjsp_mul_v2r8(r00,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 8;
+             vfconv.i[1]     *= 8;
+             /* REACTION-FIELD ELECTROSTATICS */
+             felec            = _fjsp_mul_v2r8(qq00,_fjsp_msub_v2r8(rinv00,rinvsq00,krf2));
+             /* CUBIC SPLINE TABLE DISPERSION */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 2 );
+             H                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 2 );
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+             FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+             fvdw6            = _fjsp_mul_v2r8(c6_00,FF);
+             /* CUBIC SPLINE TABLE REPULSION */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 4 );
+             F                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 4 );
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 6 );
+             H                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 6 );
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+             FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+             fvdw12           = _fjsp_mul_v2r8(c12_00,FF);
+             fvdw             = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_add_v2r8(fvdw6,fvdw12),_fjsp_mul_v2r8(vftabscale,rinv00)));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+             fscal            = _fjsp_add_v2r8(felec,fvdw);
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             
+             fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq01,rcutoff2))
+             {
+             /* REACTION-FIELD ELECTROSTATICS */
+             felec            = _fjsp_mul_v2r8(qq01,_fjsp_msub_v2r8(rinv01,rinvsq01,krf2));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq01,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx01,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy01,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz01,fscal,fiz0);
+             
+             fjx1             = _fjsp_madd_v2r8(dx01,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy01,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz01,fscal,fjz1);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq02,rcutoff2))
+             {
+             /* REACTION-FIELD ELECTROSTATICS */
+             felec            = _fjsp_mul_v2r8(qq02,_fjsp_msub_v2r8(rinv02,rinvsq02,krf2));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq02,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx02,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy02,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz02,fscal,fiz0);
+             
+             fjx2             = _fjsp_madd_v2r8(dx02,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy02,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz02,fscal,fjz2);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq10,rcutoff2))
+             {
+             /* REACTION-FIELD ELECTROSTATICS */
+             felec            = _fjsp_mul_v2r8(qq10,_fjsp_msub_v2r8(rinv10,rinvsq10,krf2));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq10,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+             
+             fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq11,rcutoff2))
+             {
+             /* REACTION-FIELD ELECTROSTATICS */
+             felec            = _fjsp_mul_v2r8(qq11,_fjsp_msub_v2r8(rinv11,rinvsq11,krf2));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq11,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+             
+             fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq12,rcutoff2))
+             {
+             /* REACTION-FIELD ELECTROSTATICS */
+             felec            = _fjsp_mul_v2r8(qq12,_fjsp_msub_v2r8(rinv12,rinvsq12,krf2));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq12,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+             
+             fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq20,rcutoff2))
+             {
+             /* REACTION-FIELD ELECTROSTATICS */
+             felec            = _fjsp_mul_v2r8(qq20,_fjsp_msub_v2r8(rinv20,rinvsq20,krf2));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq20,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+             
+             fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq21,rcutoff2))
+             {
+             /* REACTION-FIELD ELECTROSTATICS */
+             felec            = _fjsp_mul_v2r8(qq21,_fjsp_msub_v2r8(rinv21,rinvsq21,krf2));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq21,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+             
+             fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq22,rcutoff2))
+             {
+             /* REACTION-FIELD ELECTROSTATICS */
+             felec            = _fjsp_mul_v2r8(qq22,_fjsp_msub_v2r8(rinv22,rinvsq22,krf2));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq22,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+             
+             fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+             }
+             gmx_fjsp_decrement_3rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+             /* Inner loop uses 324 flops */
+         }
+         if(jidx<j_index_end)
+         {
+             jnrA             = jjnr[jidx];
+             j_coord_offsetA  = DIM*jnrA;
+             /* load j atom coordinates */
+             gmx_fjsp_load_3rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+             /* Calculate displacement vector */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             dx01             = _fjsp_sub_v2r8(ix0,jx1);
+             dy01             = _fjsp_sub_v2r8(iy0,jy1);
+             dz01             = _fjsp_sub_v2r8(iz0,jz1);
+             dx02             = _fjsp_sub_v2r8(ix0,jx2);
+             dy02             = _fjsp_sub_v2r8(iy0,jy2);
+             dz02             = _fjsp_sub_v2r8(iz0,jz2);
+             dx10             = _fjsp_sub_v2r8(ix1,jx0);
+             dy10             = _fjsp_sub_v2r8(iy1,jy0);
+             dz10             = _fjsp_sub_v2r8(iz1,jz0);
+             dx11             = _fjsp_sub_v2r8(ix1,jx1);
+             dy11             = _fjsp_sub_v2r8(iy1,jy1);
+             dz11             = _fjsp_sub_v2r8(iz1,jz1);
+             dx12             = _fjsp_sub_v2r8(ix1,jx2);
+             dy12             = _fjsp_sub_v2r8(iy1,jy2);
+             dz12             = _fjsp_sub_v2r8(iz1,jz2);
+             dx20             = _fjsp_sub_v2r8(ix2,jx0);
+             dy20             = _fjsp_sub_v2r8(iy2,jy0);
+             dz20             = _fjsp_sub_v2r8(iz2,jz0);
+             dx21             = _fjsp_sub_v2r8(ix2,jx1);
+             dy21             = _fjsp_sub_v2r8(iy2,jy1);
+             dz21             = _fjsp_sub_v2r8(iz2,jz1);
+             dx22             = _fjsp_sub_v2r8(ix2,jx2);
+             dy22             = _fjsp_sub_v2r8(iy2,jy2);
+             dz22             = _fjsp_sub_v2r8(iz2,jz2);
+             /* Calculate squared distance and things based on it */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rsq01            = gmx_fjsp_calc_rsq_v2r8(dx01,dy01,dz01);
+             rsq02            = gmx_fjsp_calc_rsq_v2r8(dx02,dy02,dz02);
+             rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+             rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+             rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+             rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+             rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+             rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+             rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+             rinv01           = gmx_fjsp_invsqrt_v2r8(rsq01);
+             rinv02           = gmx_fjsp_invsqrt_v2r8(rsq02);
+             rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+             rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+             rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+             rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+             rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+             rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+             rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+             rinvsq01         = _fjsp_mul_v2r8(rinv01,rinv01);
+             rinvsq02         = _fjsp_mul_v2r8(rinv02,rinv02);
+             rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+             rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+             rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+             rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+             rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+             rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+             fjx0             = _fjsp_setzero_v2r8();
+             fjy0             = _fjsp_setzero_v2r8();
+             fjz0             = _fjsp_setzero_v2r8();
+             fjx1             = _fjsp_setzero_v2r8();
+             fjy1             = _fjsp_setzero_v2r8();
+             fjz1             = _fjsp_setzero_v2r8();
+             fjx2             = _fjsp_setzero_v2r8();
+             fjy2             = _fjsp_setzero_v2r8();
+             fjz2             = _fjsp_setzero_v2r8();
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+             {
+             r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+             /* Calculate table index by multiplying r with table scale and truncate to integer */
+             rt               = _fjsp_mul_v2r8(r00,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 8;
+             vfconv.i[1]     *= 8;
+             /* REACTION-FIELD ELECTROSTATICS */
+             felec            = _fjsp_mul_v2r8(qq00,_fjsp_msub_v2r8(rinv00,rinvsq00,krf2));
+             /* CUBIC SPLINE TABLE DISPERSION */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 2 );
+             H                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+             FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+             fvdw6            = _fjsp_mul_v2r8(c6_00,FF);
+             /* CUBIC SPLINE TABLE REPULSION */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 4 );
+             F                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 6 );
+             H                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+             FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+             fvdw12           = _fjsp_mul_v2r8(c12_00,FF);
+             fvdw             = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_add_v2r8(fvdw6,fvdw12),_fjsp_mul_v2r8(vftabscale,rinv00)));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+             fscal            = _fjsp_add_v2r8(felec,fvdw);
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             
+             fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq01,rcutoff2))
+             {
+             /* REACTION-FIELD ELECTROSTATICS */
+             felec            = _fjsp_mul_v2r8(qq01,_fjsp_msub_v2r8(rinv01,rinvsq01,krf2));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq01,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx01,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy01,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz01,fscal,fiz0);
+             
+             fjx1             = _fjsp_madd_v2r8(dx01,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy01,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz01,fscal,fjz1);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq02,rcutoff2))
+             {
+             /* REACTION-FIELD ELECTROSTATICS */
+             felec            = _fjsp_mul_v2r8(qq02,_fjsp_msub_v2r8(rinv02,rinvsq02,krf2));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq02,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx02,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy02,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz02,fscal,fiz0);
+             
+             fjx2             = _fjsp_madd_v2r8(dx02,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy02,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz02,fscal,fjz2);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq10,rcutoff2))
+             {
+             /* REACTION-FIELD ELECTROSTATICS */
+             felec            = _fjsp_mul_v2r8(qq10,_fjsp_msub_v2r8(rinv10,rinvsq10,krf2));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq10,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+             
+             fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq11,rcutoff2))
+             {
+             /* REACTION-FIELD ELECTROSTATICS */
+             felec            = _fjsp_mul_v2r8(qq11,_fjsp_msub_v2r8(rinv11,rinvsq11,krf2));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq11,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+             
+             fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq12,rcutoff2))
+             {
+             /* REACTION-FIELD ELECTROSTATICS */
+             felec            = _fjsp_mul_v2r8(qq12,_fjsp_msub_v2r8(rinv12,rinvsq12,krf2));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq12,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+             
+             fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq20,rcutoff2))
+             {
+             /* REACTION-FIELD ELECTROSTATICS */
+             felec            = _fjsp_mul_v2r8(qq20,_fjsp_msub_v2r8(rinv20,rinvsq20,krf2));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq20,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+             
+             fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq21,rcutoff2))
+             {
+             /* REACTION-FIELD ELECTROSTATICS */
+             felec            = _fjsp_mul_v2r8(qq21,_fjsp_msub_v2r8(rinv21,rinvsq21,krf2));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq21,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+             
+             fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq22,rcutoff2))
+             {
+             /* REACTION-FIELD ELECTROSTATICS */
+             felec            = _fjsp_mul_v2r8(qq22,_fjsp_msub_v2r8(rinv22,rinvsq22,krf2));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq22,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+             
+             fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+             }
+             gmx_fjsp_decrement_3rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+             /* Inner loop uses 324 flops */
+         }
+         /* End of innermost loop */
+         gmx_fjsp_update_iforce_3atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                               f+i_coord_offset,fshift+i_shift_offset);
+         /* Increment number of inner iterations */
+         inneriter                  += j_index_end - j_index_start;
+         /* Outer loop uses 18 flops */
+     }
+     /* Increment number of outer iterations */
+     outeriter        += nri;
+     /* Update outer/inner flops */
+     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_F,outeriter*18 + inneriter*324);
+ }
index 0000000000000000000000000000000000000000,75eba662b17862e48c3d52a9cbb79d6936865884..75eba662b17862e48c3d52a9cbb79d6936865884
mode 000000,100644..100644
--- /dev/null
@@@ -1,0 -1,1221 +1,1221 @@@
+ /*
+  * This file is part of the GROMACS molecular simulation package.
+  *
+  * Copyright (c) 2012, by the GROMACS development team, led by
+  * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+  * others, as listed in the AUTHORS file in the top-level source
+  * directory and at http://www.gromacs.org.
+  *
+  * GROMACS is free software; you can redistribute it and/or
+  * modify it under the terms of the GNU Lesser General Public License
+  * as published by the Free Software Foundation; either version 2.1
+  * of the License, or (at your option) any later version.
+  *
+  * GROMACS is distributed in the hope that it will be useful,
+  * but WITHOUT ANY WARRANTY; without even the implied warranty of
+  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  * Lesser General Public License for more details.
+  *
+  * You should have received a copy of the GNU Lesser General Public
+  * License along with GROMACS; if not, see
+  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+  *
+  * If you want to redistribute modifications to GROMACS, please
+  * consider that scientific software is very special. Version
+  * control is crucial - bugs must be traceable. We will be happy to
+  * consider code for inclusion in the official distribution, but
+  * derived work must not be called official GROMACS. Details are found
+  * in the README & COPYING files - if they are missing, get the
+  * official version at http://www.gromacs.org.
+  *
+  * To help us fund GROMACS development, we humbly ask that you cite
+  * the research papers on the package. Check out http://www.gromacs.org.
+  */
+ /*
+  * Note: this file was generated by the GROMACS sparc64_hpc_ace_double kernel generator.
+  */
+ #ifdef HAVE_CONFIG_H
+ #include <config.h>
+ #endif
+ #include <math.h>
+ #include "../nb_kernel.h"
+ #include "types/simple.h"
+ #include "vec.h"
+ #include "nrnb.h"
+ #include "kernelutil_sparc64_hpc_ace_double.h"
+ /*
+  * Gromacs nonbonded kernel:   nb_kernel_ElecRFCut_VdwCSTab_GeomW4P1_VF_sparc64_hpc_ace_double
+  * Electrostatics interaction: ReactionField
+  * VdW interaction:            CubicSplineTable
+  * Geometry:                   Water4-Particle
+  * Calculate force/pot:        PotentialAndForce
+  *
+  * For every i entry of the neighbor list, four i sites (suffixes 0-3) interact
+  * with each listed j particle: site 0 gets only the tabulated VdW term (no cutoff
+  * test), and sites 1-3 get only the reaction-field electrostatics term, which is
+  * guarded by gmx_fjsp_any_lt_v2r8() and masked with _fjsp_cmplt_v2r8() against
+  * the square of fr->rcoulomb.
+  *
+  * Parameters (the fields listed are the ones this kernel reads or passes on):
+  *   nlist        nri, iinr, jindex, jjnr, shift, gid
+  *   xx           coordinates, accessed as a flat real array through xx[0]
+  *   ff           forces, accessed as a flat real array through ff[0] and handed to
+  *                the gmx_fjsp_decrement_* / gmx_fjsp_update_iforce_* helpers
+  *   fr           shift_vec, fshift, epsfac, ic->k_rf, ic->c_rf, ntype, nbfp, rcoulomb
+  *   mdatoms      chargeA, typeA
+  *   kernel_data  table_vdw (data, scale), energygrp_elec, energygrp_vdw
+  *   nrnb         flop counters, updated through inc_nrnb()
+  */
+ void
+ nb_kernel_ElecRFCut_VdwCSTab_GeomW4P1_VF_sparc64_hpc_ace_double
+                     (t_nblist * gmx_restrict                nlist,
+                      rvec * gmx_restrict                    xx,
+                      rvec * gmx_restrict                    ff,
+                      t_forcerec * gmx_restrict              fr,
+                      t_mdatoms * gmx_restrict               mdatoms,
+                      nb_kernel_data_t * gmx_restrict        kernel_data,
+                      t_nrnb * gmx_restrict                  nrnb)
+ {
+     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+      * just 0 for non-waters.
+      * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+      * jnr indices corresponding to data put in the two positions in the SIMD register.
+      *
+      * NOTE(review): several of the locals declared below (for example tx, ty, tz, jidxall,
+      * iq0, gbconv, ewconv) are never referenced in this function.
+      */
+     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+     int              jnrA,jnrB;
+     int              j_coord_offsetA,j_coord_offsetB;
+     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+     real             rcutoff_scalar;
+     real             *shiftvec,*fshift,*x,*f;
+     _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+     int              vdwioffset0;
+     _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+     int              vdwioffset1;
+     _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+     int              vdwioffset2;
+     _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+     int              vdwioffset3;
+     _fjsp_v2r8       ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+     int              vdwjidx0A,vdwjidx0B;
+     _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+     _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+     _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+     _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+     _fjsp_v2r8       dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30;
+     _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+     real             *charge;
+     int              nvdwtype;
+     _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+     int              *vdwtype;
+     real             *vdwparam;
+     _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+     _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+     _fjsp_v2r8       rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF,twovfeps;
+     real             *vftab;
+     _fjsp_v2r8       itab_tmp;
+     _fjsp_v2r8       dummy_mask,cutoff_mask;
+     _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+     _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+     union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+     x                = xx[0];
+     f                = ff[0];
+     nri              = nlist->nri;
+     iinr             = nlist->iinr;
+     jindex           = nlist->jindex;
+     jjnr             = nlist->jjnr;
+     shiftidx         = nlist->shift;
+     gid              = nlist->gid;
+     shiftvec         = fr->shift_vec[0];
+     fshift           = fr->fshift[0];
+     facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+     charge           = mdatoms->chargeA;
+     krf              = gmx_fjsp_set1_v2r8(fr->ic->k_rf);
+     krf2             = gmx_fjsp_set1_v2r8(fr->ic->k_rf*2.0);
+     crf              = gmx_fjsp_set1_v2r8(fr->ic->c_rf);
+     nvdwtype         = fr->ntype;
+     vdwparam         = fr->nbfp;
+     vdwtype          = mdatoms->typeA;
+     vftab            = kernel_data->table_vdw->data;
+     vftabscale       = gmx_fjsp_set1_v2r8(kernel_data->table_vdw->scale);
+     /* Setup water-specific parameters.
+      * The charges of sites 1-3 (pre-multiplied by facel) and the VdW parameter offset of
+      * site 0 are taken from the first i entry only and are not reloaded inside the outer loop.
+      */
+     inr              = nlist->iinr[0];
+     iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+     iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+     iq3              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+3]));
+     vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+     /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+     rcutoff_scalar   = fr->rcoulomb;
+     rcutoff          = gmx_fjsp_set1_v2r8(rcutoff_scalar);
+     rcutoff2         = _fjsp_mul_v2r8(rcutoff,rcutoff);
+     /* Give these a defined value before the loops to avoid compiler warnings */
+     jnrA = jnrB = 0;
+     j_coord_offsetA = 0;
+     j_coord_offsetB = 0;
+     outeriter        = 0;
+     inneriter        = 0;
+     /* Start outer loop over neighborlists */
+     for(iidx=0; iidx<nri; iidx++)
+     {
+         /* Load shift vector for this list */
+         i_shift_offset   = DIM*shiftidx[iidx];
+         /* Load limits for loop over neighbors */
+         j_index_start    = jindex[iidx];
+         j_index_end      = jindex[iidx+1];
+         /* Get outer coordinate index */
+         inr              = iinr[iidx];
+         i_coord_offset   = DIM*inr;
+         /* Load i particle coords and add shift vector */
+         gmx_fjsp_load_shift_and_4rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                  &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+         fix0             = _fjsp_setzero_v2r8();
+         fiy0             = _fjsp_setzero_v2r8();
+         fiz0             = _fjsp_setzero_v2r8();
+         fix1             = _fjsp_setzero_v2r8();
+         fiy1             = _fjsp_setzero_v2r8();
+         fiz1             = _fjsp_setzero_v2r8();
+         fix2             = _fjsp_setzero_v2r8();
+         fiy2             = _fjsp_setzero_v2r8();
+         fiz2             = _fjsp_setzero_v2r8();
+         fix3             = _fjsp_setzero_v2r8();
+         fiy3             = _fjsp_setzero_v2r8();
+         fiz3             = _fjsp_setzero_v2r8();
+         /* Reset potential sums */
+         velecsum         = _fjsp_setzero_v2r8();
+         vvdwsum          = _fjsp_setzero_v2r8();
+         /* Start inner kernel loop.
+          * j particles are processed two at a time (A and B); if one j particle is left over
+          * it is handled by the single-particle block that follows this loop.
+          */
+         for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+         {
+             /* Get j neighbor index, and coordinate index */
+             jnrA             = jjnr[jidx];
+             jnrB             = jjnr[jidx+1];
+             j_coord_offsetA  = DIM*jnrA;
+             j_coord_offsetB  = DIM*jnrB;
+             /* load j atom coordinates */
+             gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                               &jx0,&jy0,&jz0);
+             /* Calculate displacement vector */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             dx10             = _fjsp_sub_v2r8(ix1,jx0);
+             dy10             = _fjsp_sub_v2r8(iy1,jy0);
+             dz10             = _fjsp_sub_v2r8(iz1,jz0);
+             dx20             = _fjsp_sub_v2r8(ix2,jx0);
+             dy20             = _fjsp_sub_v2r8(iy2,jy0);
+             dz20             = _fjsp_sub_v2r8(iz2,jz0);
+             dx30             = _fjsp_sub_v2r8(ix3,jx0);
+             dy30             = _fjsp_sub_v2r8(iy3,jy0);
+             dz30             = _fjsp_sub_v2r8(iz3,jz0);
+             /* Calculate squared distance and things based on it */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+             rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+             rsq30            = gmx_fjsp_calc_rsq_v2r8(dx30,dy30,dz30);
+             rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+             rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+             rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+             rinv30           = gmx_fjsp_invsqrt_v2r8(rsq30);
+             rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+             rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+             rinvsq30         = _fjsp_mul_v2r8(rinv30,rinv30);
+             /* Load parameters for j particles */
+             jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+             vdwjidx0A        = 2*vdwtype[jnrA+0];
+             vdwjidx0B        = 2*vdwtype[jnrB+0];
+             fjx0             = _fjsp_setzero_v2r8();
+             fjy0             = _fjsp_setzero_v2r8();
+             fjz0             = _fjsp_setzero_v2r8();
+             /**************************
+              * CALCULATE INTERACTIONS *
+              * i site 0 - j: VdW only *
+              **************************/
+             r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+             /* Compute parameters for interactions between i and j atoms */
+             gmx_fjsp_load_2pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,
+                                          vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
+             /* Calculate table index by multiplying r with table scale and truncate to integer.
+              * vfeps is the remainder rt - itab. Each index is scaled by 8 because the loads below
+              * read from offsets +0, +2 (dispersion) and +4, +6 (repulsion) of each table point.
+              */
+             rt               = _fjsp_mul_v2r8(r00,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 8;
+             vfconv.i[1]     *= 8;
+             /* CUBIC SPLINE TABLE DISPERSION */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 2 );
+             H                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 2 );
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+             VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+             vvdw6            = _fjsp_mul_v2r8(c6_00,VV);
+             FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+             fvdw6            = _fjsp_mul_v2r8(c6_00,FF);
+             /* CUBIC SPLINE TABLE REPULSION */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 4 );
+             F                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 4 );
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 6 );
+             H                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 6 );
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+             VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+             vvdw12           = _fjsp_mul_v2r8(c12_00,VV);
+             FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+             fvdw12           = _fjsp_mul_v2r8(c12_00,FF);
+             vvdw             = _fjsp_add_v2r8(vvdw12,vvdw6);
+             fvdw             = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_add_v2r8(fvdw6,fvdw12),_fjsp_mul_v2r8(vftabscale,rinv00)));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+             fscal            = fvdw;
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             
+             fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              * i site 1 - j: RF only  *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq10,rcutoff2))
+             {
+             /* Compute parameters for interactions between i and j atoms */
+             qq10             = _fjsp_mul_v2r8(iq1,jq0);
+             /* REACTION-FIELD ELECTROSTATICS
+              * Assuming madd(a,b,c) = a*b+c and msub(a,b,c) = a*b-c (TODO confirm intrinsic semantics):
+              * velec = qq*(rinv + krf*rsq - crf), felec = qq*(rinv*rinvsq - krf2).
+              */
+             velec            = _fjsp_mul_v2r8(qq10,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq10,rinv10),crf));
+             felec            = _fjsp_mul_v2r8(qq10,_fjsp_msub_v2r8(rinv10,rinvsq10,krf2));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq10,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+             
+             fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              * i site 2 - j: RF only  *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq20,rcutoff2))
+             {
+             /* Compute parameters for interactions between i and j atoms */
+             qq20             = _fjsp_mul_v2r8(iq2,jq0);
+             /* REACTION-FIELD ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq20,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq20,rinv20),crf));
+             felec            = _fjsp_mul_v2r8(qq20,_fjsp_msub_v2r8(rinv20,rinvsq20,krf2));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq20,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+             
+             fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              * i site 3 - j: RF only  *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq30,rcutoff2))
+             {
+             /* Compute parameters for interactions between i and j atoms */
+             qq30             = _fjsp_mul_v2r8(iq3,jq0);
+             /* REACTION-FIELD ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq30,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq30,rinv30),crf));
+             felec            = _fjsp_mul_v2r8(qq30,_fjsp_msub_v2r8(rinv30,rinvsq30,krf2));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq30,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix3             = _fjsp_madd_v2r8(dx30,fscal,fix3);
+             fiy3             = _fjsp_madd_v2r8(dy30,fscal,fiy3);
+             fiz3             = _fjsp_madd_v2r8(dz30,fscal,fiz3);
+             
+             fjx0             = _fjsp_madd_v2r8(dx30,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy30,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz30,fscal,fjz0);
+             }
+             gmx_fjsp_decrement_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0);
+             /* Inner loop uses 179 flops */
+         }
+         if(jidx<j_index_end)
+         {
+             jnrA             = jjnr[jidx];
+             j_coord_offsetA  = DIM*jnrA;
+             /* load j atom coordinates (single remaining j particle; only the A index is used here) */
+             gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                               &jx0,&jy0,&jz0);
+             /* Calculate displacement vector */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             dx10             = _fjsp_sub_v2r8(ix1,jx0);
+             dy10             = _fjsp_sub_v2r8(iy1,jy0);
+             dz10             = _fjsp_sub_v2r8(iz1,jz0);
+             dx20             = _fjsp_sub_v2r8(ix2,jx0);
+             dy20             = _fjsp_sub_v2r8(iy2,jy0);
+             dz20             = _fjsp_sub_v2r8(iz2,jz0);
+             dx30             = _fjsp_sub_v2r8(ix3,jx0);
+             dy30             = _fjsp_sub_v2r8(iy3,jy0);
+             dz30             = _fjsp_sub_v2r8(iz3,jz0);
+             /* Calculate squared distance and things based on it */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+             rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+             rsq30            = gmx_fjsp_calc_rsq_v2r8(dx30,dy30,dz30);
+             rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+             rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+             rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+             rinv30           = gmx_fjsp_invsqrt_v2r8(rsq30);
+             rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+             rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+             rinvsq30         = _fjsp_mul_v2r8(rinv30,rinv30);
+             /* Load parameters for j particles */
+             jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
+             vdwjidx0A        = 2*vdwtype[jnrA+0];
+             fjx0             = _fjsp_setzero_v2r8();
+             fjy0             = _fjsp_setzero_v2r8();
+             fjz0             = _fjsp_setzero_v2r8();
+             /**************************
+              * CALCULATE INTERACTIONS *
+              * i site 0 - j: VdW only *
+              **************************/
+             r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+             /* Compute parameters for interactions between i and j atoms */
+             gmx_fjsp_load_1pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
+             /* Calculate table index by multiplying r with table scale and truncate to integer.
+              * Only vfconv.i[0] is used for table loads in this single-particle block.
+              */
+             rt               = _fjsp_mul_v2r8(r00,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 8;
+             vfconv.i[1]     *= 8;
+             /* CUBIC SPLINE TABLE DISPERSION (zero vectors stand in for the absent B entry) */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 2 );
+             H                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+             VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+             vvdw6            = _fjsp_mul_v2r8(c6_00,VV);
+             FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+             fvdw6            = _fjsp_mul_v2r8(c6_00,FF);
+             /* CUBIC SPLINE TABLE REPULSION (zero vectors stand in for the absent B entry) */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 4 );
+             F                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 6 );
+             H                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+             VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+             vvdw12           = _fjsp_mul_v2r8(c12_00,VV);
+             FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+             fvdw12           = _fjsp_mul_v2r8(c12_00,FF);
+             vvdw             = _fjsp_add_v2r8(vvdw12,vvdw6);
+             fvdw             = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_add_v2r8(fvdw6,fvdw12),_fjsp_mul_v2r8(vftabscale,rinv00)));
+             /* Update potential sum for this i atom from the interaction with this j atom.
+              * The _fjsp_unpacklo_v2r8 with a zero vector presumably keeps the low element and
+              * clears the high one, so the unused lane contributes nothing - TODO confirm
+              * against the intrinsic's documentation.
+              */
+             vvdw             = _fjsp_unpacklo_v2r8(vvdw,_fjsp_setzero_v2r8());
+             vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+             fscal            = fvdw;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             
+             fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              * i site 1 - j: RF only  *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq10,rcutoff2))
+             {
+             /* Compute parameters for interactions between i and j atoms */
+             qq10             = _fjsp_mul_v2r8(iq1,jq0);
+             /* REACTION-FIELD ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq10,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq10,rinv10),crf));
+             felec            = _fjsp_mul_v2r8(qq10,_fjsp_msub_v2r8(rinv10,rinvsq10,krf2));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq10,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+             
+             fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              * i site 2 - j: RF only  *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq20,rcutoff2))
+             {
+             /* Compute parameters for interactions between i and j atoms */
+             qq20             = _fjsp_mul_v2r8(iq2,jq0);
+             /* REACTION-FIELD ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq20,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq20,rinv20),crf));
+             felec            = _fjsp_mul_v2r8(qq20,_fjsp_msub_v2r8(rinv20,rinvsq20,krf2));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq20,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+             
+             fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              * i site 3 - j: RF only  *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq30,rcutoff2))
+             {
+             /* Compute parameters for interactions between i and j atoms */
+             qq30             = _fjsp_mul_v2r8(iq3,jq0);
+             /* REACTION-FIELD ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq30,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq30,rinv30),crf));
+             felec            = _fjsp_mul_v2r8(qq30,_fjsp_msub_v2r8(rinv30,rinvsq30,krf2));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq30,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix3             = _fjsp_madd_v2r8(dx30,fscal,fix3);
+             fiy3             = _fjsp_madd_v2r8(dy30,fscal,fiy3);
+             fiz3             = _fjsp_madd_v2r8(dz30,fscal,fiz3);
+             
+             fjx0             = _fjsp_madd_v2r8(dx30,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy30,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz30,fscal,fjz0);
+             }
+             gmx_fjsp_decrement_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0);
+             /* Inner loop uses 179 flops */
+         }
+         /* End of innermost loop */
+         gmx_fjsp_update_iforce_4atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                               f+i_coord_offset,fshift+i_shift_offset);
+         ggid                        = gid[iidx];
+         /* Update potential energies: the sums go to the energy-group slots selected by gid[iidx] */
+         gmx_fjsp_update_1pot_v2r8(velecsum,kernel_data->energygrp_elec+ggid);
+         gmx_fjsp_update_1pot_v2r8(vvdwsum,kernel_data->energygrp_vdw+ggid);
+         /* Increment number of inner iterations */
+         inneriter                  += j_index_end - j_index_start;
+         /* Outer loop uses 26 flops */
+     }
+     /* Increment number of outer iterations */
+     outeriter        += nri;
+     /* Update outer/inner flops, using the per-iteration counts quoted in the loop comments */
+     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_VF,outeriter*26 + inneriter*179);
+ }
+ /*
+  * Gromacs nonbonded kernel:   nb_kernel_ElecRFCut_VdwCSTab_GeomW4P1_F_sparc64_hpc_ace_double
+  * Electrostatics interaction: ReactionField
+  * VdW interaction:            CubicSplineTable
+  * Geometry:                   Water4-Particle
+  * Calculate force/pot:        Force
+  *
+  * Force-only kernel. For every i entry of the neighbor list it loads four
+  * i sites, pairs them with each j particle listed for that entry, and
+  * accumulates forces. Site 0 interacts with j only through the tabulated
+  * VdW term; sites 1-3 interact with j only through reaction-field
+  * electrostatics, masked by the squared cutoff. No potential energies are
+  * accumulated in this kernel.
+  *
+  * Parameters (the fields listed are the ones this kernel reads):
+  *   nlist       - neighbor list: nri, iinr, jindex, jjnr, shift, gid
+  *   xx          - coordinates, accessed as a flat real array through xx[0]
+  *   ff          - forces, accessed as a flat real array through ff[0] and
+  *                 handed to the j-force decrement and i-force update helpers
+  *   fr          - shift_vec, fshift, epsfac, ic->k_rf, ic->c_rf, ntype, nbfp,
+  *                 rcoulomb
+  *   mdatoms     - chargeA and typeA
+  *   kernel_data - table_vdw->data and table_vdw->scale
+  *   nrnb        - flop counters, incremented once at the end via inc_nrnb()
+  */
+ void
+ nb_kernel_ElecRFCut_VdwCSTab_GeomW4P1_F_sparc64_hpc_ace_double
+                     (t_nblist * gmx_restrict                nlist,
+                      rvec * gmx_restrict                    xx,
+                      rvec * gmx_restrict                    ff,
+                      t_forcerec * gmx_restrict              fr,
+                      t_mdatoms * gmx_restrict               mdatoms,
+                      nb_kernel_data_t * gmx_restrict        kernel_data,
+                      t_nrnb * gmx_restrict                  nrnb)
+ {
+     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+      * just 0 for non-waters.
+      * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+      * jnr indices corresponding to data put in the two positions in the SIMD register.
+      *
+      * Not every variable declared below is referenced by this force-only kernel
+      * (for example tx, ty, tz, jidxall, ggid and the vvdw* potential terms are
+      * never used); the declarations look like shared generator boilerplate.
+      */
+     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+     int              jnrA,jnrB;
+     int              j_coord_offsetA,j_coord_offsetB;
+     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+     real             rcutoff_scalar;
+     real             *shiftvec,*fshift,*x,*f;
+     _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+     int              vdwioffset0;
+     _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+     int              vdwioffset1;
+     _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+     int              vdwioffset2;
+     _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+     int              vdwioffset3;
+     _fjsp_v2r8       ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+     int              vdwjidx0A,vdwjidx0B;
+     _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+     _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+     _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+     _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+     _fjsp_v2r8       dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30;
+     _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+     real             *charge;
+     int              nvdwtype;
+     _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+     int              *vdwtype;
+     real             *vdwparam;
+     _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+     _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+     _fjsp_v2r8       rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF,twovfeps;
+     real             *vftab;
+     _fjsp_v2r8       itab_tmp;
+     _fjsp_v2r8       dummy_mask,cutoff_mask;
+     _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+     _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+     union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+     x                = xx[0];
+     f                = ff[0];
+     nri              = nlist->nri;
+     iinr             = nlist->iinr;
+     jindex           = nlist->jindex;
+     jjnr             = nlist->jjnr;
+     shiftidx         = nlist->shift;
+     gid              = nlist->gid;
+     shiftvec         = fr->shift_vec[0];
+     fshift           = fr->fshift[0];
+     facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+     charge           = mdatoms->chargeA;
+     krf              = gmx_fjsp_set1_v2r8(fr->ic->k_rf);
+     krf2             = gmx_fjsp_set1_v2r8(fr->ic->k_rf*2.0);
+     crf              = gmx_fjsp_set1_v2r8(fr->ic->c_rf);
+     nvdwtype         = fr->ntype;
+     vdwparam         = fr->nbfp;
+     vdwtype          = mdatoms->typeA;
+     vftab            = kernel_data->table_vdw->data;
+     vftabscale       = gmx_fjsp_set1_v2r8(kernel_data->table_vdw->scale);
+     /* Setup water-specific parameters.
+      * The charges of sites 1-3 (pre-multiplied by facel) and the VdW parameter
+      * offset of site 0 are read once, from the first i entry of the list, and
+      * reused for every i entry.
+      * NOTE(review): this presumably relies on all i molecules in the list being
+      * the same water model, and on iinr[0] being readable even when nri is 0;
+      * confirm against the caller / list builder.
+      */
+     inr              = nlist->iinr[0];
+     iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+     iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+     iq3              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+3]));
+     vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+     /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+     rcutoff_scalar   = fr->rcoulomb;
+     rcutoff          = gmx_fjsp_set1_v2r8(rcutoff_scalar);
+     rcutoff2         = _fjsp_mul_v2r8(rcutoff,rcutoff);
+     /* Give the j indices and offsets defined values up front so the compiler
+      * does not warn about possibly uninitialized use */
+     jnrA = jnrB = 0;
+     j_coord_offsetA = 0;
+     j_coord_offsetB = 0;
+     outeriter        = 0;
+     inneriter        = 0;
+     /* Start outer loop over neighborlists (one i entry, i.e. four i sites, per iteration) */
+     for(iidx=0; iidx<nri; iidx++)
+     {
+         /* Load shift vector for this list */
+         i_shift_offset   = DIM*shiftidx[iidx];
+         /* Load limits for loop over neighbors */
+         j_index_start    = jindex[iidx];
+         j_index_end      = jindex[iidx+1];
+         /* Get outer coordinate index */
+         inr              = iinr[iidx];
+         i_coord_offset   = DIM*inr;
+         /* Load i particle coords and add shift vector */
+         gmx_fjsp_load_shift_and_4rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                  &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+         fix0             = _fjsp_setzero_v2r8();
+         fiy0             = _fjsp_setzero_v2r8();
+         fiz0             = _fjsp_setzero_v2r8();
+         fix1             = _fjsp_setzero_v2r8();
+         fiy1             = _fjsp_setzero_v2r8();
+         fiz1             = _fjsp_setzero_v2r8();
+         fix2             = _fjsp_setzero_v2r8();
+         fiy2             = _fjsp_setzero_v2r8();
+         fiz2             = _fjsp_setzero_v2r8();
+         fix3             = _fjsp_setzero_v2r8();
+         fiy3             = _fjsp_setzero_v2r8();
+         fiz3             = _fjsp_setzero_v2r8();
+         /* Start inner kernel loop.
+          * j particles are handled in pairs (A,B), one per SIMD element; the loop
+          * stops before a possible odd leftover j, which is handled after the loop.
+          */
+         for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+         {
+             /* Get j neighbor index, and coordinate index */
+             jnrA             = jjnr[jidx];
+             jnrB             = jjnr[jidx+1];
+             j_coord_offsetA  = DIM*jnrA;
+             j_coord_offsetB  = DIM*jnrB;
+             /* load j atom coordinates */
+             gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                               &jx0,&jy0,&jz0);
+             /* Calculate displacement vector */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             dx10             = _fjsp_sub_v2r8(ix1,jx0);
+             dy10             = _fjsp_sub_v2r8(iy1,jy0);
+             dz10             = _fjsp_sub_v2r8(iz1,jz0);
+             dx20             = _fjsp_sub_v2r8(ix2,jx0);
+             dy20             = _fjsp_sub_v2r8(iy2,jy0);
+             dz20             = _fjsp_sub_v2r8(iz2,jz0);
+             dx30             = _fjsp_sub_v2r8(ix3,jx0);
+             dy30             = _fjsp_sub_v2r8(iy3,jy0);
+             dz30             = _fjsp_sub_v2r8(iz3,jz0);
+             /* Calculate squared distance and things based on it */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+             rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+             rsq30            = gmx_fjsp_calc_rsq_v2r8(dx30,dy30,dz30);
+             rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+             rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+             rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+             rinv30           = gmx_fjsp_invsqrt_v2r8(rsq30);
+             rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+             rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+             rinvsq30         = _fjsp_mul_v2r8(rinv30,rinv30);
+             /* Load parameters for j particles */
+             jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+             vdwjidx0A        = 2*vdwtype[jnrA+0];
+             vdwjidx0B        = 2*vdwtype[jnrB+0];
+             fjx0             = _fjsp_setzero_v2r8();
+             fjy0             = _fjsp_setzero_v2r8();
+             fjz0             = _fjsp_setzero_v2r8();
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+             /* Compute parameters for interactions between i and j atoms */
+             gmx_fjsp_load_2pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,
+                                          vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
+             /* Calculate table index by multiplying r with table scale and truncate to integer.
+              * vfeps is the remainder rt minus the converted index. The index is
+              * scaled by 8 below: dispersion data is read at offsets 0..3 and
+              * repulsion data at offsets 4..7 of each table point.
+              */
+             rt               = _fjsp_mul_v2r8(r00,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 8;
+             vfconv.i[1]     *= 8;
+             /* CUBIC SPLINE TABLE DISPERSION.
+              * Only F, G and H enter the force; Y is loaded and transposed but not
+              * used afterwards. Assuming _fjsp_madd_v2r8(a,b,c) computes a*b+c:
+              * Fp = F + eps*(G + eps*H) and FF = Fp + eps*(G + 2*eps*H).
+              */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 2 );
+             H                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 2 );
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+             FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+             fvdw6            = _fjsp_mul_v2r8(c6_00,FF);
+             /* CUBIC SPLINE TABLE REPULSION (same evaluation, table offsets 4..7) */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 4 );
+             F                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 4 );
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 6 );
+             H                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 6 );
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+             FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+             fvdw12           = _fjsp_mul_v2r8(c12_00,FF);
+             fvdw             = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_add_v2r8(fvdw6,fvdw12),_fjsp_mul_v2r8(vftabscale,rinv00)));
+             fscal            = fvdw;
+             /* Update vectorial force: the same d*fscal term goes into the i-site
+              * accumulator and into the j accumulator, which is handed to the
+              * decrement helper at the end of this iteration */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             
+             fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq10,rcutoff2))
+             {
+             /* Compute parameters for interactions between i and j atoms */
+             qq10             = _fjsp_mul_v2r8(iq1,jq0);
+             /* REACTION-FIELD ELECTROSTATICS.
+              * Assuming _fjsp_msub_v2r8(a,b,c) computes a*b-c, this is
+              * felec = qq*(rinv*rinvsq - krf2). The result is then AND-ed with
+              * cutoff_mask, presumably zeroing elements at or beyond the cutoff.
+              */
+             felec            = _fjsp_mul_v2r8(qq10,_fjsp_msub_v2r8(rinv10,rinvsq10,krf2));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq10,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+             
+             fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq20,rcutoff2))
+             {
+             /* Compute parameters for interactions between i and j atoms */
+             qq20             = _fjsp_mul_v2r8(iq2,jq0);
+             /* REACTION-FIELD ELECTROSTATICS */
+             felec            = _fjsp_mul_v2r8(qq20,_fjsp_msub_v2r8(rinv20,rinvsq20,krf2));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq20,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+             
+             fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq30,rcutoff2))
+             {
+             /* Compute parameters for interactions between i and j atoms */
+             qq30             = _fjsp_mul_v2r8(iq3,jq0);
+             /* REACTION-FIELD ELECTROSTATICS */
+             felec            = _fjsp_mul_v2r8(qq30,_fjsp_msub_v2r8(rinv30,rinvsq30,krf2));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq30,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix3             = _fjsp_madd_v2r8(dx30,fscal,fix3);
+             fiy3             = _fjsp_madd_v2r8(dy30,fscal,fiy3);
+             fiz3             = _fjsp_madd_v2r8(dz30,fscal,fiz3);
+             
+             fjx0             = _fjsp_madd_v2r8(dx30,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy30,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz30,fscal,fjz0);
+             }
+             gmx_fjsp_decrement_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0);
+             /* Inner loop uses 153 flops */
+         }
+         if(jidx<j_index_end) /* odd number of j particles: one j (A) is left over */
+         {
+             jnrA             = jjnr[jidx];
+             j_coord_offsetA  = DIM*jnrA;
+             /* load j atom coordinates */
+             gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                               &jx0,&jy0,&jz0);
+             /* Calculate displacement vector */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             dx10             = _fjsp_sub_v2r8(ix1,jx0);
+             dy10             = _fjsp_sub_v2r8(iy1,jy0);
+             dz10             = _fjsp_sub_v2r8(iz1,jz0);
+             dx20             = _fjsp_sub_v2r8(ix2,jx0);
+             dy20             = _fjsp_sub_v2r8(iy2,jy0);
+             dz20             = _fjsp_sub_v2r8(iz2,jz0);
+             dx30             = _fjsp_sub_v2r8(ix3,jx0);
+             dy30             = _fjsp_sub_v2r8(iy3,jy0);
+             dz30             = _fjsp_sub_v2r8(iz3,jz0);
+             /* Calculate squared distance and things based on it */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+             rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+             rsq30            = gmx_fjsp_calc_rsq_v2r8(dx30,dy30,dz30);
+             rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+             rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+             rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+             rinv30           = gmx_fjsp_invsqrt_v2r8(rsq30);
+             rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+             rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+             rinvsq30         = _fjsp_mul_v2r8(rinv30,rinv30);
+             /* Load parameters for j particles.
+              * Only one real j particle exists here; the charge is loaded into a
+              * zeroed register, presumably leaving the other element at zero.
+              */
+             jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
+             vdwjidx0A        = 2*vdwtype[jnrA+0];
+             fjx0             = _fjsp_setzero_v2r8();
+             fjy0             = _fjsp_setzero_v2r8();
+             fjz0             = _fjsp_setzero_v2r8();
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+             /* Compute parameters for interactions between i and j atoms */
+             gmx_fjsp_load_1pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
+             /* Calculate table index by multiplying r with table scale and truncate to integer */
+             rt               = _fjsp_mul_v2r8(r00,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 8;
+             vfconv.i[1]     *= 8;
+             /* CUBIC SPLINE TABLE DISPERSION.
+              * Only vfconv.i[0] is used for table loads in this single-j path; the
+              * second operand of each transpose is a zero register instead.
+              */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 2 );
+             H                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+             FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+             fvdw6            = _fjsp_mul_v2r8(c6_00,FF);
+             /* CUBIC SPLINE TABLE REPULSION */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 4 );
+             F                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 6 );
+             H                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+             FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+             fvdw12           = _fjsp_mul_v2r8(c12_00,FF);
+             fvdw             = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_add_v2r8(fvdw6,fvdw12),_fjsp_mul_v2r8(vftabscale,rinv00)));
+             fscal            = fvdw;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8()); /* presumably clears the element that has no j particle - TODO confirm */
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             
+             fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq10,rcutoff2))
+             {
+             /* Compute parameters for interactions between i and j atoms */
+             qq10             = _fjsp_mul_v2r8(iq1,jq0);
+             /* REACTION-FIELD ELECTROSTATICS */
+             felec            = _fjsp_mul_v2r8(qq10,_fjsp_msub_v2r8(rinv10,rinvsq10,krf2));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq10,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+             
+             fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq20,rcutoff2))
+             {
+             /* Compute parameters for interactions between i and j atoms */
+             qq20             = _fjsp_mul_v2r8(iq2,jq0);
+             /* REACTION-FIELD ELECTROSTATICS */
+             felec            = _fjsp_mul_v2r8(qq20,_fjsp_msub_v2r8(rinv20,rinvsq20,krf2));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq20,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+             
+             fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq30,rcutoff2))
+             {
+             /* Compute parameters for interactions between i and j atoms */
+             qq30             = _fjsp_mul_v2r8(iq3,jq0);
+             /* REACTION-FIELD ELECTROSTATICS */
+             felec            = _fjsp_mul_v2r8(qq30,_fjsp_msub_v2r8(rinv30,rinvsq30,krf2));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq30,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix3             = _fjsp_madd_v2r8(dx30,fscal,fix3);
+             fiy3             = _fjsp_madd_v2r8(dy30,fscal,fiy3);
+             fiz3             = _fjsp_madd_v2r8(dz30,fscal,fiz3);
+             
+             fjx0             = _fjsp_madd_v2r8(dx30,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy30,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz30,fscal,fjz0);
+             }
+             gmx_fjsp_decrement_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0);
+             /* Inner loop uses 153 flops */
+         }
+         /* End of innermost loop: hand the accumulated forces of the four i sites
+          * to the update helper, together with the force and shift-force
+          * destinations for this i entry */
+         gmx_fjsp_update_iforce_4atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                               f+i_coord_offset,fshift+i_shift_offset);
+         /* Increment number of inner iterations (counted per j particle, not per loop trip) */
+         inneriter                  += j_index_end - j_index_start;
+         /* Outer loop uses 24 flops */
+     }
+     /* Increment number of outer iterations */
+     outeriter        += nri;
+     /* Update outer/inner flops: 24 per i entry plus 153 per j particle,
+      * booked under eNR_NBKERNEL_ELEC_VDW_W4_F */
+     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_F,outeriter*24 + inneriter*153);
+ }
index 0000000000000000000000000000000000000000,99a176c3b5e0e3124ba489e4812482dc4a843527..99a176c3b5e0e3124ba489e4812482dc4a843527
mode 000000,100644..100644
--- /dev/null
@@@ -1,0 -1,2131 +1,2131 @@@
+ /*
+  * This file is part of the GROMACS molecular simulation package.
+  *
+  * Copyright (c) 2012, by the GROMACS development team, led by
+  * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+  * others, as listed in the AUTHORS file in the top-level source
+  * directory and at http://www.gromacs.org.
+  *
+  * GROMACS is free software; you can redistribute it and/or
+  * modify it under the terms of the GNU Lesser General Public License
+  * as published by the Free Software Foundation; either version 2.1
+  * of the License, or (at your option) any later version.
+  *
+  * GROMACS is distributed in the hope that it will be useful,
+  * but WITHOUT ANY WARRANTY; without even the implied warranty of
+  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  * Lesser General Public License for more details.
+  *
+  * You should have received a copy of the GNU Lesser General Public
+  * License along with GROMACS; if not, see
+  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+  *
+  * If you want to redistribute modifications to GROMACS, please
+  * consider that scientific software is very special. Version
+  * control is crucial - bugs must be traceable. We will be happy to
+  * consider code for inclusion in the official distribution, but
+  * derived work must not be called official GROMACS. Details are found
+  * in the README & COPYING files - if they are missing, get the
+  * official version at http://www.gromacs.org.
+  *
+  * To help us fund GROMACS development, we humbly ask that you cite
+  * the research papers on the package. Check out http://www.gromacs.org.
+  */
+ /*
+  * Note: this file was generated by the GROMACS sparc64_hpc_ace_double kernel generator.
+  */
+ #ifdef HAVE_CONFIG_H
+ #include <config.h>
+ #endif
+ #include <math.h>
+ #include "../nb_kernel.h"
+ #include "types/simple.h"
+ #include "vec.h"
+ #include "nrnb.h"
+ #include "kernelutil_sparc64_hpc_ace_double.h"
+ /*
+  * Gromacs nonbonded kernel:   nb_kernel_ElecRFCut_VdwCSTab_GeomW4W4_VF_sparc64_hpc_ace_double
+  * Electrostatics interaction: ReactionField
+  * VdW interaction:            CubicSplineTable
+  * Geometry:                   Water4-Water4
+  * Calculate force/pot:        PotentialAndForce
+  */
+ void
+ nb_kernel_ElecRFCut_VdwCSTab_GeomW4W4_VF_sparc64_hpc_ace_double
+                     (t_nblist * gmx_restrict                nlist,
+                      rvec * gmx_restrict                    xx,
+                      rvec * gmx_restrict                    ff,
+                      t_forcerec * gmx_restrict              fr,
+                      t_mdatoms * gmx_restrict               mdatoms,
+                      nb_kernel_data_t * gmx_restrict        kernel_data,
+                      t_nrnb * gmx_restrict                  nrnb)
+ {
+     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+      * just 0 for non-waters.
+      * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+      * jnr indices corresponding to data put in the four positions in the SIMD register.
+      */
+     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+     int              jnrA,jnrB;
+     int              j_coord_offsetA,j_coord_offsetB;
+     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+     real             rcutoff_scalar;
+     real             *shiftvec,*fshift,*x,*f;
+     _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+     int              vdwioffset0;
+     _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+     int              vdwioffset1;
+     _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+     int              vdwioffset2;
+     _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+     int              vdwioffset3;
+     _fjsp_v2r8       ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+     int              vdwjidx0A,vdwjidx0B;
+     _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+     int              vdwjidx1A,vdwjidx1B;
+     _fjsp_v2r8       jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+     int              vdwjidx2A,vdwjidx2B;
+     _fjsp_v2r8       jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+     int              vdwjidx3A,vdwjidx3B;
+     _fjsp_v2r8       jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
+     _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+     _fjsp_v2r8       dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+     _fjsp_v2r8       dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+     _fjsp_v2r8       dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
+     _fjsp_v2r8       dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+     _fjsp_v2r8       dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+     _fjsp_v2r8       dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
+     _fjsp_v2r8       dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
+     _fjsp_v2r8       dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
+     _fjsp_v2r8       dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
+     _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+     real             *charge;
+     int              nvdwtype;
+     _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+     int              *vdwtype;
+     real             *vdwparam;
+     _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+     _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+     _fjsp_v2r8       rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF,twovfeps;
+     real             *vftab;
+     _fjsp_v2r8       itab_tmp;
+     _fjsp_v2r8       dummy_mask,cutoff_mask;
+     _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+     _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+     union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+     x                = xx[0];
+     f                = ff[0];
+     nri              = nlist->nri;
+     iinr             = nlist->iinr;
+     jindex           = nlist->jindex;
+     jjnr             = nlist->jjnr;
+     shiftidx         = nlist->shift;
+     gid              = nlist->gid;
+     shiftvec         = fr->shift_vec[0];
+     fshift           = fr->fshift[0];
+     facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+     charge           = mdatoms->chargeA;
+     krf              = gmx_fjsp_set1_v2r8(fr->ic->k_rf);
+     krf2             = gmx_fjsp_set1_v2r8(fr->ic->k_rf*2.0);
+     crf              = gmx_fjsp_set1_v2r8(fr->ic->c_rf);
+     nvdwtype         = fr->ntype;
+     vdwparam         = fr->nbfp;
+     vdwtype          = mdatoms->typeA;
+     vftab            = kernel_data->table_vdw->data;
+     vftabscale       = gmx_fjsp_set1_v2r8(kernel_data->table_vdw->scale);
+     /* Setup water-specific parameters */
+     inr              = nlist->iinr[0];
+     iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+     iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+     iq3              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+3]));
+     vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+     jq1              = gmx_fjsp_set1_v2r8(charge[inr+1]);
+     jq2              = gmx_fjsp_set1_v2r8(charge[inr+2]);
+     jq3              = gmx_fjsp_set1_v2r8(charge[inr+3]);
+     vdwjidx0A        = 2*vdwtype[inr+0];
+     c6_00            = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset0+vdwjidx0A]);
+     c12_00           = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset0+vdwjidx0A+1]);
+     qq11             = _fjsp_mul_v2r8(iq1,jq1);
+     qq12             = _fjsp_mul_v2r8(iq1,jq2);
+     qq13             = _fjsp_mul_v2r8(iq1,jq3);
+     qq21             = _fjsp_mul_v2r8(iq2,jq1);
+     qq22             = _fjsp_mul_v2r8(iq2,jq2);
+     qq23             = _fjsp_mul_v2r8(iq2,jq3);
+     qq31             = _fjsp_mul_v2r8(iq3,jq1);
+     qq32             = _fjsp_mul_v2r8(iq3,jq2);
+     qq33             = _fjsp_mul_v2r8(iq3,jq3);
+     /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+     rcutoff_scalar   = fr->rcoulomb;
+     rcutoff          = gmx_fjsp_set1_v2r8(rcutoff_scalar);
+     rcutoff2         = _fjsp_mul_v2r8(rcutoff,rcutoff);
+     /* Avoid stupid compiler warnings */
+     jnrA = jnrB = 0;
+     j_coord_offsetA = 0;
+     j_coord_offsetB = 0;
+     outeriter        = 0;
+     inneriter        = 0;
+     /* Start outer loop over neighborlists */
+     for(iidx=0; iidx<nri; iidx++)
+     {
+         /* Load shift vector for this list */
+         i_shift_offset   = DIM*shiftidx[iidx];
+         /* Load limits for loop over neighbors */
+         j_index_start    = jindex[iidx];
+         j_index_end      = jindex[iidx+1];
+         /* Get outer coordinate index */
+         inr              = iinr[iidx];
+         i_coord_offset   = DIM*inr;
+         /* Load i particle coords and add shift vector */
+         gmx_fjsp_load_shift_and_4rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                  &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+         fix0             = _fjsp_setzero_v2r8();
+         fiy0             = _fjsp_setzero_v2r8();
+         fiz0             = _fjsp_setzero_v2r8();
+         fix1             = _fjsp_setzero_v2r8();
+         fiy1             = _fjsp_setzero_v2r8();
+         fiz1             = _fjsp_setzero_v2r8();
+         fix2             = _fjsp_setzero_v2r8();
+         fiy2             = _fjsp_setzero_v2r8();
+         fiz2             = _fjsp_setzero_v2r8();
+         fix3             = _fjsp_setzero_v2r8();
+         fiy3             = _fjsp_setzero_v2r8();
+         fiz3             = _fjsp_setzero_v2r8();
+         /* Reset potential sums */
+         velecsum         = _fjsp_setzero_v2r8();
+         vvdwsum          = _fjsp_setzero_v2r8();
+         /* Start inner kernel loop */
+         for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+         {
+             /* Get j neighbor index, and coordinate index */
+             jnrA             = jjnr[jidx];
+             jnrB             = jjnr[jidx+1];
+             j_coord_offsetA  = DIM*jnrA;
+             j_coord_offsetB  = DIM*jnrB;
+             /* load j atom coordinates */
+             gmx_fjsp_load_4rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
+                                               &jy2,&jz2,&jx3,&jy3,&jz3);
+             /* Calculate displacement vector */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             dx11             = _fjsp_sub_v2r8(ix1,jx1);
+             dy11             = _fjsp_sub_v2r8(iy1,jy1);
+             dz11             = _fjsp_sub_v2r8(iz1,jz1);
+             dx12             = _fjsp_sub_v2r8(ix1,jx2);
+             dy12             = _fjsp_sub_v2r8(iy1,jy2);
+             dz12             = _fjsp_sub_v2r8(iz1,jz2);
+             dx13             = _fjsp_sub_v2r8(ix1,jx3);
+             dy13             = _fjsp_sub_v2r8(iy1,jy3);
+             dz13             = _fjsp_sub_v2r8(iz1,jz3);
+             dx21             = _fjsp_sub_v2r8(ix2,jx1);
+             dy21             = _fjsp_sub_v2r8(iy2,jy1);
+             dz21             = _fjsp_sub_v2r8(iz2,jz1);
+             dx22             = _fjsp_sub_v2r8(ix2,jx2);
+             dy22             = _fjsp_sub_v2r8(iy2,jy2);
+             dz22             = _fjsp_sub_v2r8(iz2,jz2);
+             dx23             = _fjsp_sub_v2r8(ix2,jx3);
+             dy23             = _fjsp_sub_v2r8(iy2,jy3);
+             dz23             = _fjsp_sub_v2r8(iz2,jz3);
+             dx31             = _fjsp_sub_v2r8(ix3,jx1);
+             dy31             = _fjsp_sub_v2r8(iy3,jy1);
+             dz31             = _fjsp_sub_v2r8(iz3,jz1);
+             dx32             = _fjsp_sub_v2r8(ix3,jx2);
+             dy32             = _fjsp_sub_v2r8(iy3,jy2);
+             dz32             = _fjsp_sub_v2r8(iz3,jz2);
+             dx33             = _fjsp_sub_v2r8(ix3,jx3);
+             dy33             = _fjsp_sub_v2r8(iy3,jy3);
+             dz33             = _fjsp_sub_v2r8(iz3,jz3);
+             /* Calculate squared distance and things based on it */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+             rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+             rsq13            = gmx_fjsp_calc_rsq_v2r8(dx13,dy13,dz13);
+             rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+             rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+             rsq23            = gmx_fjsp_calc_rsq_v2r8(dx23,dy23,dz23);
+             rsq31            = gmx_fjsp_calc_rsq_v2r8(dx31,dy31,dz31);
+             rsq32            = gmx_fjsp_calc_rsq_v2r8(dx32,dy32,dz32);
+             rsq33            = gmx_fjsp_calc_rsq_v2r8(dx33,dy33,dz33);
+             rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+             rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+             rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+             rinv13           = gmx_fjsp_invsqrt_v2r8(rsq13);
+             rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+             rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+             rinv23           = gmx_fjsp_invsqrt_v2r8(rsq23);
+             rinv31           = gmx_fjsp_invsqrt_v2r8(rsq31);
+             rinv32           = gmx_fjsp_invsqrt_v2r8(rsq32);
+             rinv33           = gmx_fjsp_invsqrt_v2r8(rsq33);
+             rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+             rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+             rinvsq13         = _fjsp_mul_v2r8(rinv13,rinv13);
+             rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+             rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+             rinvsq23         = _fjsp_mul_v2r8(rinv23,rinv23);
+             rinvsq31         = _fjsp_mul_v2r8(rinv31,rinv31);
+             rinvsq32         = _fjsp_mul_v2r8(rinv32,rinv32);
+             rinvsq33         = _fjsp_mul_v2r8(rinv33,rinv33);
+             fjx0             = _fjsp_setzero_v2r8();
+             fjy0             = _fjsp_setzero_v2r8();
+             fjz0             = _fjsp_setzero_v2r8();
+             fjx1             = _fjsp_setzero_v2r8();
+             fjy1             = _fjsp_setzero_v2r8();
+             fjz1             = _fjsp_setzero_v2r8();
+             fjx2             = _fjsp_setzero_v2r8();
+             fjy2             = _fjsp_setzero_v2r8();
+             fjz2             = _fjsp_setzero_v2r8();
+             fjx3             = _fjsp_setzero_v2r8();
+             fjy3             = _fjsp_setzero_v2r8();
+             fjz3             = _fjsp_setzero_v2r8();
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+             /* Calculate table index by multiplying r with table scale and truncate to integer */
+             rt               = _fjsp_mul_v2r8(r00,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 8;
+             vfconv.i[1]     *= 8;
+             /* CUBIC SPLINE TABLE DISPERSION */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 2 );
+             H                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 2 );
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+             VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+             vvdw6            = _fjsp_mul_v2r8(c6_00,VV);
+             FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+             fvdw6            = _fjsp_mul_v2r8(c6_00,FF);
+             /* CUBIC SPLINE TABLE REPULSION */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 4 );
+             F                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 4 );
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 6 );
+             H                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 6 );
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+             VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+             vvdw12           = _fjsp_mul_v2r8(c12_00,VV);
+             FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+             fvdw12           = _fjsp_mul_v2r8(c12_00,FF);
+             vvdw             = _fjsp_add_v2r8(vvdw12,vvdw6);
+             fvdw             = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_add_v2r8(fvdw6,fvdw12),_fjsp_mul_v2r8(vftabscale,rinv00)));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+             fscal            = fvdw;
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             
+             fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq11,rcutoff2))
+             {
+             /* REACTION-FIELD ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq11,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq11,rinv11),crf));
+             felec            = _fjsp_mul_v2r8(qq11,_fjsp_msub_v2r8(rinv11,rinvsq11,krf2));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq11,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+             
+             fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq12,rcutoff2))
+             {
+             /* REACTION-FIELD ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq12,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq12,rinv12),crf));
+             felec            = _fjsp_mul_v2r8(qq12,_fjsp_msub_v2r8(rinv12,rinvsq12,krf2));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq12,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+             
+             fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq13,rcutoff2))
+             {
+             /* REACTION-FIELD ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq13,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq13,rinv13),crf));
+             felec            = _fjsp_mul_v2r8(qq13,_fjsp_msub_v2r8(rinv13,rinvsq13,krf2));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq13,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx13,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy13,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz13,fscal,fiz1);
+             
+             fjx3             = _fjsp_madd_v2r8(dx13,fscal,fjx3);
+             fjy3             = _fjsp_madd_v2r8(dy13,fscal,fjy3);
+             fjz3             = _fjsp_madd_v2r8(dz13,fscal,fjz3);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq21,rcutoff2))
+             {
+             /* REACTION-FIELD ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq21,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq21,rinv21),crf));
+             felec            = _fjsp_mul_v2r8(qq21,_fjsp_msub_v2r8(rinv21,rinvsq21,krf2));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq21,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+             
+             fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq22,rcutoff2))
+             {
+             /* REACTION-FIELD ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq22,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq22,rinv22),crf));
+             felec            = _fjsp_mul_v2r8(qq22,_fjsp_msub_v2r8(rinv22,rinvsq22,krf2));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq22,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+             
+             fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq23,rcutoff2))
+             {
+             /* REACTION-FIELD ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq23,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq23,rinv23),crf));
+             felec            = _fjsp_mul_v2r8(qq23,_fjsp_msub_v2r8(rinv23,rinvsq23,krf2));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq23,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx23,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy23,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz23,fscal,fiz2);
+             
+             fjx3             = _fjsp_madd_v2r8(dx23,fscal,fjx3);
+             fjy3             = _fjsp_madd_v2r8(dy23,fscal,fjy3);
+             fjz3             = _fjsp_madd_v2r8(dz23,fscal,fjz3);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq31,rcutoff2))
+             {
+             /* REACTION-FIELD ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq31,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq31,rinv31),crf));
+             felec            = _fjsp_mul_v2r8(qq31,_fjsp_msub_v2r8(rinv31,rinvsq31,krf2));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq31,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix3             = _fjsp_madd_v2r8(dx31,fscal,fix3);
+             fiy3             = _fjsp_madd_v2r8(dy31,fscal,fiy3);
+             fiz3             = _fjsp_madd_v2r8(dz31,fscal,fiz3);
+             
+             fjx1             = _fjsp_madd_v2r8(dx31,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy31,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz31,fscal,fjz1);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq32,rcutoff2))
+             {
+             /* REACTION-FIELD ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq32,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq32,rinv32),crf));
+             felec            = _fjsp_mul_v2r8(qq32,_fjsp_msub_v2r8(rinv32,rinvsq32,krf2));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq32,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix3             = _fjsp_madd_v2r8(dx32,fscal,fix3);
+             fiy3             = _fjsp_madd_v2r8(dy32,fscal,fiy3);
+             fiz3             = _fjsp_madd_v2r8(dz32,fscal,fiz3);
+             
+             fjx2             = _fjsp_madd_v2r8(dx32,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy32,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz32,fscal,fjz2);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq33,rcutoff2))
+             {
+             /* REACTION-FIELD ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq33,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq33,rinv33),crf));
+             felec            = _fjsp_mul_v2r8(qq33,_fjsp_msub_v2r8(rinv33,rinvsq33,krf2));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq33,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix3             = _fjsp_madd_v2r8(dx33,fscal,fix3);
+             fiy3             = _fjsp_madd_v2r8(dy33,fscal,fiy3);
+             fiz3             = _fjsp_madd_v2r8(dz33,fscal,fiz3);
+             
+             fjx3             = _fjsp_madd_v2r8(dx33,fscal,fjx3);
+             fjy3             = _fjsp_madd_v2r8(dy33,fscal,fjy3);
+             fjz3             = _fjsp_madd_v2r8(dz33,fscal,fjz3);
+             }
+             gmx_fjsp_decrement_4rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+             /* Inner loop uses 413 flops */
+         }
+         if(jidx<j_index_end)
+         {
+             jnrA             = jjnr[jidx];
+             j_coord_offsetA  = DIM*jnrA;
+             /* load j atom coordinates */
+             gmx_fjsp_load_4rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
+                                               &jy2,&jz2,&jx3,&jy3,&jz3);
+             /* Calculate displacement vector */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             dx11             = _fjsp_sub_v2r8(ix1,jx1);
+             dy11             = _fjsp_sub_v2r8(iy1,jy1);
+             dz11             = _fjsp_sub_v2r8(iz1,jz1);
+             dx12             = _fjsp_sub_v2r8(ix1,jx2);
+             dy12             = _fjsp_sub_v2r8(iy1,jy2);
+             dz12             = _fjsp_sub_v2r8(iz1,jz2);
+             dx13             = _fjsp_sub_v2r8(ix1,jx3);
+             dy13             = _fjsp_sub_v2r8(iy1,jy3);
+             dz13             = _fjsp_sub_v2r8(iz1,jz3);
+             dx21             = _fjsp_sub_v2r8(ix2,jx1);
+             dy21             = _fjsp_sub_v2r8(iy2,jy1);
+             dz21             = _fjsp_sub_v2r8(iz2,jz1);
+             dx22             = _fjsp_sub_v2r8(ix2,jx2);
+             dy22             = _fjsp_sub_v2r8(iy2,jy2);
+             dz22             = _fjsp_sub_v2r8(iz2,jz2);
+             dx23             = _fjsp_sub_v2r8(ix2,jx3);
+             dy23             = _fjsp_sub_v2r8(iy2,jy3);
+             dz23             = _fjsp_sub_v2r8(iz2,jz3);
+             dx31             = _fjsp_sub_v2r8(ix3,jx1);
+             dy31             = _fjsp_sub_v2r8(iy3,jy1);
+             dz31             = _fjsp_sub_v2r8(iz3,jz1);
+             dx32             = _fjsp_sub_v2r8(ix3,jx2);
+             dy32             = _fjsp_sub_v2r8(iy3,jy2);
+             dz32             = _fjsp_sub_v2r8(iz3,jz2);
+             dx33             = _fjsp_sub_v2r8(ix3,jx3);
+             dy33             = _fjsp_sub_v2r8(iy3,jy3);
+             dz33             = _fjsp_sub_v2r8(iz3,jz3);
+             /* Calculate squared distance and things based on it */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+             rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+             rsq13            = gmx_fjsp_calc_rsq_v2r8(dx13,dy13,dz13);
+             rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+             rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+             rsq23            = gmx_fjsp_calc_rsq_v2r8(dx23,dy23,dz23);
+             rsq31            = gmx_fjsp_calc_rsq_v2r8(dx31,dy31,dz31);
+             rsq32            = gmx_fjsp_calc_rsq_v2r8(dx32,dy32,dz32);
+             rsq33            = gmx_fjsp_calc_rsq_v2r8(dx33,dy33,dz33);
+             rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+             rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+             rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+             rinv13           = gmx_fjsp_invsqrt_v2r8(rsq13);
+             rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+             rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+             rinv23           = gmx_fjsp_invsqrt_v2r8(rsq23);
+             rinv31           = gmx_fjsp_invsqrt_v2r8(rsq31);
+             rinv32           = gmx_fjsp_invsqrt_v2r8(rsq32);
+             rinv33           = gmx_fjsp_invsqrt_v2r8(rsq33);
+             rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+             rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+             rinvsq13         = _fjsp_mul_v2r8(rinv13,rinv13);
+             rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+             rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+             rinvsq23         = _fjsp_mul_v2r8(rinv23,rinv23);
+             rinvsq31         = _fjsp_mul_v2r8(rinv31,rinv31);
+             rinvsq32         = _fjsp_mul_v2r8(rinv32,rinv32);
+             rinvsq33         = _fjsp_mul_v2r8(rinv33,rinv33);
+             fjx0             = _fjsp_setzero_v2r8();
+             fjy0             = _fjsp_setzero_v2r8();
+             fjz0             = _fjsp_setzero_v2r8();
+             fjx1             = _fjsp_setzero_v2r8();
+             fjy1             = _fjsp_setzero_v2r8();
+             fjz1             = _fjsp_setzero_v2r8();
+             fjx2             = _fjsp_setzero_v2r8();
+             fjy2             = _fjsp_setzero_v2r8();
+             fjz2             = _fjsp_setzero_v2r8();
+             fjx3             = _fjsp_setzero_v2r8();
+             fjy3             = _fjsp_setzero_v2r8();
+             fjz3             = _fjsp_setzero_v2r8();
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+             /* Calculate table index by multiplying r with table scale and truncate to integer */
+             rt               = _fjsp_mul_v2r8(r00,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 8;
+             vfconv.i[1]     *= 8;
+             /* CUBIC SPLINE TABLE DISPERSION */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 2 );
+             H                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+             VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+             vvdw6            = _fjsp_mul_v2r8(c6_00,VV);
+             FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+             fvdw6            = _fjsp_mul_v2r8(c6_00,FF);
+             /* CUBIC SPLINE TABLE REPULSION */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 4 );
+             F                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 6 );
+             H                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+             VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+             vvdw12           = _fjsp_mul_v2r8(c12_00,VV);
+             FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+             fvdw12           = _fjsp_mul_v2r8(c12_00,FF);
+             vvdw             = _fjsp_add_v2r8(vvdw12,vvdw6);
+             fvdw             = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_add_v2r8(fvdw6,fvdw12),_fjsp_mul_v2r8(vftabscale,rinv00)));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             vvdw             = _fjsp_unpacklo_v2r8(vvdw,_fjsp_setzero_v2r8());
+             vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+             fscal            = fvdw;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             
+             fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq11,rcutoff2))
+             {
+             /* REACTION-FIELD ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq11,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq11,rinv11),crf));
+             felec            = _fjsp_mul_v2r8(qq11,_fjsp_msub_v2r8(rinv11,rinvsq11,krf2));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq11,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+             
+             fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq12,rcutoff2))
+             {
+             /* REACTION-FIELD ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq12,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq12,rinv12),crf));
+             felec            = _fjsp_mul_v2r8(qq12,_fjsp_msub_v2r8(rinv12,rinvsq12,krf2));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq12,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+             
+             fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq13,rcutoff2))
+             {
+             /* REACTION-FIELD ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq13,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq13,rinv13),crf));
+             felec            = _fjsp_mul_v2r8(qq13,_fjsp_msub_v2r8(rinv13,rinvsq13,krf2));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq13,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx13,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy13,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz13,fscal,fiz1);
+             
+             fjx3             = _fjsp_madd_v2r8(dx13,fscal,fjx3);
+             fjy3             = _fjsp_madd_v2r8(dy13,fscal,fjy3);
+             fjz3             = _fjsp_madd_v2r8(dz13,fscal,fjz3);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq21,rcutoff2))
+             {
+             /* REACTION-FIELD ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq21,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq21,rinv21),crf));
+             felec            = _fjsp_mul_v2r8(qq21,_fjsp_msub_v2r8(rinv21,rinvsq21,krf2));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq21,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+             
+             fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq22,rcutoff2))
+             {
+             /* REACTION-FIELD ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq22,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq22,rinv22),crf));
+             felec            = _fjsp_mul_v2r8(qq22,_fjsp_msub_v2r8(rinv22,rinvsq22,krf2));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq22,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+             
+             fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq23,rcutoff2))
+             {
+             /* REACTION-FIELD ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq23,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq23,rinv23),crf));
+             felec            = _fjsp_mul_v2r8(qq23,_fjsp_msub_v2r8(rinv23,rinvsq23,krf2));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq23,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx23,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy23,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz23,fscal,fiz2);
+             
+             fjx3             = _fjsp_madd_v2r8(dx23,fscal,fjx3);
+             fjy3             = _fjsp_madd_v2r8(dy23,fscal,fjy3);
+             fjz3             = _fjsp_madd_v2r8(dz23,fscal,fjz3);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq31,rcutoff2))
+             {
+             /* REACTION-FIELD ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq31,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq31,rinv31),crf));
+             felec            = _fjsp_mul_v2r8(qq31,_fjsp_msub_v2r8(rinv31,rinvsq31,krf2));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq31,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix3             = _fjsp_madd_v2r8(dx31,fscal,fix3);
+             fiy3             = _fjsp_madd_v2r8(dy31,fscal,fiy3);
+             fiz3             = _fjsp_madd_v2r8(dz31,fscal,fiz3);
+             
+             fjx1             = _fjsp_madd_v2r8(dx31,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy31,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz31,fscal,fjz1);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq32,rcutoff2))
+             {
+             /* REACTION-FIELD ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq32,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq32,rinv32),crf));
+             felec            = _fjsp_mul_v2r8(qq32,_fjsp_msub_v2r8(rinv32,rinvsq32,krf2));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq32,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix3             = _fjsp_madd_v2r8(dx32,fscal,fix3);
+             fiy3             = _fjsp_madd_v2r8(dy32,fscal,fiy3);
+             fiz3             = _fjsp_madd_v2r8(dz32,fscal,fiz3);
+             
+             fjx2             = _fjsp_madd_v2r8(dx32,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy32,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz32,fscal,fjz2);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq33,rcutoff2))
+             {
+             /* REACTION-FIELD ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq33,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq33,rinv33),crf));
+             felec            = _fjsp_mul_v2r8(qq33,_fjsp_msub_v2r8(rinv33,rinvsq33,krf2));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq33,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix3             = _fjsp_madd_v2r8(dx33,fscal,fix3);
+             fiy3             = _fjsp_madd_v2r8(dy33,fscal,fiy3);
+             fiz3             = _fjsp_madd_v2r8(dz33,fscal,fiz3);
+             
+             fjx3             = _fjsp_madd_v2r8(dx33,fscal,fjx3);
+             fjy3             = _fjsp_madd_v2r8(dy33,fscal,fjy3);
+             fjz3             = _fjsp_madd_v2r8(dz33,fscal,fjz3);
+             }
+             gmx_fjsp_decrement_4rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+             /* Inner loop uses 413 flops */
+         }
+         /* End of innermost loop */
+         gmx_fjsp_update_iforce_4atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                               f+i_coord_offset,fshift+i_shift_offset);
+         ggid                        = gid[iidx];
+         /* Update potential energies */
+         gmx_fjsp_update_1pot_v2r8(velecsum,kernel_data->energygrp_elec+ggid);
+         gmx_fjsp_update_1pot_v2r8(vvdwsum,kernel_data->energygrp_vdw+ggid);
+         /* Increment number of inner iterations */
+         inneriter                  += j_index_end - j_index_start;
+         /* Outer loop uses 26 flops */
+     }
+     /* Increment number of outer iterations */
+     outeriter        += nri;
+     /* Update outer/inner flops */
+     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_VF,outeriter*26 + inneriter*413);
+ }
+ /*
+  * Gromacs nonbonded kernel:   nb_kernel_ElecRFCut_VdwCSTab_GeomW4W4_F_sparc64_hpc_ace_double
+  * Electrostatics interaction: ReactionField
+  * VdW interaction:            CubicSplineTable
+  * Geometry:                   Water4-Water4
+  * Calculate force/pot:        Force
+  */
+ void
+ nb_kernel_ElecRFCut_VdwCSTab_GeomW4W4_F_sparc64_hpc_ace_double
+                     (t_nblist * gmx_restrict                nlist,
+                      rvec * gmx_restrict                    xx,
+                      rvec * gmx_restrict                    ff,
+                      t_forcerec * gmx_restrict              fr,
+                      t_mdatoms * gmx_restrict               mdatoms,
+                      nb_kernel_data_t * gmx_restrict        kernel_data,
+                      t_nrnb * gmx_restrict                  nrnb)
+ {
+     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+      * just 0 for non-waters.
+      * Suffixes A,B refer to j loop unrolling done with double precision SIMD, i.e. the two different
+      * jnr indices corresponding to data put in the two positions in the SIMD register.
+      */
+     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+     int              jnrA,jnrB;
+     int              j_coord_offsetA,j_coord_offsetB;
+     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+     real             rcutoff_scalar;
+     real             *shiftvec,*fshift,*x,*f;
+     _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+     int              vdwioffset0;
+     _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+     int              vdwioffset1;
+     _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+     int              vdwioffset2;
+     _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+     int              vdwioffset3;
+     _fjsp_v2r8       ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+     int              vdwjidx0A,vdwjidx0B;
+     _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+     int              vdwjidx1A,vdwjidx1B;
+     _fjsp_v2r8       jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+     int              vdwjidx2A,vdwjidx2B;
+     _fjsp_v2r8       jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+     int              vdwjidx3A,vdwjidx3B;
+     _fjsp_v2r8       jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
+     _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+     _fjsp_v2r8       dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+     _fjsp_v2r8       dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+     _fjsp_v2r8       dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
+     _fjsp_v2r8       dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+     _fjsp_v2r8       dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+     _fjsp_v2r8       dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
+     _fjsp_v2r8       dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
+     _fjsp_v2r8       dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
+     _fjsp_v2r8       dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
+     _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+     real             *charge;
+     int              nvdwtype;
+     _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+     int              *vdwtype;
+     real             *vdwparam;
+     _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+     _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+     _fjsp_v2r8       rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF,twovfeps;
+     real             *vftab;
+     _fjsp_v2r8       itab_tmp;
+     _fjsp_v2r8       dummy_mask,cutoff_mask;
+     _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+     _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+     union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+     x                = xx[0];
+     f                = ff[0];
+     nri              = nlist->nri;
+     iinr             = nlist->iinr;
+     jindex           = nlist->jindex;
+     jjnr             = nlist->jjnr;
+     shiftidx         = nlist->shift;
+     gid              = nlist->gid;
+     shiftvec         = fr->shift_vec[0];
+     fshift           = fr->fshift[0];
+     facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+     charge           = mdatoms->chargeA;
+     krf              = gmx_fjsp_set1_v2r8(fr->ic->k_rf);
+     krf2             = gmx_fjsp_set1_v2r8(fr->ic->k_rf*2.0);
+     crf              = gmx_fjsp_set1_v2r8(fr->ic->c_rf);
+     nvdwtype         = fr->ntype;
+     vdwparam         = fr->nbfp;
+     vdwtype          = mdatoms->typeA;
+     vftab            = kernel_data->table_vdw->data;
+     vftabscale       = gmx_fjsp_set1_v2r8(kernel_data->table_vdw->scale);
+     /* Setup water-specific parameters */
+     inr              = nlist->iinr[0];
+     iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+     iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+     iq3              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+3]));
+     vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+     jq1              = gmx_fjsp_set1_v2r8(charge[inr+1]);
+     jq2              = gmx_fjsp_set1_v2r8(charge[inr+2]);
+     jq3              = gmx_fjsp_set1_v2r8(charge[inr+3]);
+     vdwjidx0A        = 2*vdwtype[inr+0];
+     c6_00            = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset0+vdwjidx0A]);
+     c12_00           = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset0+vdwjidx0A+1]);
+     qq11             = _fjsp_mul_v2r8(iq1,jq1);
+     qq12             = _fjsp_mul_v2r8(iq1,jq2);
+     qq13             = _fjsp_mul_v2r8(iq1,jq3);
+     qq21             = _fjsp_mul_v2r8(iq2,jq1);
+     qq22             = _fjsp_mul_v2r8(iq2,jq2);
+     qq23             = _fjsp_mul_v2r8(iq2,jq3);
+     qq31             = _fjsp_mul_v2r8(iq3,jq1);
+     qq32             = _fjsp_mul_v2r8(iq3,jq2);
+     qq33             = _fjsp_mul_v2r8(iq3,jq3);
+     /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+     rcutoff_scalar   = fr->rcoulomb;
+     rcutoff          = gmx_fjsp_set1_v2r8(rcutoff_scalar);
+     rcutoff2         = _fjsp_mul_v2r8(rcutoff,rcutoff);
+     /* Avoid stupid compiler warnings */
+     jnrA = jnrB = 0;
+     j_coord_offsetA = 0;
+     j_coord_offsetB = 0;
+     outeriter        = 0;
+     inneriter        = 0;
+     /* Start outer loop over neighborlists */
+     for(iidx=0; iidx<nri; iidx++)
+     {
+         /* Load shift vector for this list */
+         i_shift_offset   = DIM*shiftidx[iidx];
+         /* Load limits for loop over neighbors */
+         j_index_start    = jindex[iidx];
+         j_index_end      = jindex[iidx+1];
+         /* Get outer coordinate index */
+         inr              = iinr[iidx];
+         i_coord_offset   = DIM*inr;
+         /* Load i particle coords and add shift vector */
+         gmx_fjsp_load_shift_and_4rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                  &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+         fix0             = _fjsp_setzero_v2r8();
+         fiy0             = _fjsp_setzero_v2r8();
+         fiz0             = _fjsp_setzero_v2r8();
+         fix1             = _fjsp_setzero_v2r8();
+         fiy1             = _fjsp_setzero_v2r8();
+         fiz1             = _fjsp_setzero_v2r8();
+         fix2             = _fjsp_setzero_v2r8();
+         fiy2             = _fjsp_setzero_v2r8();
+         fiz2             = _fjsp_setzero_v2r8();
+         fix3             = _fjsp_setzero_v2r8();
+         fiy3             = _fjsp_setzero_v2r8();
+         fiz3             = _fjsp_setzero_v2r8();
+         /* Start inner kernel loop */
+         for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+         {
+             /* Get j neighbor index, and coordinate index */
+             jnrA             = jjnr[jidx];
+             jnrB             = jjnr[jidx+1];
+             j_coord_offsetA  = DIM*jnrA;
+             j_coord_offsetB  = DIM*jnrB;
+             /* load j atom coordinates */
+             gmx_fjsp_load_4rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
+                                               &jy2,&jz2,&jx3,&jy3,&jz3);
+             /* Calculate displacement vector */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             dx11             = _fjsp_sub_v2r8(ix1,jx1);
+             dy11             = _fjsp_sub_v2r8(iy1,jy1);
+             dz11             = _fjsp_sub_v2r8(iz1,jz1);
+             dx12             = _fjsp_sub_v2r8(ix1,jx2);
+             dy12             = _fjsp_sub_v2r8(iy1,jy2);
+             dz12             = _fjsp_sub_v2r8(iz1,jz2);
+             dx13             = _fjsp_sub_v2r8(ix1,jx3);
+             dy13             = _fjsp_sub_v2r8(iy1,jy3);
+             dz13             = _fjsp_sub_v2r8(iz1,jz3);
+             dx21             = _fjsp_sub_v2r8(ix2,jx1);
+             dy21             = _fjsp_sub_v2r8(iy2,jy1);
+             dz21             = _fjsp_sub_v2r8(iz2,jz1);
+             dx22             = _fjsp_sub_v2r8(ix2,jx2);
+             dy22             = _fjsp_sub_v2r8(iy2,jy2);
+             dz22             = _fjsp_sub_v2r8(iz2,jz2);
+             dx23             = _fjsp_sub_v2r8(ix2,jx3);
+             dy23             = _fjsp_sub_v2r8(iy2,jy3);
+             dz23             = _fjsp_sub_v2r8(iz2,jz3);
+             dx31             = _fjsp_sub_v2r8(ix3,jx1);
+             dy31             = _fjsp_sub_v2r8(iy3,jy1);
+             dz31             = _fjsp_sub_v2r8(iz3,jz1);
+             dx32             = _fjsp_sub_v2r8(ix3,jx2);
+             dy32             = _fjsp_sub_v2r8(iy3,jy2);
+             dz32             = _fjsp_sub_v2r8(iz3,jz2);
+             dx33             = _fjsp_sub_v2r8(ix3,jx3);
+             dy33             = _fjsp_sub_v2r8(iy3,jy3);
+             dz33             = _fjsp_sub_v2r8(iz3,jz3);
+             /* Calculate squared distance and things based on it */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+             rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+             rsq13            = gmx_fjsp_calc_rsq_v2r8(dx13,dy13,dz13);
+             rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+             rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+             rsq23            = gmx_fjsp_calc_rsq_v2r8(dx23,dy23,dz23);
+             rsq31            = gmx_fjsp_calc_rsq_v2r8(dx31,dy31,dz31);
+             rsq32            = gmx_fjsp_calc_rsq_v2r8(dx32,dy32,dz32);
+             rsq33            = gmx_fjsp_calc_rsq_v2r8(dx33,dy33,dz33);
+             rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+             rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+             rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+             rinv13           = gmx_fjsp_invsqrt_v2r8(rsq13);
+             rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+             rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+             rinv23           = gmx_fjsp_invsqrt_v2r8(rsq23);
+             rinv31           = gmx_fjsp_invsqrt_v2r8(rsq31);
+             rinv32           = gmx_fjsp_invsqrt_v2r8(rsq32);
+             rinv33           = gmx_fjsp_invsqrt_v2r8(rsq33);
+             rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+             rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+             rinvsq13         = _fjsp_mul_v2r8(rinv13,rinv13);
+             rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+             rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+             rinvsq23         = _fjsp_mul_v2r8(rinv23,rinv23);
+             rinvsq31         = _fjsp_mul_v2r8(rinv31,rinv31);
+             rinvsq32         = _fjsp_mul_v2r8(rinv32,rinv32);
+             rinvsq33         = _fjsp_mul_v2r8(rinv33,rinv33);
+             fjx0             = _fjsp_setzero_v2r8();
+             fjy0             = _fjsp_setzero_v2r8();
+             fjz0             = _fjsp_setzero_v2r8();
+             fjx1             = _fjsp_setzero_v2r8();
+             fjy1             = _fjsp_setzero_v2r8();
+             fjz1             = _fjsp_setzero_v2r8();
+             fjx2             = _fjsp_setzero_v2r8();
+             fjy2             = _fjsp_setzero_v2r8();
+             fjz2             = _fjsp_setzero_v2r8();
+             fjx3             = _fjsp_setzero_v2r8();
+             fjy3             = _fjsp_setzero_v2r8();
+             fjz3             = _fjsp_setzero_v2r8();
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+             /* Calculate table index by multiplying r with table scale and truncate to integer */
+             rt               = _fjsp_mul_v2r8(r00,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 8;
+             vfconv.i[1]     *= 8;
+             /* CUBIC SPLINE TABLE DISPERSION */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 2 );
+             H                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 2 );
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+             FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+             fvdw6            = _fjsp_mul_v2r8(c6_00,FF);
+             /* CUBIC SPLINE TABLE REPULSION */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 4 );
+             F                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 4 );
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 6 );
+             H                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 6 );
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+             FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+             fvdw12           = _fjsp_mul_v2r8(c12_00,FF);
+             fvdw             = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_add_v2r8(fvdw6,fvdw12),_fjsp_mul_v2r8(vftabscale,rinv00)));
+             fscal            = fvdw;
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             
+             fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq11,rcutoff2))
+             {
+             /* REACTION-FIELD ELECTROSTATICS */
+             felec            = _fjsp_mul_v2r8(qq11,_fjsp_msub_v2r8(rinv11,rinvsq11,krf2));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq11,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+             
+             fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq12,rcutoff2))
+             {
+             /* REACTION-FIELD ELECTROSTATICS */
+             felec            = _fjsp_mul_v2r8(qq12,_fjsp_msub_v2r8(rinv12,rinvsq12,krf2));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq12,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+             
+             fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq13,rcutoff2))
+             {
+             /* REACTION-FIELD ELECTROSTATICS */
+             felec            = _fjsp_mul_v2r8(qq13,_fjsp_msub_v2r8(rinv13,rinvsq13,krf2));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq13,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx13,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy13,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz13,fscal,fiz1);
+             
+             fjx3             = _fjsp_madd_v2r8(dx13,fscal,fjx3);
+             fjy3             = _fjsp_madd_v2r8(dy13,fscal,fjy3);
+             fjz3             = _fjsp_madd_v2r8(dz13,fscal,fjz3);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq21,rcutoff2))
+             {
+             /* REACTION-FIELD ELECTROSTATICS */
+             felec            = _fjsp_mul_v2r8(qq21,_fjsp_msub_v2r8(rinv21,rinvsq21,krf2));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq21,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+             
+             fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq22,rcutoff2))
+             {
+             /* REACTION-FIELD ELECTROSTATICS */
+             felec            = _fjsp_mul_v2r8(qq22,_fjsp_msub_v2r8(rinv22,rinvsq22,krf2));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq22,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+             
+             fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq23,rcutoff2))
+             {
+             /* REACTION-FIELD ELECTROSTATICS */
+             felec            = _fjsp_mul_v2r8(qq23,_fjsp_msub_v2r8(rinv23,rinvsq23,krf2));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq23,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx23,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy23,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz23,fscal,fiz2);
+             
+             fjx3             = _fjsp_madd_v2r8(dx23,fscal,fjx3);
+             fjy3             = _fjsp_madd_v2r8(dy23,fscal,fjy3);
+             fjz3             = _fjsp_madd_v2r8(dz23,fscal,fjz3);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq31,rcutoff2))
+             {
+             /* REACTION-FIELD ELECTROSTATICS */
+             felec            = _fjsp_mul_v2r8(qq31,_fjsp_msub_v2r8(rinv31,rinvsq31,krf2));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq31,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix3             = _fjsp_madd_v2r8(dx31,fscal,fix3);
+             fiy3             = _fjsp_madd_v2r8(dy31,fscal,fiy3);
+             fiz3             = _fjsp_madd_v2r8(dz31,fscal,fiz3);
+             
+             fjx1             = _fjsp_madd_v2r8(dx31,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy31,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz31,fscal,fjz1);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq32,rcutoff2))
+             {
+             /* REACTION-FIELD ELECTROSTATICS */
+             felec            = _fjsp_mul_v2r8(qq32,_fjsp_msub_v2r8(rinv32,rinvsq32,krf2));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq32,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix3             = _fjsp_madd_v2r8(dx32,fscal,fix3);
+             fiy3             = _fjsp_madd_v2r8(dy32,fscal,fiy3);
+             fiz3             = _fjsp_madd_v2r8(dz32,fscal,fiz3);
+             
+             fjx2             = _fjsp_madd_v2r8(dx32,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy32,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz32,fscal,fjz2);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq33,rcutoff2))
+             {
+             /* REACTION-FIELD ELECTROSTATICS */
+             felec            = _fjsp_mul_v2r8(qq33,_fjsp_msub_v2r8(rinv33,rinvsq33,krf2));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq33,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix3             = _fjsp_madd_v2r8(dx33,fscal,fix3);
+             fiy3             = _fjsp_madd_v2r8(dy33,fscal,fiy3);
+             fiz3             = _fjsp_madd_v2r8(dz33,fscal,fiz3);
+             
+             fjx3             = _fjsp_madd_v2r8(dx33,fscal,fjx3);
+             fjy3             = _fjsp_madd_v2r8(dy33,fscal,fjy3);
+             fjz3             = _fjsp_madd_v2r8(dz33,fscal,fjz3);
+             }
+             gmx_fjsp_decrement_4rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+             /* Inner loop uses 351 flops */
+         }
+         if(jidx<j_index_end)
+         {
+             jnrA             = jjnr[jidx];
+             j_coord_offsetA  = DIM*jnrA;
+             /* load j atom coordinates */
+             gmx_fjsp_load_4rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
+                                               &jy2,&jz2,&jx3,&jy3,&jz3);
+             /* Calculate displacement vector */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             dx11             = _fjsp_sub_v2r8(ix1,jx1);
+             dy11             = _fjsp_sub_v2r8(iy1,jy1);
+             dz11             = _fjsp_sub_v2r8(iz1,jz1);
+             dx12             = _fjsp_sub_v2r8(ix1,jx2);
+             dy12             = _fjsp_sub_v2r8(iy1,jy2);
+             dz12             = _fjsp_sub_v2r8(iz1,jz2);
+             dx13             = _fjsp_sub_v2r8(ix1,jx3);
+             dy13             = _fjsp_sub_v2r8(iy1,jy3);
+             dz13             = _fjsp_sub_v2r8(iz1,jz3);
+             dx21             = _fjsp_sub_v2r8(ix2,jx1);
+             dy21             = _fjsp_sub_v2r8(iy2,jy1);
+             dz21             = _fjsp_sub_v2r8(iz2,jz1);
+             dx22             = _fjsp_sub_v2r8(ix2,jx2);
+             dy22             = _fjsp_sub_v2r8(iy2,jy2);
+             dz22             = _fjsp_sub_v2r8(iz2,jz2);
+             dx23             = _fjsp_sub_v2r8(ix2,jx3);
+             dy23             = _fjsp_sub_v2r8(iy2,jy3);
+             dz23             = _fjsp_sub_v2r8(iz2,jz3);
+             dx31             = _fjsp_sub_v2r8(ix3,jx1);
+             dy31             = _fjsp_sub_v2r8(iy3,jy1);
+             dz31             = _fjsp_sub_v2r8(iz3,jz1);
+             dx32             = _fjsp_sub_v2r8(ix3,jx2);
+             dy32             = _fjsp_sub_v2r8(iy3,jy2);
+             dz32             = _fjsp_sub_v2r8(iz3,jz2);
+             dx33             = _fjsp_sub_v2r8(ix3,jx3);
+             dy33             = _fjsp_sub_v2r8(iy3,jy3);
+             dz33             = _fjsp_sub_v2r8(iz3,jz3);
+             /* Calculate squared distance and things based on it */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+             rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+             rsq13            = gmx_fjsp_calc_rsq_v2r8(dx13,dy13,dz13);
+             rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+             rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+             rsq23            = gmx_fjsp_calc_rsq_v2r8(dx23,dy23,dz23);
+             rsq31            = gmx_fjsp_calc_rsq_v2r8(dx31,dy31,dz31);
+             rsq32            = gmx_fjsp_calc_rsq_v2r8(dx32,dy32,dz32);
+             rsq33            = gmx_fjsp_calc_rsq_v2r8(dx33,dy33,dz33);
+             rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+             rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+             rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+             rinv13           = gmx_fjsp_invsqrt_v2r8(rsq13);
+             rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+             rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+             rinv23           = gmx_fjsp_invsqrt_v2r8(rsq23);
+             rinv31           = gmx_fjsp_invsqrt_v2r8(rsq31);
+             rinv32           = gmx_fjsp_invsqrt_v2r8(rsq32);
+             rinv33           = gmx_fjsp_invsqrt_v2r8(rsq33);
+             rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+             rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+             rinvsq13         = _fjsp_mul_v2r8(rinv13,rinv13);
+             rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+             rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+             rinvsq23         = _fjsp_mul_v2r8(rinv23,rinv23);
+             rinvsq31         = _fjsp_mul_v2r8(rinv31,rinv31);
+             rinvsq32         = _fjsp_mul_v2r8(rinv32,rinv32);
+             rinvsq33         = _fjsp_mul_v2r8(rinv33,rinv33);
+             fjx0             = _fjsp_setzero_v2r8();
+             fjy0             = _fjsp_setzero_v2r8();
+             fjz0             = _fjsp_setzero_v2r8();
+             fjx1             = _fjsp_setzero_v2r8();
+             fjy1             = _fjsp_setzero_v2r8();
+             fjz1             = _fjsp_setzero_v2r8();
+             fjx2             = _fjsp_setzero_v2r8();
+             fjy2             = _fjsp_setzero_v2r8();
+             fjz2             = _fjsp_setzero_v2r8();
+             fjx3             = _fjsp_setzero_v2r8();
+             fjy3             = _fjsp_setzero_v2r8();
+             fjz3             = _fjsp_setzero_v2r8();
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+             /* Calculate table index by multiplying r with table scale and truncate to integer */
+             rt               = _fjsp_mul_v2r8(r00,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 8;
+             vfconv.i[1]     *= 8;
+             /* CUBIC SPLINE TABLE DISPERSION */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 2 );
+             H                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+             FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+             fvdw6            = _fjsp_mul_v2r8(c6_00,FF);
+             /* CUBIC SPLINE TABLE REPULSION */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 4 );
+             F                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 6 );
+             H                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+             FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+             fvdw12           = _fjsp_mul_v2r8(c12_00,FF);
+             fvdw             = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_add_v2r8(fvdw6,fvdw12),_fjsp_mul_v2r8(vftabscale,rinv00)));
+             fscal            = fvdw;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             
+             fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq11,rcutoff2))
+             {
+             /* REACTION-FIELD ELECTROSTATICS */
+             felec            = _fjsp_mul_v2r8(qq11,_fjsp_msub_v2r8(rinv11,rinvsq11,krf2));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq11,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+             
+             fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq12,rcutoff2))
+             {
+             /* REACTION-FIELD ELECTROSTATICS */
+             felec            = _fjsp_mul_v2r8(qq12,_fjsp_msub_v2r8(rinv12,rinvsq12,krf2));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq12,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+             
+             fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq13,rcutoff2))
+             {
+             /* REACTION-FIELD ELECTROSTATICS */
+             felec            = _fjsp_mul_v2r8(qq13,_fjsp_msub_v2r8(rinv13,rinvsq13,krf2));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq13,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx13,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy13,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz13,fscal,fiz1);
+             
+             fjx3             = _fjsp_madd_v2r8(dx13,fscal,fjx3);
+             fjy3             = _fjsp_madd_v2r8(dy13,fscal,fjy3);
+             fjz3             = _fjsp_madd_v2r8(dz13,fscal,fjz3);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq21,rcutoff2))
+             {
+             /* REACTION-FIELD ELECTROSTATICS */
+             felec            = _fjsp_mul_v2r8(qq21,_fjsp_msub_v2r8(rinv21,rinvsq21,krf2));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq21,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+             
+             fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq22,rcutoff2))
+             {
+             /* REACTION-FIELD ELECTROSTATICS */
+             felec            = _fjsp_mul_v2r8(qq22,_fjsp_msub_v2r8(rinv22,rinvsq22,krf2));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq22,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+             
+             fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq23,rcutoff2))
+             {
+             /* REACTION-FIELD ELECTROSTATICS */
+             felec            = _fjsp_mul_v2r8(qq23,_fjsp_msub_v2r8(rinv23,rinvsq23,krf2));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq23,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx23,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy23,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz23,fscal,fiz2);
+             
+             fjx3             = _fjsp_madd_v2r8(dx23,fscal,fjx3);
+             fjy3             = _fjsp_madd_v2r8(dy23,fscal,fjy3);
+             fjz3             = _fjsp_madd_v2r8(dz23,fscal,fjz3);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq31,rcutoff2))
+             {
+             /* REACTION-FIELD ELECTROSTATICS */
+             felec            = _fjsp_mul_v2r8(qq31,_fjsp_msub_v2r8(rinv31,rinvsq31,krf2));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq31,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix3             = _fjsp_madd_v2r8(dx31,fscal,fix3);
+             fiy3             = _fjsp_madd_v2r8(dy31,fscal,fiy3);
+             fiz3             = _fjsp_madd_v2r8(dz31,fscal,fiz3);
+             
+             fjx1             = _fjsp_madd_v2r8(dx31,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy31,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz31,fscal,fjz1);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq32,rcutoff2))
+             {
+             /* REACTION-FIELD ELECTROSTATICS */
+             felec            = _fjsp_mul_v2r8(qq32,_fjsp_msub_v2r8(rinv32,rinvsq32,krf2));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq32,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix3             = _fjsp_madd_v2r8(dx32,fscal,fix3);
+             fiy3             = _fjsp_madd_v2r8(dy32,fscal,fiy3);
+             fiz3             = _fjsp_madd_v2r8(dz32,fscal,fiz3);
+             
+             fjx2             = _fjsp_madd_v2r8(dx32,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy32,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz32,fscal,fjz2);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq33,rcutoff2))
+             {
+             /* REACTION-FIELD ELECTROSTATICS */
+             felec            = _fjsp_mul_v2r8(qq33,_fjsp_msub_v2r8(rinv33,rinvsq33,krf2));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq33,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix3             = _fjsp_madd_v2r8(dx33,fscal,fix3);
+             fiy3             = _fjsp_madd_v2r8(dy33,fscal,fiy3);
+             fiz3             = _fjsp_madd_v2r8(dz33,fscal,fiz3);
+             
+             fjx3             = _fjsp_madd_v2r8(dx33,fscal,fjx3);
+             fjy3             = _fjsp_madd_v2r8(dy33,fscal,fjy3);
+             fjz3             = _fjsp_madd_v2r8(dz33,fscal,fjz3);
+             }
+             gmx_fjsp_decrement_4rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+             /* Inner loop uses 351 flops */
+         }
+         /* End of innermost loop */
+         gmx_fjsp_update_iforce_4atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                               f+i_coord_offset,fshift+i_shift_offset);
+         /* Increment number of inner iterations */
+         inneriter                  += j_index_end - j_index_start;
+         /* Outer loop uses 24 flops */
+     }
+     /* Increment number of outer iterations */
+     outeriter        += nri;
+     /* Update outer/inner flops */
+     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_F,outeriter*24 + inneriter*351);
+ }
index 0000000000000000000000000000000000000000,aba78402246bd98d2c81af899c1341ae5d304c3b..aba78402246bd98d2c81af899c1341ae5d304c3b
mode 000000,100644..100644
--- /dev/null
@@@ -1,0 -1,607 +1,607 @@@
+ /*
+  * This file is part of the GROMACS molecular simulation package.
+  *
+  * Copyright (c) 2012, by the GROMACS development team, led by
+  * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+  * others, as listed in the AUTHORS file in the top-level source
+  * directory and at http://www.gromacs.org.
+  *
+  * GROMACS is free software; you can redistribute it and/or
+  * modify it under the terms of the GNU Lesser General Public License
+  * as published by the Free Software Foundation; either version 2.1
+  * of the License, or (at your option) any later version.
+  *
+  * GROMACS is distributed in the hope that it will be useful,
+  * but WITHOUT ANY WARRANTY; without even the implied warranty of
+  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  * Lesser General Public License for more details.
+  *
+  * You should have received a copy of the GNU Lesser General Public
+  * License along with GROMACS; if not, see
+  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+  *
+  * If you want to redistribute modifications to GROMACS, please
+  * consider that scientific software is very special. Version
+  * control is crucial - bugs must be traceable. We will be happy to
+  * consider code for inclusion in the official distribution, but
+  * derived work must not be called official GROMACS. Details are found
+  * in the README & COPYING files - if they are missing, get the
+  * official version at http://www.gromacs.org.
+  *
+  * To help us fund GROMACS development, we humbly ask that you cite
+  * the research papers on the package. Check out http://www.gromacs.org.
+  */
+ /*
+  * Note: this file was generated by the GROMACS sparc64_hpc_ace_double kernel generator.
+  */
+ #ifdef HAVE_CONFIG_H
+ #include <config.h>
+ #endif
+ #include <math.h>
+ #include "../nb_kernel.h"
+ #include "types/simple.h"
+ #include "vec.h"
+ #include "nrnb.h"
+ #include "kernelutil_sparc64_hpc_ace_double.h"
+ /*
+  * Gromacs nonbonded kernel:   nb_kernel_ElecRFCut_VdwLJSh_GeomP1P1_VF_sparc64_hpc_ace_double
+  * Electrostatics interaction: ReactionField
+  * VdW interaction:            LennardJones
+  * Geometry:                   Particle-Particle
+  * Calculate force/pot:        PotentialAndForce
+  */
+ void
+ nb_kernel_ElecRFCut_VdwLJSh_GeomP1P1_VF_sparc64_hpc_ace_double
+                     (t_nblist * gmx_restrict                nlist,
+                      rvec * gmx_restrict                    xx,
+                      rvec * gmx_restrict                    ff,
+                      t_forcerec * gmx_restrict              fr,
+                      t_mdatoms * gmx_restrict               mdatoms,
+                      nb_kernel_data_t * gmx_restrict        kernel_data,
+                      t_nrnb * gmx_restrict                  nrnb)
+ {
+     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+      * just 0 for non-waters.
+      * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+      * jnr indices corresponding to data put in the two positions in the SIMD register.
+      */
+     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+     int              jnrA,jnrB;
+     int              j_coord_offsetA,j_coord_offsetB;
+     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+     real             rcutoff_scalar;
+     real             *shiftvec,*fshift,*x,*f;
+     _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+     int              vdwioffset0;
+     _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+     int              vdwjidx0A,vdwjidx0B;
+     _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+     _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+     _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+     real             *charge;
+     int              nvdwtype;
+     _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+     int              *vdwtype;
+     real             *vdwparam;
+     _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+     _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+     _fjsp_v2r8       itab_tmp;
+     _fjsp_v2r8       dummy_mask,cutoff_mask;
+     _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+     _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+     union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+     x                = xx[0];
+     f                = ff[0];
+     nri              = nlist->nri;
+     iinr             = nlist->iinr;
+     jindex           = nlist->jindex;
+     jjnr             = nlist->jjnr;
+     shiftidx         = nlist->shift;
+     gid              = nlist->gid;
+     shiftvec         = fr->shift_vec[0];
+     fshift           = fr->fshift[0];
+     facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+     charge           = mdatoms->chargeA;
+     krf              = gmx_fjsp_set1_v2r8(fr->ic->k_rf);
+     krf2             = gmx_fjsp_set1_v2r8(fr->ic->k_rf*2.0);
+     crf              = gmx_fjsp_set1_v2r8(fr->ic->c_rf);
+     nvdwtype         = fr->ntype;
+     vdwparam         = fr->nbfp;
+     vdwtype          = mdatoms->typeA;
+     /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+     rcutoff_scalar   = fr->rcoulomb;
+     rcutoff          = gmx_fjsp_set1_v2r8(rcutoff_scalar);
+     rcutoff2         = _fjsp_mul_v2r8(rcutoff,rcutoff);
+     sh_vdw_invrcut6  = gmx_fjsp_set1_v2r8(fr->ic->sh_invrc6);
+     rvdw             = gmx_fjsp_set1_v2r8(fr->rvdw);
+     /* Avoid stupid compiler warnings */
+     jnrA = jnrB = 0;
+     j_coord_offsetA = 0;
+     j_coord_offsetB = 0;
+     outeriter        = 0;
+     inneriter        = 0;
+     /* Start outer loop over neighborlists */
+     for(iidx=0; iidx<nri; iidx++)
+     {
+         /* Load shift vector for this list */
+         i_shift_offset   = DIM*shiftidx[iidx];
+         /* Load limits for loop over neighbors */
+         j_index_start    = jindex[iidx];
+         j_index_end      = jindex[iidx+1];
+         /* Get outer coordinate index */
+         inr              = iinr[iidx];
+         i_coord_offset   = DIM*inr;
+         /* Load i particle coords and add shift vector */
+         gmx_fjsp_load_shift_and_1rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
+         fix0             = _fjsp_setzero_v2r8();
+         fiy0             = _fjsp_setzero_v2r8();
+         fiz0             = _fjsp_setzero_v2r8();
+         /* Load parameters for i particles */
+         iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_load1_v2r8(charge+inr+0));
+         vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+         /* Reset potential sums */
+         velecsum         = _fjsp_setzero_v2r8();
+         vvdwsum          = _fjsp_setzero_v2r8();
+         /* Start inner kernel loop */
+         for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+         {
+             /* Get j neighbor index, and coordinate index */
+             jnrA             = jjnr[jidx];
+             jnrB             = jjnr[jidx+1];
+             j_coord_offsetA  = DIM*jnrA;
+             j_coord_offsetB  = DIM*jnrB;
+             /* load j atom coordinates */
+             gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                               &jx0,&jy0,&jz0);
+             /* Calculate displacement vector */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             /* Calculate squared distance and things based on it */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+             rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+             /* Load parameters for j particles */
+             jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+             vdwjidx0A        = 2*vdwtype[jnrA+0];
+             vdwjidx0B        = 2*vdwtype[jnrB+0];
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+             {
+             /* Compute parameters for interactions between i and j atoms */
+             qq00             = _fjsp_mul_v2r8(iq0,jq0);
+             gmx_fjsp_load_2pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,
+                                          vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
+             /* REACTION-FIELD ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq00,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq00,rinv00),crf));
+             felec            = _fjsp_mul_v2r8(qq00,_fjsp_msub_v2r8(rinv00,rinvsq00,krf2));
+             /* LENNARD-JONES DISPERSION/REPULSION */
+             rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+             vvdw6            = _fjsp_mul_v2r8(c6_00,rinvsix);
+             vvdw12           = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
+             vvdw             = _fjsp_msub_v2r8(_fjsp_nmsub_v2r8(c12_00,_fjsp_mul_v2r8(sh_vdw_invrcut6,sh_vdw_invrcut6),vvdw12),one_twelfth,
+                                            _fjsp_mul_v2r8(_fjsp_nmsub_v2r8( c6_00,sh_vdw_invrcut6,vvdw6),one_sixth));
+             fvdw             = _fjsp_mul_v2r8(_fjsp_sub_v2r8(vvdw12,vvdw6),rinvsq00);
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             vvdw             = _fjsp_and_v2r8(vvdw,cutoff_mask);
+             vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+             fscal            = _fjsp_add_v2r8(felec,fvdw);
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             
+             gmx_fjsp_decrement_fma_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fscal,dx00,dy00,dz00);
+             }
+             /* Inner loop uses 57 flops */
+         }
+         if(jidx<j_index_end)
+         {
+             jnrA             = jjnr[jidx];
+             j_coord_offsetA  = DIM*jnrA;
+             /* load j atom coordinates */
+             gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                               &jx0,&jy0,&jz0);
+             /* Calculate displacement vector */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             /* Calculate squared distance and things based on it */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+             rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+             /* Load parameters for j particles */
+             jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
+             vdwjidx0A        = 2*vdwtype[jnrA+0];
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+             {
+             /* Compute parameters for interactions between i and j atoms */
+             qq00             = _fjsp_mul_v2r8(iq0,jq0);
+             gmx_fjsp_load_1pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
+             /* REACTION-FIELD ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq00,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq00,rinv00),crf));
+             felec            = _fjsp_mul_v2r8(qq00,_fjsp_msub_v2r8(rinv00,rinvsq00,krf2));
+             /* LENNARD-JONES DISPERSION/REPULSION */
+             rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+             vvdw6            = _fjsp_mul_v2r8(c6_00,rinvsix);
+             vvdw12           = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
+             vvdw             = _fjsp_msub_v2r8(_fjsp_nmsub_v2r8(c12_00,_fjsp_mul_v2r8(sh_vdw_invrcut6,sh_vdw_invrcut6),vvdw12),one_twelfth,
+                                            _fjsp_mul_v2r8(_fjsp_nmsub_v2r8( c6_00,sh_vdw_invrcut6,vvdw6),one_sixth));
+             fvdw             = _fjsp_mul_v2r8(_fjsp_sub_v2r8(vvdw12,vvdw6),rinvsq00);
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             vvdw             = _fjsp_and_v2r8(vvdw,cutoff_mask);
+             vvdw             = _fjsp_unpacklo_v2r8(vvdw,_fjsp_setzero_v2r8());
+             vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+             fscal            = _fjsp_add_v2r8(felec,fvdw);
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             
+             gmx_fjsp_decrement_fma_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fscal,dx00,dy00,dz00);
+             }
+             /* Inner loop uses 57 flops */
+         }
+         /* End of innermost loop */
+         gmx_fjsp_update_iforce_1atom_swizzle_v2r8(fix0,fiy0,fiz0,
+                                               f+i_coord_offset,fshift+i_shift_offset);
+         ggid                        = gid[iidx];
+         /* Update potential energies */
+         gmx_fjsp_update_1pot_v2r8(velecsum,kernel_data->energygrp_elec+ggid);
+         gmx_fjsp_update_1pot_v2r8(vvdwsum,kernel_data->energygrp_vdw+ggid);
+         /* Increment number of inner iterations */
+         inneriter                  += j_index_end - j_index_start;
+         /* Outer loop uses 9 flops */
+     }
+     /* Increment number of outer iterations */
+     outeriter        += nri;
+     /* Update outer/inner flops */
+     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_VF,outeriter*9 + inneriter*57);
+ }
+ /*
+  * Gromacs nonbonded kernel:   nb_kernel_ElecRFCut_VdwLJSh_GeomP1P1_F_sparc64_hpc_ace_double
+  * Electrostatics interaction: ReactionField
+  * VdW interaction:            LennardJones
+  * Geometry:                   Particle-Particle
+  * Calculate force/pot:        Force
+  *
+  * Force-only variant: for each i particle in the neighbor list it loops over
+  * that particle's j neighbors, two per SIMD iteration, and accumulates the
+  * combined reaction-field + Lennard-Jones scalar force for pairs whose
+  * squared distance is below the squared cutoff. No potential energies are
+  * summed here.
+  *
+  * Parameters, as used by this function:
+  *   nlist        neighbor list; nri, iinr, jindex, jjnr, shift and gid are read.
+  *   xx           coordinates, accessed through the flat pointer xx[0].
+  *   ff           forces, accessed through the flat pointer ff[0]; handed to the
+  *                gmx_fjsp_decrement_fma_* and gmx_fjsp_update_iforce_* helpers,
+  *                which presumably write the force updates (helpers not shown here).
+  *   fr           force record; shift_vec, fshift, epsfac, ntype, nbfp, rcoulomb,
+  *                rvdw and ic->{k_rf,c_rf,sh_invrc6} are read.
+  *   mdatoms      per-atom data; chargeA and typeA are read.
+  *   kernel_data  not referenced by this force-only variant.
+  *   nrnb         receives the flop estimate through inc_nrnb().
+  */
+ void
+ nb_kernel_ElecRFCut_VdwLJSh_GeomP1P1_F_sparc64_hpc_ace_double
+                     (t_nblist * gmx_restrict                nlist,
+                      rvec * gmx_restrict                    xx,
+                      rvec * gmx_restrict                    ff,
+                      t_forcerec * gmx_restrict              fr,
+                      t_mdatoms * gmx_restrict               mdatoms,
+                      nb_kernel_data_t * gmx_restrict        kernel_data,
+                      t_nrnb * gmx_restrict                  nrnb)
+ {
+     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+      * just 0 for non-waters.
+      * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+      * jnr indices corresponding to data put in the two positions in the SIMD register.
+      *
+      * This is generated code: several of the locals declared below (for example
+      * tx, ty, tz, velecsum, vvdwsum, itab_tmp, dummy_mask) are never referenced
+      * in this force-only variant.
+      */
+     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+     int              jnrA,jnrB;
+     int              j_coord_offsetA,j_coord_offsetB;
+     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+     real             rcutoff_scalar;
+     real             *shiftvec,*fshift,*x,*f;
+     _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+     int              vdwioffset0;
+     _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+     int              vdwjidx0A,vdwjidx0B;
+     _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+     _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+     _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+     real             *charge;
+     int              nvdwtype;
+     _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+     int              *vdwtype;
+     real             *vdwparam;
+     _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+     _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+     _fjsp_v2r8       itab_tmp;
+     _fjsp_v2r8       dummy_mask,cutoff_mask;
+     _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+     _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+     union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+     x                = xx[0];
+     f                = ff[0];
+     nri              = nlist->nri;
+     iinr             = nlist->iinr;
+     jindex           = nlist->jindex;
+     jjnr             = nlist->jjnr;
+     shiftidx         = nlist->shift;
+     gid              = nlist->gid;
+     shiftvec         = fr->shift_vec[0];
+     fshift           = fr->fshift[0];
+     facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+     charge           = mdatoms->chargeA;
+     krf              = gmx_fjsp_set1_v2r8(fr->ic->k_rf);
+     krf2             = gmx_fjsp_set1_v2r8(fr->ic->k_rf*2.0);
+     crf              = gmx_fjsp_set1_v2r8(fr->ic->c_rf);
+     nvdwtype         = fr->ntype;
+     vdwparam         = fr->nbfp;
+     vdwtype          = mdatoms->typeA;
+     /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice.
+      * rcutoff2 is the squared cutoff that rsq00 is compared against below.
+      * Note: krf, crf, gid, sh_vdw_invrcut6 and rvdw are assigned in this setup section but
+      * never read again in this force-only variant; only krf2 enters the force expression.
+      */
+     rcutoff_scalar   = fr->rcoulomb;
+     rcutoff          = gmx_fjsp_set1_v2r8(rcutoff_scalar);
+     rcutoff2         = _fjsp_mul_v2r8(rcutoff,rcutoff);
+     sh_vdw_invrcut6  = gmx_fjsp_set1_v2r8(fr->ic->sh_invrc6);
+     rvdw             = gmx_fjsp_set1_v2r8(fr->rvdw);
+     /* Give these a defined initial value so compilers do not warn about them */
+     jnrA = jnrB = 0;
+     j_coord_offsetA = 0;
+     j_coord_offsetB = 0;
+     outeriter        = 0;
+     inneriter        = 0;
+     /* Start outer loop over neighborlists: one iteration per i particle entry */
+     for(iidx=0; iidx<nri; iidx++)
+     {
+         /* Load shift vector for this list */
+         i_shift_offset   = DIM*shiftidx[iidx];
+         /* Load limits for loop over neighbors: j entries are jjnr[j_index_start .. j_index_end-1] */
+         j_index_start    = jindex[iidx];
+         j_index_end      = jindex[iidx+1];
+         /* Get outer coordinate index */
+         inr              = iinr[iidx];
+         i_coord_offset   = DIM*inr;
+         /* Load i particle coords and add shift vector */
+         gmx_fjsp_load_shift_and_1rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
+         fix0             = _fjsp_setzero_v2r8();
+         fiy0             = _fjsp_setzero_v2r8();
+         fiz0             = _fjsp_setzero_v2r8();
+         /* Load parameters for i particles.
+          * The i charge is pre-multiplied by facel (fr->epsfac). vdwioffset0 is the
+          * offset into vdwparam for the i atom type; the j type offset (2 reals per
+          * type) is added to it in the pair loads below.
+          */
+         iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_load1_v2r8(charge+inr+0));
+         vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+         /* Start inner kernel loop.
+          * Two j particles (A and B) are handled per iteration; if the neighbor
+          * count is odd, the single leftover particle is handled after the loop.
+          */
+         for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+         {
+             /* Get j neighbor index, and coordinate index */
+             jnrA             = jjnr[jidx];
+             jnrB             = jjnr[jidx+1];
+             j_coord_offsetA  = DIM*jnrA;
+             j_coord_offsetB  = DIM*jnrB;
+             /* load j atom coordinates */
+             gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                               &jx0,&jy0,&jz0);
+             /* Calculate displacement vector */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             /* Calculate squared distance and things based on it */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+             rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+             /* Load parameters for j particles */
+             jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+             vdwjidx0A        = 2*vdwtype[jnrA+0];
+             vdwjidx0B        = 2*vdwtype[jnrB+0];
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+             {
+             /* Compute parameters for interactions between i and j atoms */
+             qq00             = _fjsp_mul_v2r8(iq0,jq0);
+             gmx_fjsp_load_2pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,
+                                          vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
+             /* REACTION-FIELD ELECTROSTATICS
+              * felec = qq*(rinv*rinvsq - 2*k_rf), assuming _fjsp_msub_v2r8(a,b,c)
+              * computes a*b-c -- TODO confirm against the intrinsic reference.
+              */
+             felec            = _fjsp_mul_v2r8(qq00,_fjsp_msub_v2r8(rinv00,rinvsq00,krf2));
+             /* LENNARD-JONES DISPERSION/REPULSION
+              * fvdw = (c12*rinvsix - c6)*rinvsix*rinvsq under the same assumption
+              * about _fjsp_msub_v2r8. The potential shift sh_vdw_invrcut6 is not
+              * used here, since only the force is computed.
+              */
+             rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+             fvdw             = _fjsp_mul_v2r8(_fjsp_msub_v2r8(c12_00,rinvsix,c6_00),_fjsp_mul_v2r8(rinvsix,rinvsq00));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+             fscal            = _fjsp_add_v2r8(felec,fvdw);
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask); /* presumably zeroes lanes not below the cutoff -- TODO confirm mask semantics */
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             
+             gmx_fjsp_decrement_fma_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fscal,dx00,dy00,dz00);
+             }
+             /* Inner loop uses 40 flops (the per-j-particle estimate reported via inc_nrnb at the end) */
+         }
+         if(jidx<j_index_end) /* odd neighbor count: one j particle (A only) is left over */
+         {
+             jnrA             = jjnr[jidx];
+             j_coord_offsetA  = DIM*jnrA;
+             /* load j atom coordinates */
+             gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                               &jx0,&jy0,&jz0);
+             /* Calculate displacement vector */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             /* Calculate squared distance and things based on it */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+             rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+             /* Load parameters for j particles.
+              * Only one charge is loaded, on top of a zero vector; presumably the
+              * other lane therefore stays zero -- TODO confirm _fjsp_loadl_v2r8.
+              */
+             jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
+             vdwjidx0A        = 2*vdwtype[jnrA+0];
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+             {
+             /* Compute parameters for interactions between i and j atoms */
+             qq00             = _fjsp_mul_v2r8(iq0,jq0);
+             gmx_fjsp_load_1pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
+             /* REACTION-FIELD ELECTROSTATICS (same expression as in the paired loop above) */
+             felec            = _fjsp_mul_v2r8(qq00,_fjsp_msub_v2r8(rinv00,rinvsq00,krf2));
+             /* LENNARD-JONES DISPERSION/REPULSION (same expression as in the paired loop above) */
+             rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+             fvdw             = _fjsp_mul_v2r8(_fjsp_msub_v2r8(c12_00,rinvsix,c6_00),_fjsp_mul_v2r8(rinvsix,rinvsq00));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+             fscal            = _fjsp_add_v2r8(felec,fvdw);
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8()); /* presumably clears the lane with no j particle so it adds no force -- TODO confirm */
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             
+             gmx_fjsp_decrement_fma_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fscal,dx00,dy00,dz00);
+             }
+             /* Inner loop uses 40 flops */
+         }
+         /* End of innermost loop: hand the accumulated i force to the update helper,
+          * together with the force and shift-force locations for this i particle.
+          */
+         gmx_fjsp_update_iforce_1atom_swizzle_v2r8(fix0,fiy0,fiz0,
+                                               f+i_coord_offset,fshift+i_shift_offset);
+         /* Increment number of inner iterations (counted per j particle, not per SIMD pair) */
+         inneriter                  += j_index_end - j_index_start;
+         /* Outer loop uses 7 flops */
+     }
+     /* Increment number of outer iterations */
+     outeriter        += nri;
+     /* Update outer/inner flops: 7 per i entry plus 40 per j particle, recorded under eNR_NBKERNEL_ELEC_VDW_F */
+     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_F,outeriter*7 + inneriter*40);
+ }
index 0000000000000000000000000000000000000000,8a88a9112922587438151ac56c8d673c7cb3e88b..8a88a9112922587438151ac56c8d673c7cb3e88b
mode 000000,100644..100644
--- /dev/null
@@@ -1,0 -1,989 +1,989 @@@
+ /*
+  * This file is part of the GROMACS molecular simulation package.
+  *
+  * Copyright (c) 2012, by the GROMACS development team, led by
+  * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+  * others, as listed in the AUTHORS file in the top-level source
+  * directory and at http://www.gromacs.org.
+  *
+  * GROMACS is free software; you can redistribute it and/or
+  * modify it under the terms of the GNU Lesser General Public License
+  * as published by the Free Software Foundation; either version 2.1
+  * of the License, or (at your option) any later version.
+  *
+  * GROMACS is distributed in the hope that it will be useful,
+  * but WITHOUT ANY WARRANTY; without even the implied warranty of
+  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  * Lesser General Public License for more details.
+  *
+  * You should have received a copy of the GNU Lesser General Public
+  * License along with GROMACS; if not, see
+  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+  *
+  * If you want to redistribute modifications to GROMACS, please
+  * consider that scientific software is very special. Version
+  * control is crucial - bugs must be traceable. We will be happy to
+  * consider code for inclusion in the official distribution, but
+  * derived work must not be called official GROMACS. Details are found
+  * in the README & COPYING files - if they are missing, get the
+  * official version at http://www.gromacs.org.
+  *
+  * To help us fund GROMACS development, we humbly ask that you cite
+  * the research papers on the package. Check out http://www.gromacs.org.
+  */
+ /*
+  * Note: this file was generated by the GROMACS sparc64_hpc_ace_double kernel generator.
+  */
+ #ifdef HAVE_CONFIG_H
+ #include <config.h>
+ #endif
+ #include <math.h>
+ #include "../nb_kernel.h"
+ #include "types/simple.h"
+ #include "vec.h"
+ #include "nrnb.h"
+ #include "kernelutil_sparc64_hpc_ace_double.h"
+ /*
+  * Gromacs nonbonded kernel:   nb_kernel_ElecRFCut_VdwLJSh_GeomW3P1_VF_sparc64_hpc_ace_double
+  * Electrostatics interaction: ReactionField
+  * VdW interaction:            LennardJones
+  * Geometry:                   Water3-Particle
+  * Calculate force/pot:        PotentialAndForce
+  */
+ void
+ nb_kernel_ElecRFCut_VdwLJSh_GeomW3P1_VF_sparc64_hpc_ace_double
+                     (t_nblist * gmx_restrict                nlist,
+                      rvec * gmx_restrict                    xx,
+                      rvec * gmx_restrict                    ff,
+                      t_forcerec * gmx_restrict              fr,
+                      t_mdatoms * gmx_restrict               mdatoms,
+                      nb_kernel_data_t * gmx_restrict        kernel_data,
+                      t_nrnb * gmx_restrict                  nrnb)
+ {
+     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+      * just 0 for non-waters.
+      * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+      * jnr indices corresponding to data put in the four positions in the SIMD register.
+      */
+     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+     int              jnrA,jnrB;
+     int              j_coord_offsetA,j_coord_offsetB;
+     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+     real             rcutoff_scalar;
+     real             *shiftvec,*fshift,*x,*f;
+     _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+     int              vdwioffset0;
+     _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+     int              vdwioffset1;
+     _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+     int              vdwioffset2;
+     _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+     int              vdwjidx0A,vdwjidx0B;
+     _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+     _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+     _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+     _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+     _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+     real             *charge;
+     int              nvdwtype;
+     _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+     int              *vdwtype;
+     real             *vdwparam;
+     _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+     _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+     _fjsp_v2r8       itab_tmp;
+     _fjsp_v2r8       dummy_mask,cutoff_mask;
+     _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+     _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+     union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+     x                = xx[0];
+     f                = ff[0];
+     nri              = nlist->nri;
+     iinr             = nlist->iinr;
+     jindex           = nlist->jindex;
+     jjnr             = nlist->jjnr;
+     shiftidx         = nlist->shift;
+     gid              = nlist->gid;
+     shiftvec         = fr->shift_vec[0];
+     fshift           = fr->fshift[0];
+     facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+     charge           = mdatoms->chargeA;
+     krf              = gmx_fjsp_set1_v2r8(fr->ic->k_rf);
+     krf2             = gmx_fjsp_set1_v2r8(fr->ic->k_rf*2.0);
+     crf              = gmx_fjsp_set1_v2r8(fr->ic->c_rf);
+     nvdwtype         = fr->ntype;
+     vdwparam         = fr->nbfp;
+     vdwtype          = mdatoms->typeA;
+     /* Setup water-specific parameters */
+     inr              = nlist->iinr[0];
+     iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+0]));
+     iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+     iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+     vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+     /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+     rcutoff_scalar   = fr->rcoulomb;
+     rcutoff          = gmx_fjsp_set1_v2r8(rcutoff_scalar);
+     rcutoff2         = _fjsp_mul_v2r8(rcutoff,rcutoff);
+     sh_vdw_invrcut6  = gmx_fjsp_set1_v2r8(fr->ic->sh_invrc6);
+     rvdw             = gmx_fjsp_set1_v2r8(fr->rvdw);
+     /* Avoid stupid compiler warnings */
+     jnrA = jnrB = 0;
+     j_coord_offsetA = 0;
+     j_coord_offsetB = 0;
+     outeriter        = 0;
+     inneriter        = 0;
+     /* Start outer loop over neighborlists */
+     for(iidx=0; iidx<nri; iidx++)
+     {
+         /* Load shift vector for this list */
+         i_shift_offset   = DIM*shiftidx[iidx];
+         /* Load limits for loop over neighbors */
+         j_index_start    = jindex[iidx];
+         j_index_end      = jindex[iidx+1];
+         /* Get outer coordinate index */
+         inr              = iinr[iidx];
+         i_coord_offset   = DIM*inr;
+         /* Load i particle coords and add shift vector */
+         gmx_fjsp_load_shift_and_3rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                  &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+         fix0             = _fjsp_setzero_v2r8();
+         fiy0             = _fjsp_setzero_v2r8();
+         fiz0             = _fjsp_setzero_v2r8();
+         fix1             = _fjsp_setzero_v2r8();
+         fiy1             = _fjsp_setzero_v2r8();
+         fiz1             = _fjsp_setzero_v2r8();
+         fix2             = _fjsp_setzero_v2r8();
+         fiy2             = _fjsp_setzero_v2r8();
+         fiz2             = _fjsp_setzero_v2r8();
+         /* Reset potential sums */
+         velecsum         = _fjsp_setzero_v2r8();
+         vvdwsum          = _fjsp_setzero_v2r8();
+         /* Start inner kernel loop */
+         for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+         {
+             /* Get j neighbor index, and coordinate index */
+             jnrA             = jjnr[jidx];
+             jnrB             = jjnr[jidx+1];
+             j_coord_offsetA  = DIM*jnrA;
+             j_coord_offsetB  = DIM*jnrB;
+             /* load j atom coordinates */
+             gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                               &jx0,&jy0,&jz0);
+             /* Calculate displacement vector */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             dx10             = _fjsp_sub_v2r8(ix1,jx0);
+             dy10             = _fjsp_sub_v2r8(iy1,jy0);
+             dz10             = _fjsp_sub_v2r8(iz1,jz0);
+             dx20             = _fjsp_sub_v2r8(ix2,jx0);
+             dy20             = _fjsp_sub_v2r8(iy2,jy0);
+             dz20             = _fjsp_sub_v2r8(iz2,jz0);
+             /* Calculate squared distance and things based on it */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+             rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+             rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+             rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+             rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+             rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+             rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+             rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+             /* Load parameters for j particles */
+             jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+             vdwjidx0A        = 2*vdwtype[jnrA+0];
+             vdwjidx0B        = 2*vdwtype[jnrB+0];
+             fjx0             = _fjsp_setzero_v2r8();
+             fjy0             = _fjsp_setzero_v2r8();
+             fjz0             = _fjsp_setzero_v2r8();
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+             {
+             /* Compute parameters for interactions between i and j atoms */
+             qq00             = _fjsp_mul_v2r8(iq0,jq0);
+             gmx_fjsp_load_2pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,
+                                          vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
+             /* REACTION-FIELD ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq00,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq00,rinv00),crf));
+             felec            = _fjsp_mul_v2r8(qq00,_fjsp_msub_v2r8(rinv00,rinvsq00,krf2));
+             /* LENNARD-JONES DISPERSION/REPULSION */
+             rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+             vvdw6            = _fjsp_mul_v2r8(c6_00,rinvsix);
+             vvdw12           = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
+             vvdw             = _fjsp_msub_v2r8(_fjsp_nmsub_v2r8(c12_00,_fjsp_mul_v2r8(sh_vdw_invrcut6,sh_vdw_invrcut6),vvdw12),one_twelfth,
+                                            _fjsp_mul_v2r8(_fjsp_nmsub_v2r8( c6_00,sh_vdw_invrcut6,vvdw6),one_sixth));
+             fvdw             = _fjsp_mul_v2r8(_fjsp_sub_v2r8(vvdw12,vvdw6),rinvsq00);
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             vvdw             = _fjsp_and_v2r8(vvdw,cutoff_mask);
+             vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+             fscal            = _fjsp_add_v2r8(felec,fvdw);
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             
+             fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq10,rcutoff2))
+             {
+             /* Compute parameters for interactions between i and j atoms */
+             qq10             = _fjsp_mul_v2r8(iq1,jq0);
+             /* REACTION-FIELD ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq10,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq10,rinv10),crf));
+             felec            = _fjsp_mul_v2r8(qq10,_fjsp_msub_v2r8(rinv10,rinvsq10,krf2));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq10,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+             
+             fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq20,rcutoff2))
+             {
+             /* Compute parameters for interactions between i and j atoms */
+             qq20             = _fjsp_mul_v2r8(iq2,jq0);
+             /* REACTION-FIELD ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq20,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq20,rinv20),crf));
+             felec            = _fjsp_mul_v2r8(qq20,_fjsp_msub_v2r8(rinv20,rinvsq20,krf2));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq20,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+             
+             fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+             }
+             gmx_fjsp_decrement_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0);
+             /* Inner loop uses 138 flops */
+         }
+         if(jidx<j_index_end)
+         {
+             jnrA             = jjnr[jidx];
+             j_coord_offsetA  = DIM*jnrA;
+             /* load j atom coordinates */
+             gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                               &jx0,&jy0,&jz0);
+             /* Calculate displacement vector */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             dx10             = _fjsp_sub_v2r8(ix1,jx0);
+             dy10             = _fjsp_sub_v2r8(iy1,jy0);
+             dz10             = _fjsp_sub_v2r8(iz1,jz0);
+             dx20             = _fjsp_sub_v2r8(ix2,jx0);
+             dy20             = _fjsp_sub_v2r8(iy2,jy0);
+             dz20             = _fjsp_sub_v2r8(iz2,jz0);
+             /* Calculate squared distance and things based on it */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+             rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+             rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+             rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+             rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+             rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+             rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+             rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+             /* Load parameters for j particles */
+             jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
+             vdwjidx0A        = 2*vdwtype[jnrA+0];
+             fjx0             = _fjsp_setzero_v2r8();
+             fjy0             = _fjsp_setzero_v2r8();
+             fjz0             = _fjsp_setzero_v2r8();
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+             {
+             /* Compute parameters for interactions between i and j atoms */
+             qq00             = _fjsp_mul_v2r8(iq0,jq0);
+             gmx_fjsp_load_1pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
+             /* REACTION-FIELD ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq00,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq00,rinv00),crf));
+             felec            = _fjsp_mul_v2r8(qq00,_fjsp_msub_v2r8(rinv00,rinvsq00,krf2));
+             /* LENNARD-JONES DISPERSION/REPULSION */
+             rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+             vvdw6            = _fjsp_mul_v2r8(c6_00,rinvsix);
+             vvdw12           = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
+             vvdw             = _fjsp_msub_v2r8(_fjsp_nmsub_v2r8(c12_00,_fjsp_mul_v2r8(sh_vdw_invrcut6,sh_vdw_invrcut6),vvdw12),one_twelfth,
+                                            _fjsp_mul_v2r8(_fjsp_nmsub_v2r8( c6_00,sh_vdw_invrcut6,vvdw6),one_sixth));
+             fvdw             = _fjsp_mul_v2r8(_fjsp_sub_v2r8(vvdw12,vvdw6),rinvsq00);
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             vvdw             = _fjsp_and_v2r8(vvdw,cutoff_mask);
+             vvdw             = _fjsp_unpacklo_v2r8(vvdw,_fjsp_setzero_v2r8());
+             vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+             fscal            = _fjsp_add_v2r8(felec,fvdw);
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             
+             fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq10,rcutoff2))
+             {
+             /* Compute parameters for interactions between i and j atoms */
+             qq10             = _fjsp_mul_v2r8(iq1,jq0);
+             /* REACTION-FIELD ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq10,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq10,rinv10),crf));
+             felec            = _fjsp_mul_v2r8(qq10,_fjsp_msub_v2r8(rinv10,rinvsq10,krf2));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq10,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+             
+             fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq20,rcutoff2))
+             {
+             /* Compute parameters for interactions between i and j atoms */
+             qq20             = _fjsp_mul_v2r8(iq2,jq0);
+             /* REACTION-FIELD ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq20,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq20,rinv20),crf));
+             felec            = _fjsp_mul_v2r8(qq20,_fjsp_msub_v2r8(rinv20,rinvsq20,krf2));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq20,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+             
+             fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+             }
+             gmx_fjsp_decrement_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0);
+             /* Inner loop uses 138 flops */
+         }
+         /* End of innermost loop */
+         gmx_fjsp_update_iforce_3atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                               f+i_coord_offset,fshift+i_shift_offset);
+         ggid                        = gid[iidx];
+         /* Update potential energies */
+         gmx_fjsp_update_1pot_v2r8(velecsum,kernel_data->energygrp_elec+ggid);
+         gmx_fjsp_update_1pot_v2r8(vvdwsum,kernel_data->energygrp_vdw+ggid);
+         /* Increment number of inner iterations */
+         inneriter                  += j_index_end - j_index_start;
+         /* Outer loop uses 20 flops */
+     }
+     /* Increment number of outer iterations */
+     outeriter        += nri;
+     /* Update outer/inner flops */
+     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3_VF,outeriter*20 + inneriter*138);
+ }
+ /*
+  * Gromacs nonbonded kernel:   nb_kernel_ElecRFCut_VdwLJSh_GeomW3P1_F_sparc64_hpc_ace_double
+  * Electrostatics interaction: ReactionField
+  * VdW interaction:            LennardJones
+  * Geometry:                   Water3-Particle
+  * Calculate force/pot:        Force
+  *
+  * Force-only kernel: no potential energies are accumulated or written here.
+  *
+  * For each of the nlist->nri list entries the three i atoms starting at
+  * iinr[iidx] are loaded (with the shift vector applied), and the j entries
+  * jjnr[jindex[iidx]] .. jjnr[jindex[iidx+1]-1] are processed two at a time
+  * (suffixes A and B); a trailing odd j entry is handled separately after
+  * the paired loop. i atom 0 interacts with the j atom through reaction-field
+  * electrostatics plus Lennard-Jones; i atoms 1 and 2 use reaction-field
+  * electrostatics only. Each pair term is masked by rsq < rcutoff^2, with
+  * rcutoff taken from fr->rcoulomb.
+  *
+  * Arguments as used by this body:
+  *   nlist       - read: nri, iinr, jindex, jjnr, shift, gid.
+  *   xx          - coordinates; accessed as a flat real array through xx[0].
+  *   ff          - forces; accessed as a flat real array through ff[0] and
+  *                 passed to the j-force and i-force update helpers.
+  *   fr          - read: shift_vec, fshift, epsfac, ic->k_rf, ic->c_rf,
+  *                 ic->sh_invrc6, ntype, nbfp, rcoulomb, rvdw.
+  *   mdatoms     - read: chargeA, typeA.
+  *   kernel_data - not referenced in this force-only kernel.
+  *   nrnb        - flop accounting; updated once at the end through inc_nrnb().
+  *
+  * NOTE(review): this is generated code; many declared locals (e.g. tx, ty,
+  * tz, jidxall, isai*, vdwioffset1/2, r00/r10/r20, velec, velecsum, vvdw*,
+  * one_sixth, one_twelfth, itab_tmp, dummy_mask, one, two, vfconv, gbconv,
+  * ewconv, ggid) are never read in this function body.
+  */
+ void
+ nb_kernel_ElecRFCut_VdwLJSh_GeomW3P1_F_sparc64_hpc_ace_double
+                     (t_nblist * gmx_restrict                nlist,
+                      rvec * gmx_restrict                    xx,
+                      rvec * gmx_restrict                    ff,
+                      t_forcerec * gmx_restrict              fr,
+                      t_mdatoms * gmx_restrict               mdatoms,
+                      nb_kernel_data_t * gmx_restrict        kernel_data,
+                      t_nrnb * gmx_restrict                  nrnb)
+ {
+     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+      * just 0 for non-waters.
+      * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+      * jnr indices corresponding to data put in the two positions in the SIMD register.
+      */
+     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+     int              jnrA,jnrB;
+     int              j_coord_offsetA,j_coord_offsetB;
+     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+     real             rcutoff_scalar;
+     real             *shiftvec,*fshift,*x,*f;
+     _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+     int              vdwioffset0;
+     _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+     int              vdwioffset1;
+     _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+     int              vdwioffset2;
+     _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+     int              vdwjidx0A,vdwjidx0B;
+     _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+     _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+     _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+     _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+     _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+     real             *charge;
+     int              nvdwtype;
+     _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+     int              *vdwtype;
+     real             *vdwparam;
+     _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+     _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+     _fjsp_v2r8       itab_tmp;
+     _fjsp_v2r8       dummy_mask,cutoff_mask;
+     _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+     _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+     union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+     x                = xx[0]; /* flat view of the coordinate array */
+     f                = ff[0]; /* flat view of the force array */
+     nri              = nlist->nri;
+     iinr             = nlist->iinr;
+     jindex           = nlist->jindex;
+     jjnr             = nlist->jjnr;
+     shiftidx         = nlist->shift;
+     gid              = nlist->gid; /* stored but not read further in this force-only kernel */
+     shiftvec         = fr->shift_vec[0];
+     fshift           = fr->fshift[0];
+     facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+     charge           = mdatoms->chargeA;
+     krf              = gmx_fjsp_set1_v2r8(fr->ic->k_rf);
+     krf2             = gmx_fjsp_set1_v2r8(fr->ic->k_rf*2.0);
+     crf              = gmx_fjsp_set1_v2r8(fr->ic->c_rf);
+     nvdwtype         = fr->ntype;
+     vdwparam         = fr->nbfp;
+     vdwtype          = mdatoms->typeA;
+     /* Setup water-specific parameters.
+      * The i charges (pre-multiplied by facel) and the VdW row offset are read
+      * once, from the first i entry iinr[0], and reused for every i entry in the
+      * list. Only i atom 0 gets a VdW offset; atoms 1 and 2 have no VdW term here.
+      */
+     inr              = nlist->iinr[0];
+     iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+0]));
+     iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+     iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+     vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+     /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+     rcutoff_scalar   = fr->rcoulomb;
+     rcutoff          = gmx_fjsp_set1_v2r8(rcutoff_scalar);
+     rcutoff2         = _fjsp_mul_v2r8(rcutoff,rcutoff);
+     sh_vdw_invrcut6  = gmx_fjsp_set1_v2r8(fr->ic->sh_invrc6); /* assigned but not read in this force-only kernel */
+     rvdw             = gmx_fjsp_set1_v2r8(fr->rvdw); /* assigned but not read in this function */
+     /* Initialise the j indices/offsets up front to avoid compiler warnings */
+     jnrA = jnrB = 0;
+     j_coord_offsetA = 0;
+     j_coord_offsetB = 0;
+     outeriter        = 0;
+     inneriter        = 0;
+     /* Start outer loop over neighborlists */
+     for(iidx=0; iidx<nri; iidx++)
+     {
+         /* Load shift vector for this list */
+         i_shift_offset   = DIM*shiftidx[iidx];
+         /* Load limits for loop over neighbors */
+         j_index_start    = jindex[iidx];
+         j_index_end      = jindex[iidx+1];
+         /* Get outer coordinate index */
+         inr              = iinr[iidx];
+         i_coord_offset   = DIM*inr;
+         /* Load i particle coords and add shift vector */
+         gmx_fjsp_load_shift_and_3rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                  &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+         fix0             = _fjsp_setzero_v2r8(); /* i-force accumulators start at zero for each list entry */
+         fiy0             = _fjsp_setzero_v2r8();
+         fiz0             = _fjsp_setzero_v2r8();
+         fix1             = _fjsp_setzero_v2r8();
+         fiy1             = _fjsp_setzero_v2r8();
+         fiz1             = _fjsp_setzero_v2r8();
+         fix2             = _fjsp_setzero_v2r8();
+         fiy2             = _fjsp_setzero_v2r8();
+         fiz2             = _fjsp_setzero_v2r8();
+         /* Start inner kernel loop: two j atoms (A and B) per iteration.
+          * The loop stops while at least two j entries remain, so an odd
+          * trailing j entry is left for the single-j block after the loop.
+          */
+         for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+         {
+             /* Get j neighbor index, and coordinate index */
+             jnrA             = jjnr[jidx];
+             jnrB             = jjnr[jidx+1];
+             j_coord_offsetA  = DIM*jnrA;
+             j_coord_offsetB  = DIM*jnrB;
+             /* load j atom coordinates */
+             gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                               &jx0,&jy0,&jz0);
+             /* Calculate displacement vector */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             dx10             = _fjsp_sub_v2r8(ix1,jx0);
+             dy10             = _fjsp_sub_v2r8(iy1,jy0);
+             dz10             = _fjsp_sub_v2r8(iz1,jz0);
+             dx20             = _fjsp_sub_v2r8(ix2,jx0);
+             dy20             = _fjsp_sub_v2r8(iy2,jy0);
+             dz20             = _fjsp_sub_v2r8(iz2,jz0);
+             /* Calculate squared distance and things based on it */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+             rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+             rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+             rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+             rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+             rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+             rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+             rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+             /* Load parameters for j particles */
+             jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+             vdwjidx0A        = 2*vdwtype[jnrA+0];
+             vdwjidx0B        = 2*vdwtype[jnrB+0];
+             fjx0             = _fjsp_setzero_v2r8(); /* j-force accumulators, summed over the three i atoms */
+             fjy0             = _fjsp_setzero_v2r8();
+             fjz0             = _fjsp_setzero_v2r8();
+             /**************************
+              * CALCULATE INTERACTIONS *
+              * i0-j0: RF elec + LJ    *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+             {
+             /* Compute parameters for interactions between i and j atoms */
+             qq00             = _fjsp_mul_v2r8(iq0,jq0);
+             gmx_fjsp_load_2pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,
+                                          vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
+             /* REACTION-FIELD ELECTROSTATICS
+              * Force scalar only; presumably qq*(rinv*rinvsq - 2*k_rf), assuming
+              * _fjsp_msub_v2r8(a,b,c) computes a*b-c - TODO confirm against the intrinsic docs.
+              */
+             felec            = _fjsp_mul_v2r8(qq00,_fjsp_msub_v2r8(rinv00,rinvsq00,krf2));
+             /* LENNARD-JONES DISPERSION/REPULSION
+              * Force scalar only; presumably (c12*rinvsix - c6)*rinvsix*rinvsq under the
+              * same msub assumption.
+              */
+             rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+             fvdw             = _fjsp_mul_v2r8(_fjsp_msub_v2r8(c12_00,rinvsix,c6_00),_fjsp_mul_v2r8(rinvsix,rinvsq00));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+             fscal            = _fjsp_add_v2r8(felec,fvdw);
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask); /* mask the force scalar per element with rsq < rcutoff2 */
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             
+             fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              * i1-j0: RF elec only    *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq10,rcutoff2))
+             {
+             /* Compute parameters for interactions between i and j atoms */
+             qq10             = _fjsp_mul_v2r8(iq1,jq0);
+             /* REACTION-FIELD ELECTROSTATICS */
+             felec            = _fjsp_mul_v2r8(qq10,_fjsp_msub_v2r8(rinv10,rinvsq10,krf2));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq10,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+             
+             fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              * i2-j0: RF elec only    *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq20,rcutoff2))
+             {
+             /* Compute parameters for interactions between i and j atoms */
+             qq20             = _fjsp_mul_v2r8(iq2,jq0);
+             /* REACTION-FIELD ELECTROSTATICS */
+             felec            = _fjsp_mul_v2r8(qq20,_fjsp_msub_v2r8(rinv20,rinvsq20,krf2));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq20,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+             
+             fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+             }
+             gmx_fjsp_decrement_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0); /* presumably subtracts fj from the force entries of j atoms A and B - confirm in the kernelutil header */
+             /* Inner loop uses 109 flops */
+         }
+         if(jidx<j_index_end) /* one j entry left over when the j count is odd */
+         {
+             jnrA             = jjnr[jidx];
+             j_coord_offsetA  = DIM*jnrA;
+             /* load j atom coordinates */
+             gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                               &jx0,&jy0,&jz0);
+             /* Calculate displacement vector */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             dx10             = _fjsp_sub_v2r8(ix1,jx0);
+             dy10             = _fjsp_sub_v2r8(iy1,jy0);
+             dz10             = _fjsp_sub_v2r8(iz1,jz0);
+             dx20             = _fjsp_sub_v2r8(ix2,jx0);
+             dy20             = _fjsp_sub_v2r8(iy2,jy0);
+             dz20             = _fjsp_sub_v2r8(iz2,jz0);
+             /* Calculate squared distance and things based on it */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+             rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+             rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+             rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+             rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+             rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+             rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+             rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+             /* Load parameters for the single j particle.
+              * The charge is loaded with a zero vector as the other operand; presumably
+              * this fills the low element and leaves the other element zero - TODO confirm.
+              */
+             jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
+             vdwjidx0A        = 2*vdwtype[jnrA+0];
+             fjx0             = _fjsp_setzero_v2r8();
+             fjy0             = _fjsp_setzero_v2r8();
+             fjz0             = _fjsp_setzero_v2r8();
+             /**************************
+              * CALCULATE INTERACTIONS *
+              * i0-j0: RF elec + LJ    *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+             {
+             /* Compute parameters for interactions between i and j atoms */
+             qq00             = _fjsp_mul_v2r8(iq0,jq0);
+             gmx_fjsp_load_1pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
+             /* REACTION-FIELD ELECTROSTATICS */
+             felec            = _fjsp_mul_v2r8(qq00,_fjsp_msub_v2r8(rinv00,rinvsq00,krf2));
+             /* LENNARD-JONES DISPERSION/REPULSION */
+             rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+             fvdw             = _fjsp_mul_v2r8(_fjsp_msub_v2r8(c12_00,rinvsix,c6_00),_fjsp_mul_v2r8(rinvsix,rinvsq00));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+             fscal            = _fjsp_add_v2r8(felec,fvdw);
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8()); /* presumably keeps the low element and zeroes the unused one, so the empty lane adds no force - TODO confirm */
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             
+             fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              * i1-j0: RF elec only    *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq10,rcutoff2))
+             {
+             /* Compute parameters for interactions between i and j atoms */
+             qq10             = _fjsp_mul_v2r8(iq1,jq0);
+             /* REACTION-FIELD ELECTROSTATICS */
+             felec            = _fjsp_mul_v2r8(qq10,_fjsp_msub_v2r8(rinv10,rinvsq10,krf2));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq10,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8()); /* same unused-lane clearing as for the i0-j0 term */
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+             
+             fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              * i2-j0: RF elec only    *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq20,rcutoff2))
+             {
+             /* Compute parameters for interactions between i and j atoms */
+             qq20             = _fjsp_mul_v2r8(iq2,jq0);
+             /* REACTION-FIELD ELECTROSTATICS */
+             felec            = _fjsp_mul_v2r8(qq20,_fjsp_msub_v2r8(rinv20,rinvsq20,krf2));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq20,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8()); /* same unused-lane clearing as for the i0-j0 term */
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+             
+             fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+             }
+             gmx_fjsp_decrement_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0); /* single-pointer variant: only j atom A's force entry is passed */
+             /* Inner loop uses 109 flops */
+         }
+         /* End of innermost loop.
+          * The nine i-force accumulators are handed to the helper together with the
+          * i atom's force entry and the shift-force entry for this list; presumably
+          * it reduces the SIMD lanes and adds the result to both - TODO confirm.
+          */
+         gmx_fjsp_update_iforce_3atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                               f+i_coord_offset,fshift+i_shift_offset);
+         /* Increment number of inner iterations */
+         inneriter                  += j_index_end - j_index_start;
+         /* Outer loop uses 18 flops */
+     }
+     /* Increment number of outer iterations */
+     outeriter        += nri;
+     /* Update outer/inner flops: 18 per outer iteration, 109 per j entry */
+     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3_F,outeriter*18 + inneriter*109);
+ }
index 0000000000000000000000000000000000000000,d3b965578b7f280f1cae18c678ceef0a9e525d0d..d3b965578b7f280f1cae18c678ceef0a9e525d0d
mode 000000,100644..100644
--- /dev/null
@@@ -1,0 -1,1887 +1,1887 @@@
+ /*
+  * This file is part of the GROMACS molecular simulation package.
+  *
+  * Copyright (c) 2012, by the GROMACS development team, led by
+  * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+  * others, as listed in the AUTHORS file in the top-level source
+  * directory and at http://www.gromacs.org.
+  *
+  * GROMACS is free software; you can redistribute it and/or
+  * modify it under the terms of the GNU Lesser General Public License
+  * as published by the Free Software Foundation; either version 2.1
+  * of the License, or (at your option) any later version.
+  *
+  * GROMACS is distributed in the hope that it will be useful,
+  * but WITHOUT ANY WARRANTY; without even the implied warranty of
+  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  * Lesser General Public License for more details.
+  *
+  * You should have received a copy of the GNU Lesser General Public
+  * License along with GROMACS; if not, see
+  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+  *
+  * If you want to redistribute modifications to GROMACS, please
+  * consider that scientific software is very special. Version
+  * control is crucial - bugs must be traceable. We will be happy to
+  * consider code for inclusion in the official distribution, but
+  * derived work must not be called official GROMACS. Details are found
+  * in the README & COPYING files - if they are missing, get the
+  * official version at http://www.gromacs.org.
+  *
+  * To help us fund GROMACS development, we humbly ask that you cite
+  * the research papers on the package. Check out http://www.gromacs.org.
+  */
+ /*
+  * Note: this file was generated by the GROMACS sparc64_hpc_ace_double kernel generator.
+  */
+ #ifdef HAVE_CONFIG_H
+ #include <config.h>
+ #endif
+ #include <math.h>
+ #include "../nb_kernel.h"
+ #include "types/simple.h"
+ #include "vec.h"
+ #include "nrnb.h"
+ #include "kernelutil_sparc64_hpc_ace_double.h"
+ /*
+  * Gromacs nonbonded kernel:   nb_kernel_ElecRFCut_VdwLJSh_GeomW3W3_VF_sparc64_hpc_ace_double
+  * Electrostatics interaction: ReactionField (masked at the cutoff)
+  * VdW interaction:            LennardJones (potential shifted by a constant, masked at the cutoff)
+  * Geometry:                   Water3-Water3
+  * Calculate force/pot:        PotentialAndForce
+  */
+ void
+ nb_kernel_ElecRFCut_VdwLJSh_GeomW3W3_VF_sparc64_hpc_ace_double
+                     (t_nblist * gmx_restrict                nlist,
+                      rvec * gmx_restrict                    xx,
+                      rvec * gmx_restrict                    ff,
+                      t_forcerec * gmx_restrict              fr,
+                      t_mdatoms * gmx_restrict               mdatoms,
+                      nb_kernel_data_t * gmx_restrict        kernel_data,
+                      t_nrnb * gmx_restrict                  nrnb)
+ {
+     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+      * just 0 for non-waters.
+      * Suffixes A,B refer to j loop unrolling done with double precision SIMD, i.e. the two different
+      * jnr indices corresponding to data put in the two positions in the SIMD register.
+      */
+     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+     int              jnrA,jnrB;
+     int              j_coord_offsetA,j_coord_offsetB;
+     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+     real             rcutoff_scalar;
+     real             *shiftvec,*fshift,*x,*f;
+     _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+     int              vdwioffset0;
+     _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+     int              vdwioffset1;
+     _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+     int              vdwioffset2;
+     _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+     int              vdwjidx0A,vdwjidx0B;
+     _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+     int              vdwjidx1A,vdwjidx1B;
+     _fjsp_v2r8       jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+     int              vdwjidx2A,vdwjidx2B;
+     _fjsp_v2r8       jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+     _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+     _fjsp_v2r8       dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
+     _fjsp_v2r8       dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
+     _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+     _fjsp_v2r8       dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+     _fjsp_v2r8       dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+     _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+     _fjsp_v2r8       dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+     _fjsp_v2r8       dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+     _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+     real             *charge;
+     int              nvdwtype;
+     _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+     int              *vdwtype;
+     real             *vdwparam;
+     _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+     _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+     _fjsp_v2r8       itab_tmp;
+     _fjsp_v2r8       dummy_mask,cutoff_mask;
+     _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+     _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+     union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+     x                = xx[0];
+     f                = ff[0];
+     nri              = nlist->nri;
+     iinr             = nlist->iinr;
+     jindex           = nlist->jindex;
+     jjnr             = nlist->jjnr;
+     shiftidx         = nlist->shift;
+     gid              = nlist->gid;
+     shiftvec         = fr->shift_vec[0];
+     fshift           = fr->fshift[0];
+     facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+     charge           = mdatoms->chargeA;
+     krf              = gmx_fjsp_set1_v2r8(fr->ic->k_rf);
+     krf2             = gmx_fjsp_set1_v2r8(fr->ic->k_rf*2.0);
+     crf              = gmx_fjsp_set1_v2r8(fr->ic->c_rf);
+     nvdwtype         = fr->ntype;
+     vdwparam         = fr->nbfp;
+     vdwtype          = mdatoms->typeA;
+     /* Setup water-specific parameters */
+     inr              = nlist->iinr[0];
+     iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+0]));
+     iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+     iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+     vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+     jq0              = gmx_fjsp_set1_v2r8(charge[inr+0]);
+     jq1              = gmx_fjsp_set1_v2r8(charge[inr+1]);
+     jq2              = gmx_fjsp_set1_v2r8(charge[inr+2]);
+     vdwjidx0A        = 2*vdwtype[inr+0];
+     qq00             = _fjsp_mul_v2r8(iq0,jq0);
+     c6_00            = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset0+vdwjidx0A]);
+     c12_00           = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset0+vdwjidx0A+1]);
+     qq01             = _fjsp_mul_v2r8(iq0,jq1);
+     qq02             = _fjsp_mul_v2r8(iq0,jq2);
+     qq10             = _fjsp_mul_v2r8(iq1,jq0);
+     qq11             = _fjsp_mul_v2r8(iq1,jq1);
+     qq12             = _fjsp_mul_v2r8(iq1,jq2);
+     qq20             = _fjsp_mul_v2r8(iq2,jq0);
+     qq21             = _fjsp_mul_v2r8(iq2,jq1);
+     qq22             = _fjsp_mul_v2r8(iq2,jq2);
+     /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+     rcutoff_scalar   = fr->rcoulomb;
+     rcutoff          = gmx_fjsp_set1_v2r8(rcutoff_scalar);
+     rcutoff2         = _fjsp_mul_v2r8(rcutoff,rcutoff);
+     sh_vdw_invrcut6  = gmx_fjsp_set1_v2r8(fr->ic->sh_invrc6);
+     rvdw             = gmx_fjsp_set1_v2r8(fr->rvdw);
+     /* Avoid stupid compiler warnings */
+     jnrA = jnrB = 0;
+     j_coord_offsetA = 0;
+     j_coord_offsetB = 0;
+     outeriter        = 0;
+     inneriter        = 0;
+     /* Start outer loop over neighborlists */
+     for(iidx=0; iidx<nri; iidx++)
+     {
+         /* Load shift vector for this list */
+         i_shift_offset   = DIM*shiftidx[iidx];
+         /* Load limits for loop over neighbors */
+         j_index_start    = jindex[iidx];
+         j_index_end      = jindex[iidx+1];
+         /* Get outer coordinate index */
+         inr              = iinr[iidx];
+         i_coord_offset   = DIM*inr;
+         /* Load i particle coords and add shift vector */
+         gmx_fjsp_load_shift_and_3rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                  &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+         fix0             = _fjsp_setzero_v2r8();
+         fiy0             = _fjsp_setzero_v2r8();
+         fiz0             = _fjsp_setzero_v2r8();
+         fix1             = _fjsp_setzero_v2r8();
+         fiy1             = _fjsp_setzero_v2r8();
+         fiz1             = _fjsp_setzero_v2r8();
+         fix2             = _fjsp_setzero_v2r8();
+         fiy2             = _fjsp_setzero_v2r8();
+         fiz2             = _fjsp_setzero_v2r8();
+         /* Reset potential sums */
+         velecsum         = _fjsp_setzero_v2r8();
+         vvdwsum          = _fjsp_setzero_v2r8();
+         /* Start inner kernel loop */
+         for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+         {
+             /* Get j neighbor index, and coordinate index */
+             jnrA             = jjnr[jidx];
+             jnrB             = jjnr[jidx+1];
+             j_coord_offsetA  = DIM*jnrA;
+             j_coord_offsetB  = DIM*jnrB;
+             /* load j atom coordinates */
+             gmx_fjsp_load_3rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+             /* Calculate displacement vector */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             dx01             = _fjsp_sub_v2r8(ix0,jx1);
+             dy01             = _fjsp_sub_v2r8(iy0,jy1);
+             dz01             = _fjsp_sub_v2r8(iz0,jz1);
+             dx02             = _fjsp_sub_v2r8(ix0,jx2);
+             dy02             = _fjsp_sub_v2r8(iy0,jy2);
+             dz02             = _fjsp_sub_v2r8(iz0,jz2);
+             dx10             = _fjsp_sub_v2r8(ix1,jx0);
+             dy10             = _fjsp_sub_v2r8(iy1,jy0);
+             dz10             = _fjsp_sub_v2r8(iz1,jz0);
+             dx11             = _fjsp_sub_v2r8(ix1,jx1);
+             dy11             = _fjsp_sub_v2r8(iy1,jy1);
+             dz11             = _fjsp_sub_v2r8(iz1,jz1);
+             dx12             = _fjsp_sub_v2r8(ix1,jx2);
+             dy12             = _fjsp_sub_v2r8(iy1,jy2);
+             dz12             = _fjsp_sub_v2r8(iz1,jz2);
+             dx20             = _fjsp_sub_v2r8(ix2,jx0);
+             dy20             = _fjsp_sub_v2r8(iy2,jy0);
+             dz20             = _fjsp_sub_v2r8(iz2,jz0);
+             dx21             = _fjsp_sub_v2r8(ix2,jx1);
+             dy21             = _fjsp_sub_v2r8(iy2,jy1);
+             dz21             = _fjsp_sub_v2r8(iz2,jz1);
+             dx22             = _fjsp_sub_v2r8(ix2,jx2);
+             dy22             = _fjsp_sub_v2r8(iy2,jy2);
+             dz22             = _fjsp_sub_v2r8(iz2,jz2);
+             /* Calculate squared distance and things based on it */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rsq01            = gmx_fjsp_calc_rsq_v2r8(dx01,dy01,dz01);
+             rsq02            = gmx_fjsp_calc_rsq_v2r8(dx02,dy02,dz02);
+             rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+             rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+             rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+             rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+             rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+             rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+             rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+             rinv01           = gmx_fjsp_invsqrt_v2r8(rsq01);
+             rinv02           = gmx_fjsp_invsqrt_v2r8(rsq02);
+             rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+             rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+             rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+             rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+             rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+             rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+             rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+             rinvsq01         = _fjsp_mul_v2r8(rinv01,rinv01);
+             rinvsq02         = _fjsp_mul_v2r8(rinv02,rinv02);
+             rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+             rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+             rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+             rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+             rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+             rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+             fjx0             = _fjsp_setzero_v2r8();
+             fjy0             = _fjsp_setzero_v2r8();
+             fjz0             = _fjsp_setzero_v2r8();
+             fjx1             = _fjsp_setzero_v2r8();
+             fjy1             = _fjsp_setzero_v2r8();
+             fjz1             = _fjsp_setzero_v2r8();
+             fjx2             = _fjsp_setzero_v2r8();
+             fjy2             = _fjsp_setzero_v2r8();
+             fjz2             = _fjsp_setzero_v2r8();
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+             {
+             /* REACTION-FIELD ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq00,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq00,rinv00),crf));
+             felec            = _fjsp_mul_v2r8(qq00,_fjsp_msub_v2r8(rinv00,rinvsq00,krf2));
+             /* LENNARD-JONES DISPERSION/REPULSION */
+             rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+             vvdw6            = _fjsp_mul_v2r8(c6_00,rinvsix);
+             vvdw12           = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
+             vvdw             = _fjsp_msub_v2r8(_fjsp_nmsub_v2r8(c12_00,_fjsp_mul_v2r8(sh_vdw_invrcut6,sh_vdw_invrcut6),vvdw12),one_twelfth,
+                                            _fjsp_mul_v2r8(_fjsp_nmsub_v2r8( c6_00,sh_vdw_invrcut6,vvdw6),one_sixth));
+             fvdw             = _fjsp_mul_v2r8(_fjsp_sub_v2r8(vvdw12,vvdw6),rinvsq00);
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             vvdw             = _fjsp_and_v2r8(vvdw,cutoff_mask);
+             vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+             fscal            = _fjsp_add_v2r8(felec,fvdw);
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             
+             fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq01,rcutoff2))
+             {
+             /* REACTION-FIELD ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq01,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq01,rinv01),crf));
+             felec            = _fjsp_mul_v2r8(qq01,_fjsp_msub_v2r8(rinv01,rinvsq01,krf2));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq01,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx01,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy01,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz01,fscal,fiz0);
+             
+             fjx1             = _fjsp_madd_v2r8(dx01,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy01,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz01,fscal,fjz1);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq02,rcutoff2))
+             {
+             /* REACTION-FIELD ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq02,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq02,rinv02),crf));
+             felec            = _fjsp_mul_v2r8(qq02,_fjsp_msub_v2r8(rinv02,rinvsq02,krf2));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq02,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx02,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy02,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz02,fscal,fiz0);
+             
+             fjx2             = _fjsp_madd_v2r8(dx02,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy02,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz02,fscal,fjz2);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq10,rcutoff2))
+             {
+             /* REACTION-FIELD ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq10,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq10,rinv10),crf));
+             felec            = _fjsp_mul_v2r8(qq10,_fjsp_msub_v2r8(rinv10,rinvsq10,krf2));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq10,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+             
+             fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq11,rcutoff2))
+             {
+             /* REACTION-FIELD ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq11,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq11,rinv11),crf));
+             felec            = _fjsp_mul_v2r8(qq11,_fjsp_msub_v2r8(rinv11,rinvsq11,krf2));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq11,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+             
+             fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq12,rcutoff2))
+             {
+             /* REACTION-FIELD ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq12,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq12,rinv12),crf));
+             felec            = _fjsp_mul_v2r8(qq12,_fjsp_msub_v2r8(rinv12,rinvsq12,krf2));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq12,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+             
+             fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq20,rcutoff2))
+             {
+             /* REACTION-FIELD ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq20,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq20,rinv20),crf));
+             felec            = _fjsp_mul_v2r8(qq20,_fjsp_msub_v2r8(rinv20,rinvsq20,krf2));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq20,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+             
+             fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq21,rcutoff2))
+             {
+             /* REACTION-FIELD ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq21,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq21,rinv21),crf));
+             felec            = _fjsp_mul_v2r8(qq21,_fjsp_msub_v2r8(rinv21,rinvsq21,krf2));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq21,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+             
+             fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq22,rcutoff2))
+             {
+             /* REACTION-FIELD ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq22,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq22,rinv22),crf));
+             felec            = _fjsp_mul_v2r8(qq22,_fjsp_msub_v2r8(rinv22,rinvsq22,krf2));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq22,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+             
+             fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+             }
+             gmx_fjsp_decrement_3rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+             /* Inner loop uses 369 flops */
+         }
+         if(jidx<j_index_end)
+         {
+             jnrA             = jjnr[jidx];
+             j_coord_offsetA  = DIM*jnrA;
+             /* load j atom coordinates */
+             gmx_fjsp_load_3rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+             /* Calculate displacement vector */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             dx01             = _fjsp_sub_v2r8(ix0,jx1);
+             dy01             = _fjsp_sub_v2r8(iy0,jy1);
+             dz01             = _fjsp_sub_v2r8(iz0,jz1);
+             dx02             = _fjsp_sub_v2r8(ix0,jx2);
+             dy02             = _fjsp_sub_v2r8(iy0,jy2);
+             dz02             = _fjsp_sub_v2r8(iz0,jz2);
+             dx10             = _fjsp_sub_v2r8(ix1,jx0);
+             dy10             = _fjsp_sub_v2r8(iy1,jy0);
+             dz10             = _fjsp_sub_v2r8(iz1,jz0);
+             dx11             = _fjsp_sub_v2r8(ix1,jx1);
+             dy11             = _fjsp_sub_v2r8(iy1,jy1);
+             dz11             = _fjsp_sub_v2r8(iz1,jz1);
+             dx12             = _fjsp_sub_v2r8(ix1,jx2);
+             dy12             = _fjsp_sub_v2r8(iy1,jy2);
+             dz12             = _fjsp_sub_v2r8(iz1,jz2);
+             dx20             = _fjsp_sub_v2r8(ix2,jx0);
+             dy20             = _fjsp_sub_v2r8(iy2,jy0);
+             dz20             = _fjsp_sub_v2r8(iz2,jz0);
+             dx21             = _fjsp_sub_v2r8(ix2,jx1);
+             dy21             = _fjsp_sub_v2r8(iy2,jy1);
+             dz21             = _fjsp_sub_v2r8(iz2,jz1);
+             dx22             = _fjsp_sub_v2r8(ix2,jx2);
+             dy22             = _fjsp_sub_v2r8(iy2,jy2);
+             dz22             = _fjsp_sub_v2r8(iz2,jz2);
+             /* Calculate squared distance and things based on it */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rsq01            = gmx_fjsp_calc_rsq_v2r8(dx01,dy01,dz01);
+             rsq02            = gmx_fjsp_calc_rsq_v2r8(dx02,dy02,dz02);
+             rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+             rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+             rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+             rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+             rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+             rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+             rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+             rinv01           = gmx_fjsp_invsqrt_v2r8(rsq01);
+             rinv02           = gmx_fjsp_invsqrt_v2r8(rsq02);
+             rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+             rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+             rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+             rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+             rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+             rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+             rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+             rinvsq01         = _fjsp_mul_v2r8(rinv01,rinv01);
+             rinvsq02         = _fjsp_mul_v2r8(rinv02,rinv02);
+             rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+             rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+             rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+             rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+             rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+             rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+             fjx0             = _fjsp_setzero_v2r8();
+             fjy0             = _fjsp_setzero_v2r8();
+             fjz0             = _fjsp_setzero_v2r8();
+             fjx1             = _fjsp_setzero_v2r8();
+             fjy1             = _fjsp_setzero_v2r8();
+             fjz1             = _fjsp_setzero_v2r8();
+             fjx2             = _fjsp_setzero_v2r8();
+             fjy2             = _fjsp_setzero_v2r8();
+             fjz2             = _fjsp_setzero_v2r8();
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+             {
+             /* REACTION-FIELD ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq00,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq00,rinv00),crf));
+             felec            = _fjsp_mul_v2r8(qq00,_fjsp_msub_v2r8(rinv00,rinvsq00,krf2));
+             /* LENNARD-JONES DISPERSION/REPULSION */
+             rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+             vvdw6            = _fjsp_mul_v2r8(c6_00,rinvsix);
+             vvdw12           = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
+             vvdw             = _fjsp_msub_v2r8(_fjsp_nmsub_v2r8(c12_00,_fjsp_mul_v2r8(sh_vdw_invrcut6,sh_vdw_invrcut6),vvdw12),one_twelfth,
+                                            _fjsp_mul_v2r8(_fjsp_nmsub_v2r8( c6_00,sh_vdw_invrcut6,vvdw6),one_sixth));
+             fvdw             = _fjsp_mul_v2r8(_fjsp_sub_v2r8(vvdw12,vvdw6),rinvsq00);
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             vvdw             = _fjsp_and_v2r8(vvdw,cutoff_mask);
+             vvdw             = _fjsp_unpacklo_v2r8(vvdw,_fjsp_setzero_v2r8());
+             vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+             fscal            = _fjsp_add_v2r8(felec,fvdw);
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             
+             fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq01,rcutoff2))
+             {
+             /* REACTION-FIELD ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq01,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq01,rinv01),crf));
+             felec            = _fjsp_mul_v2r8(qq01,_fjsp_msub_v2r8(rinv01,rinvsq01,krf2));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq01,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx01,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy01,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz01,fscal,fiz0);
+             
+             fjx1             = _fjsp_madd_v2r8(dx01,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy01,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz01,fscal,fjz1);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq02,rcutoff2))
+             {
+             /* REACTION-FIELD ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq02,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq02,rinv02),crf));
+             felec            = _fjsp_mul_v2r8(qq02,_fjsp_msub_v2r8(rinv02,rinvsq02,krf2));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq02,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx02,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy02,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz02,fscal,fiz0);
+             
+             fjx2             = _fjsp_madd_v2r8(dx02,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy02,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz02,fscal,fjz2);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq10,rcutoff2))
+             {
+             /* REACTION-FIELD ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq10,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq10,rinv10),crf));
+             felec            = _fjsp_mul_v2r8(qq10,_fjsp_msub_v2r8(rinv10,rinvsq10,krf2));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq10,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+             
+             fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq11,rcutoff2))
+             {
+             /* REACTION-FIELD ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq11,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq11,rinv11),crf));
+             felec            = _fjsp_mul_v2r8(qq11,_fjsp_msub_v2r8(rinv11,rinvsq11,krf2));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq11,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+             
+             fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq12,rcutoff2))
+             {
+             /* REACTION-FIELD ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq12,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq12,rinv12),crf));
+             felec            = _fjsp_mul_v2r8(qq12,_fjsp_msub_v2r8(rinv12,rinvsq12,krf2));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq12,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+             
+             fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq20,rcutoff2))
+             {
+             /* REACTION-FIELD ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq20,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq20,rinv20),crf));
+             felec            = _fjsp_mul_v2r8(qq20,_fjsp_msub_v2r8(rinv20,rinvsq20,krf2));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq20,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+             
+             fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq21,rcutoff2))
+             {
+             /* REACTION-FIELD ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq21,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq21,rinv21),crf));
+             felec            = _fjsp_mul_v2r8(qq21,_fjsp_msub_v2r8(rinv21,rinvsq21,krf2));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq21,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+             
+             fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq22,rcutoff2))
+             {
+             /* REACTION-FIELD ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq22,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq22,rinv22),crf));
+             felec            = _fjsp_mul_v2r8(qq22,_fjsp_msub_v2r8(rinv22,rinvsq22,krf2));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq22,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+             
+             fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+             }
+             gmx_fjsp_decrement_3rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+             /* Inner loop uses 369 flops */
+         }
+         /* End of innermost loop */
+         gmx_fjsp_update_iforce_3atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                               f+i_coord_offset,fshift+i_shift_offset);
+         ggid                        = gid[iidx];
+         /* Update potential energies */
+         gmx_fjsp_update_1pot_v2r8(velecsum,kernel_data->energygrp_elec+ggid);
+         gmx_fjsp_update_1pot_v2r8(vvdwsum,kernel_data->energygrp_vdw+ggid);
+         /* Increment number of inner iterations */
+         inneriter                  += j_index_end - j_index_start;
+         /* Outer loop uses 20 flops */
+     }
+     /* Increment number of outer iterations */
+     outeriter        += nri;
+     /* Update outer/inner flops */
+     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_VF,outeriter*20 + inneriter*369);
+ }
+ /*
+  * Gromacs nonbonded kernel:   nb_kernel_ElecRFCut_VdwLJSh_GeomW3W3_F_sparc64_hpc_ace_double
+  * Electrostatics interaction: ReactionField
+  * VdW interaction:            LennardJones
+  * Geometry:                   Water3-Water3
+  * Calculate force/pot:        Force
+  */
+ void
+ nb_kernel_ElecRFCut_VdwLJSh_GeomW3W3_F_sparc64_hpc_ace_double
+                     (t_nblist * gmx_restrict                nlist,
+                      rvec * gmx_restrict                    xx,
+                      rvec * gmx_restrict                    ff,
+                      t_forcerec * gmx_restrict              fr,
+                      t_mdatoms * gmx_restrict               mdatoms,
+                      nb_kernel_data_t * gmx_restrict        kernel_data,
+                      t_nrnb * gmx_restrict                  nrnb)
+ {
+     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+      * just 0 for non-waters.
+      * Suffixes A,B refer to j loop unrolling done with double precision SIMD, i.e. the two different
+      * jnr indices corresponding to data put in the two positions in the SIMD register.
+      */
+     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+     int              jnrA,jnrB;
+     int              j_coord_offsetA,j_coord_offsetB;
+     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+     real             rcutoff_scalar;
+     real             *shiftvec,*fshift,*x,*f;
+     _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+     int              vdwioffset0;
+     _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+     int              vdwioffset1;
+     _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+     int              vdwioffset2;
+     _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+     int              vdwjidx0A,vdwjidx0B;
+     _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+     int              vdwjidx1A,vdwjidx1B;
+     _fjsp_v2r8       jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+     int              vdwjidx2A,vdwjidx2B;
+     _fjsp_v2r8       jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+     _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+     _fjsp_v2r8       dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
+     _fjsp_v2r8       dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
+     _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+     _fjsp_v2r8       dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+     _fjsp_v2r8       dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+     _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+     _fjsp_v2r8       dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+     _fjsp_v2r8       dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+     _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+     real             *charge;
+     int              nvdwtype;
+     _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+     int              *vdwtype;
+     real             *vdwparam;
+     _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+     _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+     _fjsp_v2r8       itab_tmp;
+     _fjsp_v2r8       dummy_mask,cutoff_mask;
+     _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+     _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+     union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+     x                = xx[0];
+     f                = ff[0];
+     nri              = nlist->nri;
+     iinr             = nlist->iinr;
+     jindex           = nlist->jindex;
+     jjnr             = nlist->jjnr;
+     shiftidx         = nlist->shift;
+     gid              = nlist->gid;
+     shiftvec         = fr->shift_vec[0];
+     fshift           = fr->fshift[0];
+     facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+     charge           = mdatoms->chargeA;
+     krf              = gmx_fjsp_set1_v2r8(fr->ic->k_rf);
+     krf2             = gmx_fjsp_set1_v2r8(fr->ic->k_rf*2.0);
+     crf              = gmx_fjsp_set1_v2r8(fr->ic->c_rf);
+     nvdwtype         = fr->ntype;
+     vdwparam         = fr->nbfp;
+     vdwtype          = mdatoms->typeA;
+     /* Setup water-specific parameters */
+     inr              = nlist->iinr[0];
+     iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+0]));
+     iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+     iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+     vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+     jq0              = gmx_fjsp_set1_v2r8(charge[inr+0]);
+     jq1              = gmx_fjsp_set1_v2r8(charge[inr+1]);
+     jq2              = gmx_fjsp_set1_v2r8(charge[inr+2]);
+     vdwjidx0A        = 2*vdwtype[inr+0];
+     qq00             = _fjsp_mul_v2r8(iq0,jq0);
+     c6_00            = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset0+vdwjidx0A]);
+     c12_00           = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset0+vdwjidx0A+1]);
+     qq01             = _fjsp_mul_v2r8(iq0,jq1);
+     qq02             = _fjsp_mul_v2r8(iq0,jq2);
+     qq10             = _fjsp_mul_v2r8(iq1,jq0);
+     qq11             = _fjsp_mul_v2r8(iq1,jq1);
+     qq12             = _fjsp_mul_v2r8(iq1,jq2);
+     qq20             = _fjsp_mul_v2r8(iq2,jq0);
+     qq21             = _fjsp_mul_v2r8(iq2,jq1);
+     qq22             = _fjsp_mul_v2r8(iq2,jq2);
+     /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+     rcutoff_scalar   = fr->rcoulomb;
+     rcutoff          = gmx_fjsp_set1_v2r8(rcutoff_scalar);
+     rcutoff2         = _fjsp_mul_v2r8(rcutoff,rcutoff);
+     sh_vdw_invrcut6  = gmx_fjsp_set1_v2r8(fr->ic->sh_invrc6);
+     rvdw             = gmx_fjsp_set1_v2r8(fr->rvdw);
+     /* Avoid stupid compiler warnings */
+     jnrA = jnrB = 0;
+     j_coord_offsetA = 0;
+     j_coord_offsetB = 0;
+     outeriter        = 0;
+     inneriter        = 0;
+     /* Start outer loop over neighborlists */
+     for(iidx=0; iidx<nri; iidx++)
+     {
+         /* Load shift vector for this list */
+         i_shift_offset   = DIM*shiftidx[iidx];
+         /* Load limits for loop over neighbors */
+         j_index_start    = jindex[iidx];
+         j_index_end      = jindex[iidx+1];
+         /* Get outer coordinate index */
+         inr              = iinr[iidx];
+         i_coord_offset   = DIM*inr;
+         /* Load i particle coords and add shift vector */
+         gmx_fjsp_load_shift_and_3rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                  &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+         fix0             = _fjsp_setzero_v2r8();
+         fiy0             = _fjsp_setzero_v2r8();
+         fiz0             = _fjsp_setzero_v2r8();
+         fix1             = _fjsp_setzero_v2r8();
+         fiy1             = _fjsp_setzero_v2r8();
+         fiz1             = _fjsp_setzero_v2r8();
+         fix2             = _fjsp_setzero_v2r8();
+         fiy2             = _fjsp_setzero_v2r8();
+         fiz2             = _fjsp_setzero_v2r8();
+         /* Start inner kernel loop */
+         for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+         {
+             /* Get j neighbor index, and coordinate index */
+             jnrA             = jjnr[jidx];
+             jnrB             = jjnr[jidx+1];
+             j_coord_offsetA  = DIM*jnrA;
+             j_coord_offsetB  = DIM*jnrB;
+             /* load j atom coordinates */
+             gmx_fjsp_load_3rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+             /* Calculate displacement vector */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             dx01             = _fjsp_sub_v2r8(ix0,jx1);
+             dy01             = _fjsp_sub_v2r8(iy0,jy1);
+             dz01             = _fjsp_sub_v2r8(iz0,jz1);
+             dx02             = _fjsp_sub_v2r8(ix0,jx2);
+             dy02             = _fjsp_sub_v2r8(iy0,jy2);
+             dz02             = _fjsp_sub_v2r8(iz0,jz2);
+             dx10             = _fjsp_sub_v2r8(ix1,jx0);
+             dy10             = _fjsp_sub_v2r8(iy1,jy0);
+             dz10             = _fjsp_sub_v2r8(iz1,jz0);
+             dx11             = _fjsp_sub_v2r8(ix1,jx1);
+             dy11             = _fjsp_sub_v2r8(iy1,jy1);
+             dz11             = _fjsp_sub_v2r8(iz1,jz1);
+             dx12             = _fjsp_sub_v2r8(ix1,jx2);
+             dy12             = _fjsp_sub_v2r8(iy1,jy2);
+             dz12             = _fjsp_sub_v2r8(iz1,jz2);
+             dx20             = _fjsp_sub_v2r8(ix2,jx0);
+             dy20             = _fjsp_sub_v2r8(iy2,jy0);
+             dz20             = _fjsp_sub_v2r8(iz2,jz0);
+             dx21             = _fjsp_sub_v2r8(ix2,jx1);
+             dy21             = _fjsp_sub_v2r8(iy2,jy1);
+             dz21             = _fjsp_sub_v2r8(iz2,jz1);
+             dx22             = _fjsp_sub_v2r8(ix2,jx2);
+             dy22             = _fjsp_sub_v2r8(iy2,jy2);
+             dz22             = _fjsp_sub_v2r8(iz2,jz2);
+             /* Calculate squared distance and things based on it */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rsq01            = gmx_fjsp_calc_rsq_v2r8(dx01,dy01,dz01);
+             rsq02            = gmx_fjsp_calc_rsq_v2r8(dx02,dy02,dz02);
+             rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+             rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+             rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+             rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+             rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+             rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+             rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+             rinv01           = gmx_fjsp_invsqrt_v2r8(rsq01);
+             rinv02           = gmx_fjsp_invsqrt_v2r8(rsq02);
+             rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+             rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+             rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+             rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+             rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+             rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+             rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+             rinvsq01         = _fjsp_mul_v2r8(rinv01,rinv01);
+             rinvsq02         = _fjsp_mul_v2r8(rinv02,rinv02);
+             rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+             rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+             rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+             rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+             rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+             rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+             fjx0             = _fjsp_setzero_v2r8();
+             fjy0             = _fjsp_setzero_v2r8();
+             fjz0             = _fjsp_setzero_v2r8();
+             fjx1             = _fjsp_setzero_v2r8();
+             fjy1             = _fjsp_setzero_v2r8();
+             fjz1             = _fjsp_setzero_v2r8();
+             fjx2             = _fjsp_setzero_v2r8();
+             fjy2             = _fjsp_setzero_v2r8();
+             fjz2             = _fjsp_setzero_v2r8();
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+             {
+             /* REACTION-FIELD ELECTROSTATICS */
+             felec            = _fjsp_mul_v2r8(qq00,_fjsp_msub_v2r8(rinv00,rinvsq00,krf2));
+             /* LENNARD-JONES DISPERSION/REPULSION */
+             rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+             fvdw             = _fjsp_mul_v2r8(_fjsp_msub_v2r8(c12_00,rinvsix,c6_00),_fjsp_mul_v2r8(rinvsix,rinvsq00));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+             fscal            = _fjsp_add_v2r8(felec,fvdw);
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             
+             fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq01,rcutoff2))
+             {
+             /* REACTION-FIELD ELECTROSTATICS */
+             felec            = _fjsp_mul_v2r8(qq01,_fjsp_msub_v2r8(rinv01,rinvsq01,krf2));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq01,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx01,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy01,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz01,fscal,fiz0);
+             
+             fjx1             = _fjsp_madd_v2r8(dx01,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy01,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz01,fscal,fjz1);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq02,rcutoff2))
+             {
+             /* REACTION-FIELD ELECTROSTATICS */
+             felec            = _fjsp_mul_v2r8(qq02,_fjsp_msub_v2r8(rinv02,rinvsq02,krf2));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq02,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx02,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy02,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz02,fscal,fiz0);
+             
+             fjx2             = _fjsp_madd_v2r8(dx02,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy02,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz02,fscal,fjz2);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq10,rcutoff2))
+             {
+             /* REACTION-FIELD ELECTROSTATICS */
+             felec            = _fjsp_mul_v2r8(qq10,_fjsp_msub_v2r8(rinv10,rinvsq10,krf2));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq10,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+             
+             fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq11,rcutoff2))
+             {
+             /* REACTION-FIELD ELECTROSTATICS */
+             felec            = _fjsp_mul_v2r8(qq11,_fjsp_msub_v2r8(rinv11,rinvsq11,krf2));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq11,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+             
+             fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq12,rcutoff2))
+             {
+             /* REACTION-FIELD ELECTROSTATICS */
+             felec            = _fjsp_mul_v2r8(qq12,_fjsp_msub_v2r8(rinv12,rinvsq12,krf2));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq12,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+             
+             fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq20,rcutoff2))
+             {
+             /* REACTION-FIELD ELECTROSTATICS */
+             felec            = _fjsp_mul_v2r8(qq20,_fjsp_msub_v2r8(rinv20,rinvsq20,krf2));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq20,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+             
+             fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq21,rcutoff2))
+             {
+             /* REACTION-FIELD ELECTROSTATICS */
+             felec            = _fjsp_mul_v2r8(qq21,_fjsp_msub_v2r8(rinv21,rinvsq21,krf2));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq21,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+             
+             fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq22,rcutoff2))
+             {
+             /* REACTION-FIELD ELECTROSTATICS */
+             felec            = _fjsp_mul_v2r8(qq22,_fjsp_msub_v2r8(rinv22,rinvsq22,krf2));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq22,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+             
+             fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+             }
+             gmx_fjsp_decrement_3rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+             /* Inner loop uses 304 flops */
+         }
+         if(jidx<j_index_end)
+         {
+             jnrA             = jjnr[jidx];
+             j_coord_offsetA  = DIM*jnrA;
+             /* load j atom coordinates */
+             gmx_fjsp_load_3rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+             /* Calculate displacement vector */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             dx01             = _fjsp_sub_v2r8(ix0,jx1);
+             dy01             = _fjsp_sub_v2r8(iy0,jy1);
+             dz01             = _fjsp_sub_v2r8(iz0,jz1);
+             dx02             = _fjsp_sub_v2r8(ix0,jx2);
+             dy02             = _fjsp_sub_v2r8(iy0,jy2);
+             dz02             = _fjsp_sub_v2r8(iz0,jz2);
+             dx10             = _fjsp_sub_v2r8(ix1,jx0);
+             dy10             = _fjsp_sub_v2r8(iy1,jy0);
+             dz10             = _fjsp_sub_v2r8(iz1,jz0);
+             dx11             = _fjsp_sub_v2r8(ix1,jx1);
+             dy11             = _fjsp_sub_v2r8(iy1,jy1);
+             dz11             = _fjsp_sub_v2r8(iz1,jz1);
+             dx12             = _fjsp_sub_v2r8(ix1,jx2);
+             dy12             = _fjsp_sub_v2r8(iy1,jy2);
+             dz12             = _fjsp_sub_v2r8(iz1,jz2);
+             dx20             = _fjsp_sub_v2r8(ix2,jx0);
+             dy20             = _fjsp_sub_v2r8(iy2,jy0);
+             dz20             = _fjsp_sub_v2r8(iz2,jz0);
+             dx21             = _fjsp_sub_v2r8(ix2,jx1);
+             dy21             = _fjsp_sub_v2r8(iy2,jy1);
+             dz21             = _fjsp_sub_v2r8(iz2,jz1);
+             dx22             = _fjsp_sub_v2r8(ix2,jx2);
+             dy22             = _fjsp_sub_v2r8(iy2,jy2);
+             dz22             = _fjsp_sub_v2r8(iz2,jz2);
+             /* Calculate squared distance and things based on it */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rsq01            = gmx_fjsp_calc_rsq_v2r8(dx01,dy01,dz01);
+             rsq02            = gmx_fjsp_calc_rsq_v2r8(dx02,dy02,dz02);
+             rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+             rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+             rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+             rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+             rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+             rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+             rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+             rinv01           = gmx_fjsp_invsqrt_v2r8(rsq01);
+             rinv02           = gmx_fjsp_invsqrt_v2r8(rsq02);
+             rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+             rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+             rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+             rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+             rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+             rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+             rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+             rinvsq01         = _fjsp_mul_v2r8(rinv01,rinv01);
+             rinvsq02         = _fjsp_mul_v2r8(rinv02,rinv02);
+             rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+             rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+             rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+             rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+             rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+             rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+             fjx0             = _fjsp_setzero_v2r8();
+             fjy0             = _fjsp_setzero_v2r8();
+             fjz0             = _fjsp_setzero_v2r8();
+             fjx1             = _fjsp_setzero_v2r8();
+             fjy1             = _fjsp_setzero_v2r8();
+             fjz1             = _fjsp_setzero_v2r8();
+             fjx2             = _fjsp_setzero_v2r8();
+             fjy2             = _fjsp_setzero_v2r8();
+             fjz2             = _fjsp_setzero_v2r8();
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+             {
+             /* REACTION-FIELD ELECTROSTATICS */
+             felec            = _fjsp_mul_v2r8(qq00,_fjsp_msub_v2r8(rinv00,rinvsq00,krf2));
+             /* LENNARD-JONES DISPERSION/REPULSION */
+             rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+             fvdw             = _fjsp_mul_v2r8(_fjsp_msub_v2r8(c12_00,rinvsix,c6_00),_fjsp_mul_v2r8(rinvsix,rinvsq00));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+             fscal            = _fjsp_add_v2r8(felec,fvdw);
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             
+             fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq01,rcutoff2))
+             {
+             /* REACTION-FIELD ELECTROSTATICS */
+             felec            = _fjsp_mul_v2r8(qq01,_fjsp_msub_v2r8(rinv01,rinvsq01,krf2));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq01,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx01,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy01,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz01,fscal,fiz0);
+             
+             fjx1             = _fjsp_madd_v2r8(dx01,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy01,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz01,fscal,fjz1);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq02,rcutoff2))
+             {
+             /* REACTION-FIELD ELECTROSTATICS */
+             felec            = _fjsp_mul_v2r8(qq02,_fjsp_msub_v2r8(rinv02,rinvsq02,krf2));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq02,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx02,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy02,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz02,fscal,fiz0);
+             
+             fjx2             = _fjsp_madd_v2r8(dx02,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy02,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz02,fscal,fjz2);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq10,rcutoff2))
+             {
+             /* REACTION-FIELD ELECTROSTATICS */
+             felec            = _fjsp_mul_v2r8(qq10,_fjsp_msub_v2r8(rinv10,rinvsq10,krf2));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq10,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+             
+             fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq11,rcutoff2))
+             {
+             /* REACTION-FIELD ELECTROSTATICS */
+             felec            = _fjsp_mul_v2r8(qq11,_fjsp_msub_v2r8(rinv11,rinvsq11,krf2));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq11,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+             
+             fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq12,rcutoff2))
+             {
+             /* REACTION-FIELD ELECTROSTATICS */
+             felec            = _fjsp_mul_v2r8(qq12,_fjsp_msub_v2r8(rinv12,rinvsq12,krf2));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq12,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+             
+             fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq20,rcutoff2))
+             {
+             /* REACTION-FIELD ELECTROSTATICS */
+             felec            = _fjsp_mul_v2r8(qq20,_fjsp_msub_v2r8(rinv20,rinvsq20,krf2));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq20,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+             
+             fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq21,rcutoff2))
+             {
+             /* REACTION-FIELD ELECTROSTATICS */
+             felec            = _fjsp_mul_v2r8(qq21,_fjsp_msub_v2r8(rinv21,rinvsq21,krf2));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq21,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+             
+             fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq22,rcutoff2))
+             {
+             /* REACTION-FIELD ELECTROSTATICS */
+             felec            = _fjsp_mul_v2r8(qq22,_fjsp_msub_v2r8(rinv22,rinvsq22,krf2));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq22,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+             
+             fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+             }
+             gmx_fjsp_decrement_3rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+             /* Inner loop uses 304 flops */
+         }
+         /* End of innermost loop */
+         gmx_fjsp_update_iforce_3atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                               f+i_coord_offset,fshift+i_shift_offset);
+         /* Increment number of inner iterations */
+         inneriter                  += j_index_end - j_index_start;
+         /* Outer loop uses 18 flops */
+     }
+     /* Increment number of outer iterations */
+     outeriter        += nri;
+     /* Update outer/inner flops */
+     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_F,outeriter*18 + inneriter*304);
+ }
index 0000000000000000000000000000000000000000,c4597a8ef6bf9fddd6d0879361be20e4695dd2d7..c4597a8ef6bf9fddd6d0879361be20e4695dd2d7
mode 000000,100644..100644
--- /dev/null
@@@ -1,0 -1,1133 +1,1133 @@@
+ /*
+  * This file is part of the GROMACS molecular simulation package.
+  *
+  * Copyright (c) 2012, by the GROMACS development team, led by
+  * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+  * others, as listed in the AUTHORS file in the top-level source
+  * directory and at http://www.gromacs.org.
+  *
+  * GROMACS is free software; you can redistribute it and/or
+  * modify it under the terms of the GNU Lesser General Public License
+  * as published by the Free Software Foundation; either version 2.1
+  * of the License, or (at your option) any later version.
+  *
+  * GROMACS is distributed in the hope that it will be useful,
+  * but WITHOUT ANY WARRANTY; without even the implied warranty of
+  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  * Lesser General Public License for more details.
+  *
+  * You should have received a copy of the GNU Lesser General Public
+  * License along with GROMACS; if not, see
+  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+  *
+  * If you want to redistribute modifications to GROMACS, please
+  * consider that scientific software is very special. Version
+  * control is crucial - bugs must be traceable. We will be happy to
+  * consider code for inclusion in the official distribution, but
+  * derived work must not be called official GROMACS. Details are found
+  * in the README & COPYING files - if they are missing, get the
+  * official version at http://www.gromacs.org.
+  *
+  * To help us fund GROMACS development, we humbly ask that you cite
+  * the research papers on the package. Check out http://www.gromacs.org.
+  */
+ /*
+  * Note: this file was generated by the GROMACS sparc64_hpc_ace_double kernel generator.
+  */
+ #ifdef HAVE_CONFIG_H
+ #include <config.h>
+ #endif
+ #include <math.h>
+ #include "../nb_kernel.h"
+ #include "types/simple.h"
+ #include "vec.h"
+ #include "nrnb.h"
+ #include "kernelutil_sparc64_hpc_ace_double.h"
+ /*
+  * Gromacs nonbonded kernel:   nb_kernel_ElecRFCut_VdwLJSh_GeomW4P1_VF_sparc64_hpc_ace_double
+  * Electrostatics interaction: ReactionField
+  * VdW interaction:            LennardJones
+  * Geometry:                   Water4-Particle
+  * Calculate force/pot:        PotentialAndForce
+  */
+ void
+ nb_kernel_ElecRFCut_VdwLJSh_GeomW4P1_VF_sparc64_hpc_ace_double
+                     (t_nblist * gmx_restrict                nlist,
+                      rvec * gmx_restrict                    xx,
+                      rvec * gmx_restrict                    ff,
+                      t_forcerec * gmx_restrict              fr,
+                      t_mdatoms * gmx_restrict               mdatoms,
+                      nb_kernel_data_t * gmx_restrict        kernel_data,
+                      t_nrnb * gmx_restrict                  nrnb)
+ {
+     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+      * just 0 for non-waters.
+      * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+      * jnr indices corresponding to data put in the two positions in the SIMD register.
+      */
+     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+     int              jnrA,jnrB;
+     int              j_coord_offsetA,j_coord_offsetB;
+     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+     real             rcutoff_scalar;
+     real             *shiftvec,*fshift,*x,*f;
+     _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+     int              vdwioffset0;
+     _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+     int              vdwioffset1;
+     _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+     int              vdwioffset2;
+     _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+     int              vdwioffset3;
+     _fjsp_v2r8       ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+     int              vdwjidx0A,vdwjidx0B;
+     _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+     _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+     _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+     _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+     _fjsp_v2r8       dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30;
+     _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+     real             *charge;
+     int              nvdwtype;
+     _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+     int              *vdwtype;
+     real             *vdwparam;
+     _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+     _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+     _fjsp_v2r8       itab_tmp;
+     _fjsp_v2r8       dummy_mask,cutoff_mask;
+     _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+     _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+     union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+     x                = xx[0];
+     f                = ff[0];
+     nri              = nlist->nri;
+     iinr             = nlist->iinr;
+     jindex           = nlist->jindex;
+     jjnr             = nlist->jjnr;
+     shiftidx         = nlist->shift;
+     gid              = nlist->gid;
+     shiftvec         = fr->shift_vec[0];
+     fshift           = fr->fshift[0];
+     facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+     charge           = mdatoms->chargeA;
+     krf              = gmx_fjsp_set1_v2r8(fr->ic->k_rf);
+     krf2             = gmx_fjsp_set1_v2r8(fr->ic->k_rf*2.0);
+     crf              = gmx_fjsp_set1_v2r8(fr->ic->c_rf);
+     nvdwtype         = fr->ntype;
+     vdwparam         = fr->nbfp;
+     vdwtype          = mdatoms->typeA;
+     /* Setup water-specific parameters */
+     inr              = nlist->iinr[0];
+     iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+     iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+     iq3              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+3]));
+     vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+     /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+     rcutoff_scalar   = fr->rcoulomb;
+     rcutoff          = gmx_fjsp_set1_v2r8(rcutoff_scalar);
+     rcutoff2         = _fjsp_mul_v2r8(rcutoff,rcutoff);
+     sh_vdw_invrcut6  = gmx_fjsp_set1_v2r8(fr->ic->sh_invrc6);
+     rvdw             = gmx_fjsp_set1_v2r8(fr->rvdw);
+     /* Avoid stupid compiler warnings */
+     jnrA = jnrB = 0;
+     j_coord_offsetA = 0;
+     j_coord_offsetB = 0;
+     outeriter        = 0;
+     inneriter        = 0;
+     /* Start outer loop over neighborlists */
+     for(iidx=0; iidx<nri; iidx++)
+     {
+         /* Load shift vector for this list */
+         i_shift_offset   = DIM*shiftidx[iidx];
+         /* Load limits for loop over neighbors */
+         j_index_start    = jindex[iidx];
+         j_index_end      = jindex[iidx+1];
+         /* Get outer coordinate index */
+         inr              = iinr[iidx];
+         i_coord_offset   = DIM*inr;
+         /* Load i particle coords and add shift vector */
+         gmx_fjsp_load_shift_and_4rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                  &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+         fix0             = _fjsp_setzero_v2r8();
+         fiy0             = _fjsp_setzero_v2r8();
+         fiz0             = _fjsp_setzero_v2r8();
+         fix1             = _fjsp_setzero_v2r8();
+         fiy1             = _fjsp_setzero_v2r8();
+         fiz1             = _fjsp_setzero_v2r8();
+         fix2             = _fjsp_setzero_v2r8();
+         fiy2             = _fjsp_setzero_v2r8();
+         fiz2             = _fjsp_setzero_v2r8();
+         fix3             = _fjsp_setzero_v2r8();
+         fiy3             = _fjsp_setzero_v2r8();
+         fiz3             = _fjsp_setzero_v2r8();
+         /* Reset potential sums */
+         velecsum         = _fjsp_setzero_v2r8();
+         vvdwsum          = _fjsp_setzero_v2r8();
+         /* Start inner kernel loop */
+         for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+         {
+             /* Get j neighbor index, and coordinate index */
+             jnrA             = jjnr[jidx];
+             jnrB             = jjnr[jidx+1];
+             j_coord_offsetA  = DIM*jnrA;
+             j_coord_offsetB  = DIM*jnrB;
+             /* load j atom coordinates */
+             gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                               &jx0,&jy0,&jz0);
+             /* Calculate displacement vector */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             dx10             = _fjsp_sub_v2r8(ix1,jx0);
+             dy10             = _fjsp_sub_v2r8(iy1,jy0);
+             dz10             = _fjsp_sub_v2r8(iz1,jz0);
+             dx20             = _fjsp_sub_v2r8(ix2,jx0);
+             dy20             = _fjsp_sub_v2r8(iy2,jy0);
+             dz20             = _fjsp_sub_v2r8(iz2,jz0);
+             dx30             = _fjsp_sub_v2r8(ix3,jx0);
+             dy30             = _fjsp_sub_v2r8(iy3,jy0);
+             dz30             = _fjsp_sub_v2r8(iz3,jz0);
+             /* Calculate squared distance and things based on it */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+             rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+             rsq30            = gmx_fjsp_calc_rsq_v2r8(dx30,dy30,dz30);
+             rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+             rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+             rinv30           = gmx_fjsp_invsqrt_v2r8(rsq30);
+             rinvsq00         = gmx_fjsp_inv_v2r8(rsq00);
+             rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+             rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+             rinvsq30         = _fjsp_mul_v2r8(rinv30,rinv30);
+             /* Load parameters for j particles */
+             jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+             vdwjidx0A        = 2*vdwtype[jnrA+0];
+             vdwjidx0B        = 2*vdwtype[jnrB+0];
+             fjx0             = _fjsp_setzero_v2r8();
+             fjy0             = _fjsp_setzero_v2r8();
+             fjz0             = _fjsp_setzero_v2r8();
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+             {
+             /* Compute parameters for interactions between i and j atoms */
+             gmx_fjsp_load_2pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,
+                                          vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
+             /* LENNARD-JONES DISPERSION/REPULSION */
+             rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+             vvdw6            = _fjsp_mul_v2r8(c6_00,rinvsix);
+             vvdw12           = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
+             vvdw             = _fjsp_msub_v2r8(_fjsp_nmsub_v2r8(c12_00,_fjsp_mul_v2r8(sh_vdw_invrcut6,sh_vdw_invrcut6),vvdw12),one_twelfth,
+                                            _fjsp_mul_v2r8(_fjsp_nmsub_v2r8( c6_00,sh_vdw_invrcut6,vvdw6),one_sixth));
+             fvdw             = _fjsp_mul_v2r8(_fjsp_sub_v2r8(vvdw12,vvdw6),rinvsq00);
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             vvdw             = _fjsp_and_v2r8(vvdw,cutoff_mask);
+             vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+             fscal            = fvdw;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             
+             fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq10,rcutoff2))
+             {
+             /* Compute parameters for interactions between i and j atoms */
+             qq10             = _fjsp_mul_v2r8(iq1,jq0);
+             /* REACTION-FIELD ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq10,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq10,rinv10),crf));
+             felec            = _fjsp_mul_v2r8(qq10,_fjsp_msub_v2r8(rinv10,rinvsq10,krf2));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq10,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+             
+             fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq20,rcutoff2))
+             {
+             /* Compute parameters for interactions between i and j atoms */
+             qq20             = _fjsp_mul_v2r8(iq2,jq0);
+             /* REACTION-FIELD ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq20,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq20,rinv20),crf));
+             felec            = _fjsp_mul_v2r8(qq20,_fjsp_msub_v2r8(rinv20,rinvsq20,krf2));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq20,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+             
+             fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq30,rcutoff2))
+             {
+             /* Compute parameters for interactions between i and j atoms */
+             qq30             = _fjsp_mul_v2r8(iq3,jq0);
+             /* REACTION-FIELD ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq30,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq30,rinv30),crf));
+             felec            = _fjsp_mul_v2r8(qq30,_fjsp_msub_v2r8(rinv30,rinvsq30,krf2));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq30,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix3             = _fjsp_madd_v2r8(dx30,fscal,fix3);
+             fiy3             = _fjsp_madd_v2r8(dy30,fscal,fiy3);
+             fiz3             = _fjsp_madd_v2r8(dz30,fscal,fiz3);
+             
+             fjx0             = _fjsp_madd_v2r8(dx30,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy30,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz30,fscal,fjz0);
+             }
+             gmx_fjsp_decrement_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0);
+             /* Inner loop uses 164 flops */
+         }
+         if(jidx<j_index_end)
+         {
+             jnrA             = jjnr[jidx];
+             j_coord_offsetA  = DIM*jnrA;
+             /* load j atom coordinates */
+             gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                               &jx0,&jy0,&jz0);
+             /* Calculate displacement vector */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             dx10             = _fjsp_sub_v2r8(ix1,jx0);
+             dy10             = _fjsp_sub_v2r8(iy1,jy0);
+             dz10             = _fjsp_sub_v2r8(iz1,jz0);
+             dx20             = _fjsp_sub_v2r8(ix2,jx0);
+             dy20             = _fjsp_sub_v2r8(iy2,jy0);
+             dz20             = _fjsp_sub_v2r8(iz2,jz0);
+             dx30             = _fjsp_sub_v2r8(ix3,jx0);
+             dy30             = _fjsp_sub_v2r8(iy3,jy0);
+             dz30             = _fjsp_sub_v2r8(iz3,jz0);
+             /* Calculate squared distance and things based on it */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+             rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+             rsq30            = gmx_fjsp_calc_rsq_v2r8(dx30,dy30,dz30);
+             rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+             rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+             rinv30           = gmx_fjsp_invsqrt_v2r8(rsq30);
+             rinvsq00         = gmx_fjsp_inv_v2r8(rsq00);
+             rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+             rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+             rinvsq30         = _fjsp_mul_v2r8(rinv30,rinv30);
+             /* Load parameters for j particles */
+             jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
+             vdwjidx0A        = 2*vdwtype[jnrA+0];
+             fjx0             = _fjsp_setzero_v2r8();
+             fjy0             = _fjsp_setzero_v2r8();
+             fjz0             = _fjsp_setzero_v2r8();
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+             {
+             /* Compute parameters for interactions between i and j atoms */
+             gmx_fjsp_load_1pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
+             /* LENNARD-JONES DISPERSION/REPULSION */
+             rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+             vvdw6            = _fjsp_mul_v2r8(c6_00,rinvsix);
+             vvdw12           = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
+             vvdw             = _fjsp_msub_v2r8(_fjsp_nmsub_v2r8(c12_00,_fjsp_mul_v2r8(sh_vdw_invrcut6,sh_vdw_invrcut6),vvdw12),one_twelfth,
+                                            _fjsp_mul_v2r8(_fjsp_nmsub_v2r8( c6_00,sh_vdw_invrcut6,vvdw6),one_sixth));
+             fvdw             = _fjsp_mul_v2r8(_fjsp_sub_v2r8(vvdw12,vvdw6),rinvsq00);
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             vvdw             = _fjsp_and_v2r8(vvdw,cutoff_mask);
+             vvdw             = _fjsp_unpacklo_v2r8(vvdw,_fjsp_setzero_v2r8());
+             vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+             fscal            = fvdw;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             
+             fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq10,rcutoff2))
+             {
+             /* Compute parameters for interactions between i and j atoms */
+             qq10             = _fjsp_mul_v2r8(iq1,jq0);
+             /* REACTION-FIELD ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq10,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq10,rinv10),crf));
+             felec            = _fjsp_mul_v2r8(qq10,_fjsp_msub_v2r8(rinv10,rinvsq10,krf2));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq10,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+             
+             fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq20,rcutoff2))
+             {
+             /* Compute parameters for interactions between i and j atoms */
+             qq20             = _fjsp_mul_v2r8(iq2,jq0);
+             /* REACTION-FIELD ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq20,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq20,rinv20),crf));
+             felec            = _fjsp_mul_v2r8(qq20,_fjsp_msub_v2r8(rinv20,rinvsq20,krf2));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq20,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+             
+             fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq30,rcutoff2))
+             {
+             /* Compute parameters for interactions between i and j atoms */
+             qq30             = _fjsp_mul_v2r8(iq3,jq0);
+             /* REACTION-FIELD ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq30,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq30,rinv30),crf));
+             felec            = _fjsp_mul_v2r8(qq30,_fjsp_msub_v2r8(rinv30,rinvsq30,krf2));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq30,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix3             = _fjsp_madd_v2r8(dx30,fscal,fix3);
+             fiy3             = _fjsp_madd_v2r8(dy30,fscal,fiy3);
+             fiz3             = _fjsp_madd_v2r8(dz30,fscal,fiz3);
+             
+             fjx0             = _fjsp_madd_v2r8(dx30,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy30,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz30,fscal,fjz0);
+             }
+             gmx_fjsp_decrement_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0);
+             /* Inner loop uses 164 flops */
+         }
+         /* End of innermost loop */
+         gmx_fjsp_update_iforce_4atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                               f+i_coord_offset,fshift+i_shift_offset);
+         ggid                        = gid[iidx];
+         /* Update potential energies */
+         gmx_fjsp_update_1pot_v2r8(velecsum,kernel_data->energygrp_elec+ggid);
+         gmx_fjsp_update_1pot_v2r8(vvdwsum,kernel_data->energygrp_vdw+ggid);
+         /* Increment number of inner iterations */
+         inneriter                  += j_index_end - j_index_start;
+         /* Outer loop uses 26 flops */
+     }
+     /* Increment number of outer iterations */
+     outeriter        += nri;
+     /* Update outer/inner flops */
+     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_VF,outeriter*26 + inneriter*164);
+ }
+ /*
+  * Gromacs nonbonded kernel:   nb_kernel_ElecRFCut_VdwLJSh_GeomW4P1_F_sparc64_hpc_ace_double
+  * Electrostatics interaction: ReactionField
+  * VdW interaction:            LennardJones
+  * Geometry:                   Water4-Particle
+  * Calculate force/pot:        Force
+  */
+ void
+ nb_kernel_ElecRFCut_VdwLJSh_GeomW4P1_F_sparc64_hpc_ace_double
+                     (t_nblist * gmx_restrict                nlist,
+                      rvec * gmx_restrict                    xx,
+                      rvec * gmx_restrict                    ff,
+                      t_forcerec * gmx_restrict              fr,
+                      t_mdatoms * gmx_restrict               mdatoms,
+                      nb_kernel_data_t * gmx_restrict        kernel_data,
+                      t_nrnb * gmx_restrict                  nrnb)
+ {
+     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+      * just 0 for non-waters.
+      * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+      * jnr indices corresponding to data put in the two positions in the SIMD register.
+      */
+     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+     int              jnrA,jnrB;
+     int              j_coord_offsetA,j_coord_offsetB;
+     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+     real             rcutoff_scalar;
+     real             *shiftvec,*fshift,*x,*f;
+     _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+     int              vdwioffset0;
+     _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+     int              vdwioffset1;
+     _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+     int              vdwioffset2;
+     _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+     int              vdwioffset3;
+     _fjsp_v2r8       ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+     int              vdwjidx0A,vdwjidx0B;
+     _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+     _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+     _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+     _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+     _fjsp_v2r8       dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30;
+     _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+     real             *charge;
+     int              nvdwtype;
+     _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+     int              *vdwtype;
+     real             *vdwparam;
+     _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+     _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+     _fjsp_v2r8       itab_tmp;
+     _fjsp_v2r8       dummy_mask,cutoff_mask;
+     _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+     _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+     union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+     x                = xx[0];
+     f                = ff[0];
+     nri              = nlist->nri;
+     iinr             = nlist->iinr;
+     jindex           = nlist->jindex;
+     jjnr             = nlist->jjnr;
+     shiftidx         = nlist->shift;
+     gid              = nlist->gid;
+     shiftvec         = fr->shift_vec[0];
+     fshift           = fr->fshift[0];
+     facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+     charge           = mdatoms->chargeA;
+     krf              = gmx_fjsp_set1_v2r8(fr->ic->k_rf);
+     krf2             = gmx_fjsp_set1_v2r8(fr->ic->k_rf*2.0);
+     crf              = gmx_fjsp_set1_v2r8(fr->ic->c_rf);
+     nvdwtype         = fr->ntype;
+     vdwparam         = fr->nbfp;
+     vdwtype          = mdatoms->typeA;
+     /* Setup water-specific parameters */
+     inr              = nlist->iinr[0];
+     iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+     iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+     iq3              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+3]));
+     vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+     /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+     rcutoff_scalar   = fr->rcoulomb;
+     rcutoff          = gmx_fjsp_set1_v2r8(rcutoff_scalar);
+     rcutoff2         = _fjsp_mul_v2r8(rcutoff,rcutoff);
+     sh_vdw_invrcut6  = gmx_fjsp_set1_v2r8(fr->ic->sh_invrc6);
+     rvdw             = gmx_fjsp_set1_v2r8(fr->rvdw);
+     /* Avoid stupid compiler warnings */
+     jnrA = jnrB = 0;
+     j_coord_offsetA = 0;
+     j_coord_offsetB = 0;
+     outeriter        = 0;
+     inneriter        = 0;
+     /* Start outer loop over neighborlists */
+     for(iidx=0; iidx<nri; iidx++)
+     {
+         /* Load shift vector for this list */
+         i_shift_offset   = DIM*shiftidx[iidx];
+         /* Load limits for loop over neighbors */
+         j_index_start    = jindex[iidx];
+         j_index_end      = jindex[iidx+1];
+         /* Get outer coordinate index */
+         inr              = iinr[iidx];
+         i_coord_offset   = DIM*inr;
+         /* Load i particle coords and add shift vector */
+         gmx_fjsp_load_shift_and_4rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                  &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+         fix0             = _fjsp_setzero_v2r8();
+         fiy0             = _fjsp_setzero_v2r8();
+         fiz0             = _fjsp_setzero_v2r8();
+         fix1             = _fjsp_setzero_v2r8();
+         fiy1             = _fjsp_setzero_v2r8();
+         fiz1             = _fjsp_setzero_v2r8();
+         fix2             = _fjsp_setzero_v2r8();
+         fiy2             = _fjsp_setzero_v2r8();
+         fiz2             = _fjsp_setzero_v2r8();
+         fix3             = _fjsp_setzero_v2r8();
+         fiy3             = _fjsp_setzero_v2r8();
+         fiz3             = _fjsp_setzero_v2r8();
+         /* Start inner kernel loop */
+         for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+         {
+             /* Get j neighbor index, and coordinate index */
+             jnrA             = jjnr[jidx];
+             jnrB             = jjnr[jidx+1];
+             j_coord_offsetA  = DIM*jnrA;
+             j_coord_offsetB  = DIM*jnrB;
+             /* load j atom coordinates */
+             gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                               &jx0,&jy0,&jz0);
+             /* Calculate displacement vector */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             dx10             = _fjsp_sub_v2r8(ix1,jx0);
+             dy10             = _fjsp_sub_v2r8(iy1,jy0);
+             dz10             = _fjsp_sub_v2r8(iz1,jz0);
+             dx20             = _fjsp_sub_v2r8(ix2,jx0);
+             dy20             = _fjsp_sub_v2r8(iy2,jy0);
+             dz20             = _fjsp_sub_v2r8(iz2,jz0);
+             dx30             = _fjsp_sub_v2r8(ix3,jx0);
+             dy30             = _fjsp_sub_v2r8(iy3,jy0);
+             dz30             = _fjsp_sub_v2r8(iz3,jz0);
+             /* Calculate squared distance and things based on it */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+             rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+             rsq30            = gmx_fjsp_calc_rsq_v2r8(dx30,dy30,dz30);
+             rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+             rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+             rinv30           = gmx_fjsp_invsqrt_v2r8(rsq30);
+             rinvsq00         = gmx_fjsp_inv_v2r8(rsq00);
+             rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+             rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+             rinvsq30         = _fjsp_mul_v2r8(rinv30,rinv30);
+             /* Load parameters for j particles */
+             jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+             vdwjidx0A        = 2*vdwtype[jnrA+0];
+             vdwjidx0B        = 2*vdwtype[jnrB+0];
+             fjx0             = _fjsp_setzero_v2r8();
+             fjy0             = _fjsp_setzero_v2r8();
+             fjz0             = _fjsp_setzero_v2r8();
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+             {
+             /* Compute parameters for interactions between i and j atoms */
+             gmx_fjsp_load_2pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,
+                                          vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
+             /* LENNARD-JONES DISPERSION/REPULSION */
+             rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+             fvdw             = _fjsp_mul_v2r8(_fjsp_msub_v2r8(c12_00,rinvsix,c6_00),_fjsp_mul_v2r8(rinvsix,rinvsq00));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+             fscal            = fvdw;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             
+             fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq10,rcutoff2))
+             {
+             /* Compute parameters for interactions between i and j atoms */
+             qq10             = _fjsp_mul_v2r8(iq1,jq0);
+             /* REACTION-FIELD ELECTROSTATICS */
+             felec            = _fjsp_mul_v2r8(qq10,_fjsp_msub_v2r8(rinv10,rinvsq10,krf2));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq10,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+             
+             fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq20,rcutoff2))
+             {
+             /* Compute parameters for interactions between i and j atoms */
+             qq20             = _fjsp_mul_v2r8(iq2,jq0);
+             /* REACTION-FIELD ELECTROSTATICS */
+             felec            = _fjsp_mul_v2r8(qq20,_fjsp_msub_v2r8(rinv20,rinvsq20,krf2));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq20,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+             
+             fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq30,rcutoff2))
+             {
+             /* Compute parameters for interactions between i and j atoms */
+             qq30             = _fjsp_mul_v2r8(iq3,jq0);
+             /* REACTION-FIELD ELECTROSTATICS */
+             felec            = _fjsp_mul_v2r8(qq30,_fjsp_msub_v2r8(rinv30,rinvsq30,krf2));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq30,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix3             = _fjsp_madd_v2r8(dx30,fscal,fix3);
+             fiy3             = _fjsp_madd_v2r8(dy30,fscal,fiy3);
+             fiz3             = _fjsp_madd_v2r8(dz30,fscal,fiz3);
+             
+             fjx0             = _fjsp_madd_v2r8(dx30,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy30,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz30,fscal,fjz0);
+             }
+             gmx_fjsp_decrement_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0);
+             /* Inner loop uses 135 flops */
+         }
+         if(jidx<j_index_end)
+         {
+             jnrA             = jjnr[jidx];
+             j_coord_offsetA  = DIM*jnrA;
+             /* load j atom coordinates */
+             gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                               &jx0,&jy0,&jz0);
+             /* Calculate displacement vector */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             dx10             = _fjsp_sub_v2r8(ix1,jx0);
+             dy10             = _fjsp_sub_v2r8(iy1,jy0);
+             dz10             = _fjsp_sub_v2r8(iz1,jz0);
+             dx20             = _fjsp_sub_v2r8(ix2,jx0);
+             dy20             = _fjsp_sub_v2r8(iy2,jy0);
+             dz20             = _fjsp_sub_v2r8(iz2,jz0);
+             dx30             = _fjsp_sub_v2r8(ix3,jx0);
+             dy30             = _fjsp_sub_v2r8(iy3,jy0);
+             dz30             = _fjsp_sub_v2r8(iz3,jz0);
+             /* Calculate squared distance and things based on it */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+             rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+             rsq30            = gmx_fjsp_calc_rsq_v2r8(dx30,dy30,dz30);
+             rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+             rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+             rinv30           = gmx_fjsp_invsqrt_v2r8(rsq30);
+             rinvsq00         = gmx_fjsp_inv_v2r8(rsq00);
+             rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+             rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+             rinvsq30         = _fjsp_mul_v2r8(rinv30,rinv30);
+             /* Load parameters for j particles */
+             jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
+             vdwjidx0A        = 2*vdwtype[jnrA+0];
+             fjx0             = _fjsp_setzero_v2r8();
+             fjy0             = _fjsp_setzero_v2r8();
+             fjz0             = _fjsp_setzero_v2r8();
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+             {
+             /* Compute parameters for interactions between i and j atoms */
+             gmx_fjsp_load_1pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
+             /* LENNARD-JONES DISPERSION/REPULSION */
+             rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+             fvdw             = _fjsp_mul_v2r8(_fjsp_msub_v2r8(c12_00,rinvsix,c6_00),_fjsp_mul_v2r8(rinvsix,rinvsq00));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+             fscal            = fvdw;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             
+             fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq10,rcutoff2))
+             {
+             /* Compute parameters for interactions between i and j atoms */
+             qq10             = _fjsp_mul_v2r8(iq1,jq0);
+             /* REACTION-FIELD ELECTROSTATICS */
+             felec            = _fjsp_mul_v2r8(qq10,_fjsp_msub_v2r8(rinv10,rinvsq10,krf2));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq10,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+             
+             fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq20,rcutoff2))
+             {
+             /* Compute parameters for interactions between i and j atoms */
+             qq20             = _fjsp_mul_v2r8(iq2,jq0);
+             /* REACTION-FIELD ELECTROSTATICS */
+             felec            = _fjsp_mul_v2r8(qq20,_fjsp_msub_v2r8(rinv20,rinvsq20,krf2));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq20,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+             
+             fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq30,rcutoff2))
+             {
+             /* Compute parameters for interactions between i and j atoms */
+             qq30             = _fjsp_mul_v2r8(iq3,jq0);
+             /* REACTION-FIELD ELECTROSTATICS */
+             felec            = _fjsp_mul_v2r8(qq30,_fjsp_msub_v2r8(rinv30,rinvsq30,krf2));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq30,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix3             = _fjsp_madd_v2r8(dx30,fscal,fix3);
+             fiy3             = _fjsp_madd_v2r8(dy30,fscal,fiy3);
+             fiz3             = _fjsp_madd_v2r8(dz30,fscal,fiz3);
+             
+             fjx0             = _fjsp_madd_v2r8(dx30,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy30,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz30,fscal,fjz0);
+             }
+             gmx_fjsp_decrement_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0);
+             /* Inner loop uses 135 flops */
+         }
+         /* End of innermost loop */
+         gmx_fjsp_update_iforce_4atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                               f+i_coord_offset,fshift+i_shift_offset);
+         /* Increment number of inner iterations */
+         inneriter                  += j_index_end - j_index_start;
+         /* Outer loop uses 24 flops */
+     }
+     /* Increment number of outer iterations */
+     outeriter        += nri;
+     /* Update outer/inner flops */
+     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_F,outeriter*24 + inneriter*135);
+ }
index 0000000000000000000000000000000000000000,4ebe12c8863de197a3d53ba8debc734ae68a06fa..4ebe12c8863de197a3d53ba8debc734ae68a06fa
mode 000000,100644..100644
--- /dev/null
@@@ -1,0 -1,2043 +1,2043 @@@
+ /*
+  * This file is part of the GROMACS molecular simulation package.
+  *
+  * Copyright (c) 2012, by the GROMACS development team, led by
+  * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+  * others, as listed in the AUTHORS file in the top-level source
+  * directory and at http://www.gromacs.org.
+  *
+  * GROMACS is free software; you can redistribute it and/or
+  * modify it under the terms of the GNU Lesser General Public License
+  * as published by the Free Software Foundation; either version 2.1
+  * of the License, or (at your option) any later version.
+  *
+  * GROMACS is distributed in the hope that it will be useful,
+  * but WITHOUT ANY WARRANTY; without even the implied warranty of
+  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  * Lesser General Public License for more details.
+  *
+  * You should have received a copy of the GNU Lesser General Public
+  * License along with GROMACS; if not, see
+  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+  *
+  * If you want to redistribute modifications to GROMACS, please
+  * consider that scientific software is very special. Version
+  * control is crucial - bugs must be traceable. We will be happy to
+  * consider code for inclusion in the official distribution, but
+  * derived work must not be called official GROMACS. Details are found
+  * in the README & COPYING files - if they are missing, get the
+  * official version at http://www.gromacs.org.
+  *
+  * To help us fund GROMACS development, we humbly ask that you cite
+  * the research papers on the package. Check out http://www.gromacs.org.
+  */
+ /*
+  * Note: this file was generated by the GROMACS sparc64_hpc_ace_double kernel generator.
+  */
+ #ifdef HAVE_CONFIG_H
+ #include <config.h>
+ #endif
+ #include <math.h>
+ #include "../nb_kernel.h"
+ #include "types/simple.h"
+ #include "vec.h"
+ #include "nrnb.h"
+ #include "kernelutil_sparc64_hpc_ace_double.h"
+ /*
+  * Gromacs nonbonded kernel:   nb_kernel_ElecRFCut_VdwLJSh_GeomW4W4_VF_sparc64_hpc_ace_double
+  * Electrostatics interaction: ReactionField (with cutoff)
+  * VdW interaction:            LennardJones (potential shifted at the cutoff)
+  * Geometry:                   Water4-Water4
+  * Calculate force/pot:        PotentialAndForce
+  */
+ void
+ nb_kernel_ElecRFCut_VdwLJSh_GeomW4W4_VF_sparc64_hpc_ace_double
+                     (t_nblist * gmx_restrict                nlist,
+                      rvec * gmx_restrict                    xx,
+                      rvec * gmx_restrict                    ff,
+                      t_forcerec * gmx_restrict              fr,
+                      t_mdatoms * gmx_restrict               mdatoms,
+                      nb_kernel_data_t * gmx_restrict        kernel_data,
+                      t_nrnb * gmx_restrict                  nrnb)
+ {
+     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+      * just 0 for non-waters.
+      * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+      * jnr indices corresponding to data put in the two positions in the SIMD register.
+      */
+     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+     int              jnrA,jnrB;
+     int              j_coord_offsetA,j_coord_offsetB;
+     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+     real             rcutoff_scalar;
+     real             *shiftvec,*fshift,*x,*f;
+     _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+     int              vdwioffset0;
+     _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+     int              vdwioffset1;
+     _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+     int              vdwioffset2;
+     _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+     int              vdwioffset3;
+     _fjsp_v2r8       ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+     int              vdwjidx0A,vdwjidx0B;
+     _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+     int              vdwjidx1A,vdwjidx1B;
+     _fjsp_v2r8       jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+     int              vdwjidx2A,vdwjidx2B;
+     _fjsp_v2r8       jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+     int              vdwjidx3A,vdwjidx3B;
+     _fjsp_v2r8       jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
+     _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+     _fjsp_v2r8       dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+     _fjsp_v2r8       dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+     _fjsp_v2r8       dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
+     _fjsp_v2r8       dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+     _fjsp_v2r8       dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+     _fjsp_v2r8       dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
+     _fjsp_v2r8       dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
+     _fjsp_v2r8       dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
+     _fjsp_v2r8       dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
+     _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+     real             *charge;
+     int              nvdwtype;
+     _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+     int              *vdwtype;
+     real             *vdwparam;
+     _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+     _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+     _fjsp_v2r8       itab_tmp;
+     _fjsp_v2r8       dummy_mask,cutoff_mask;
+     _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+     _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+     union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+     x                = xx[0];
+     f                = ff[0];
+     nri              = nlist->nri;
+     iinr             = nlist->iinr;
+     jindex           = nlist->jindex;
+     jjnr             = nlist->jjnr;
+     shiftidx         = nlist->shift;
+     gid              = nlist->gid;
+     shiftvec         = fr->shift_vec[0];
+     fshift           = fr->fshift[0];
+     facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+     charge           = mdatoms->chargeA;
+     krf              = gmx_fjsp_set1_v2r8(fr->ic->k_rf);
+     krf2             = gmx_fjsp_set1_v2r8(fr->ic->k_rf*2.0);
+     crf              = gmx_fjsp_set1_v2r8(fr->ic->c_rf);
+     nvdwtype         = fr->ntype;
+     vdwparam         = fr->nbfp;
+     vdwtype          = mdatoms->typeA;
+     /* Setup water-specific parameters */
+     inr              = nlist->iinr[0];
+     iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+     iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+     iq3              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+3]));
+     vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+     jq1              = gmx_fjsp_set1_v2r8(charge[inr+1]);
+     jq2              = gmx_fjsp_set1_v2r8(charge[inr+2]);
+     jq3              = gmx_fjsp_set1_v2r8(charge[inr+3]);
+     vdwjidx0A        = 2*vdwtype[inr+0];
+     c6_00            = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset0+vdwjidx0A]);
+     c12_00           = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset0+vdwjidx0A+1]);
+     qq11             = _fjsp_mul_v2r8(iq1,jq1);
+     qq12             = _fjsp_mul_v2r8(iq1,jq2);
+     qq13             = _fjsp_mul_v2r8(iq1,jq3);
+     qq21             = _fjsp_mul_v2r8(iq2,jq1);
+     qq22             = _fjsp_mul_v2r8(iq2,jq2);
+     qq23             = _fjsp_mul_v2r8(iq2,jq3);
+     qq31             = _fjsp_mul_v2r8(iq3,jq1);
+     qq32             = _fjsp_mul_v2r8(iq3,jq2);
+     qq33             = _fjsp_mul_v2r8(iq3,jq3);
+     /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+     rcutoff_scalar   = fr->rcoulomb;
+     rcutoff          = gmx_fjsp_set1_v2r8(rcutoff_scalar);
+     rcutoff2         = _fjsp_mul_v2r8(rcutoff,rcutoff);
+     sh_vdw_invrcut6  = gmx_fjsp_set1_v2r8(fr->ic->sh_invrc6);
+     rvdw             = gmx_fjsp_set1_v2r8(fr->rvdw);
+     /* Avoid stupid compiler warnings */
+     jnrA = jnrB = 0;
+     j_coord_offsetA = 0;
+     j_coord_offsetB = 0;
+     outeriter        = 0;
+     inneriter        = 0;
+     /* Start outer loop over neighborlists */
+     for(iidx=0; iidx<nri; iidx++)
+     {
+         /* Load shift vector for this list */
+         i_shift_offset   = DIM*shiftidx[iidx];
+         /* Load limits for loop over neighbors */
+         j_index_start    = jindex[iidx];
+         j_index_end      = jindex[iidx+1];
+         /* Get outer coordinate index */
+         inr              = iinr[iidx];
+         i_coord_offset   = DIM*inr;
+         /* Load i particle coords and add shift vector */
+         gmx_fjsp_load_shift_and_4rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                  &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+         fix0             = _fjsp_setzero_v2r8();
+         fiy0             = _fjsp_setzero_v2r8();
+         fiz0             = _fjsp_setzero_v2r8();
+         fix1             = _fjsp_setzero_v2r8();
+         fiy1             = _fjsp_setzero_v2r8();
+         fiz1             = _fjsp_setzero_v2r8();
+         fix2             = _fjsp_setzero_v2r8();
+         fiy2             = _fjsp_setzero_v2r8();
+         fiz2             = _fjsp_setzero_v2r8();
+         fix3             = _fjsp_setzero_v2r8();
+         fiy3             = _fjsp_setzero_v2r8();
+         fiz3             = _fjsp_setzero_v2r8();
+         /* Reset potential sums */
+         velecsum         = _fjsp_setzero_v2r8();
+         vvdwsum          = _fjsp_setzero_v2r8();
+         /* Start inner kernel loop */
+         for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+         {
+             /* Get j neighbor index, and coordinate index */
+             jnrA             = jjnr[jidx];
+             jnrB             = jjnr[jidx+1];
+             j_coord_offsetA  = DIM*jnrA;
+             j_coord_offsetB  = DIM*jnrB;
+             /* load j atom coordinates */
+             gmx_fjsp_load_4rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
+                                               &jy2,&jz2,&jx3,&jy3,&jz3);
+             /* Calculate displacement vector */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             dx11             = _fjsp_sub_v2r8(ix1,jx1);
+             dy11             = _fjsp_sub_v2r8(iy1,jy1);
+             dz11             = _fjsp_sub_v2r8(iz1,jz1);
+             dx12             = _fjsp_sub_v2r8(ix1,jx2);
+             dy12             = _fjsp_sub_v2r8(iy1,jy2);
+             dz12             = _fjsp_sub_v2r8(iz1,jz2);
+             dx13             = _fjsp_sub_v2r8(ix1,jx3);
+             dy13             = _fjsp_sub_v2r8(iy1,jy3);
+             dz13             = _fjsp_sub_v2r8(iz1,jz3);
+             dx21             = _fjsp_sub_v2r8(ix2,jx1);
+             dy21             = _fjsp_sub_v2r8(iy2,jy1);
+             dz21             = _fjsp_sub_v2r8(iz2,jz1);
+             dx22             = _fjsp_sub_v2r8(ix2,jx2);
+             dy22             = _fjsp_sub_v2r8(iy2,jy2);
+             dz22             = _fjsp_sub_v2r8(iz2,jz2);
+             dx23             = _fjsp_sub_v2r8(ix2,jx3);
+             dy23             = _fjsp_sub_v2r8(iy2,jy3);
+             dz23             = _fjsp_sub_v2r8(iz2,jz3);
+             dx31             = _fjsp_sub_v2r8(ix3,jx1);
+             dy31             = _fjsp_sub_v2r8(iy3,jy1);
+             dz31             = _fjsp_sub_v2r8(iz3,jz1);
+             dx32             = _fjsp_sub_v2r8(ix3,jx2);
+             dy32             = _fjsp_sub_v2r8(iy3,jy2);
+             dz32             = _fjsp_sub_v2r8(iz3,jz2);
+             dx33             = _fjsp_sub_v2r8(ix3,jx3);
+             dy33             = _fjsp_sub_v2r8(iy3,jy3);
+             dz33             = _fjsp_sub_v2r8(iz3,jz3);
+             /* Calculate squared distance and things based on it */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+             rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+             rsq13            = gmx_fjsp_calc_rsq_v2r8(dx13,dy13,dz13);
+             rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+             rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+             rsq23            = gmx_fjsp_calc_rsq_v2r8(dx23,dy23,dz23);
+             rsq31            = gmx_fjsp_calc_rsq_v2r8(dx31,dy31,dz31);
+             rsq32            = gmx_fjsp_calc_rsq_v2r8(dx32,dy32,dz32);
+             rsq33            = gmx_fjsp_calc_rsq_v2r8(dx33,dy33,dz33);
+             rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+             rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+             rinv13           = gmx_fjsp_invsqrt_v2r8(rsq13);
+             rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+             rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+             rinv23           = gmx_fjsp_invsqrt_v2r8(rsq23);
+             rinv31           = gmx_fjsp_invsqrt_v2r8(rsq31);
+             rinv32           = gmx_fjsp_invsqrt_v2r8(rsq32);
+             rinv33           = gmx_fjsp_invsqrt_v2r8(rsq33);
+             rinvsq00         = gmx_fjsp_inv_v2r8(rsq00);
+             rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+             rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+             rinvsq13         = _fjsp_mul_v2r8(rinv13,rinv13);
+             rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+             rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+             rinvsq23         = _fjsp_mul_v2r8(rinv23,rinv23);
+             rinvsq31         = _fjsp_mul_v2r8(rinv31,rinv31);
+             rinvsq32         = _fjsp_mul_v2r8(rinv32,rinv32);
+             rinvsq33         = _fjsp_mul_v2r8(rinv33,rinv33);
+             fjx0             = _fjsp_setzero_v2r8();
+             fjy0             = _fjsp_setzero_v2r8();
+             fjz0             = _fjsp_setzero_v2r8();
+             fjx1             = _fjsp_setzero_v2r8();
+             fjy1             = _fjsp_setzero_v2r8();
+             fjz1             = _fjsp_setzero_v2r8();
+             fjx2             = _fjsp_setzero_v2r8();
+             fjy2             = _fjsp_setzero_v2r8();
+             fjz2             = _fjsp_setzero_v2r8();
+             fjx3             = _fjsp_setzero_v2r8();
+             fjy3             = _fjsp_setzero_v2r8();
+             fjz3             = _fjsp_setzero_v2r8();
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+             {
+             /* LENNARD-JONES DISPERSION/REPULSION */
+             rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+             vvdw6            = _fjsp_mul_v2r8(c6_00,rinvsix);
+             vvdw12           = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
+             vvdw             = _fjsp_msub_v2r8(_fjsp_nmsub_v2r8(c12_00,_fjsp_mul_v2r8(sh_vdw_invrcut6,sh_vdw_invrcut6),vvdw12),one_twelfth,
+                                            _fjsp_mul_v2r8(_fjsp_nmsub_v2r8( c6_00,sh_vdw_invrcut6,vvdw6),one_sixth));
+             fvdw             = _fjsp_mul_v2r8(_fjsp_sub_v2r8(vvdw12,vvdw6),rinvsq00);
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             vvdw             = _fjsp_and_v2r8(vvdw,cutoff_mask);
+             vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+             fscal            = fvdw;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             
+             fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq11,rcutoff2))
+             {
+             /* REACTION-FIELD ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq11,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq11,rinv11),crf));
+             felec            = _fjsp_mul_v2r8(qq11,_fjsp_msub_v2r8(rinv11,rinvsq11,krf2));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq11,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+             
+             fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq12,rcutoff2))
+             {
+             /* REACTION-FIELD ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq12,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq12,rinv12),crf));
+             felec            = _fjsp_mul_v2r8(qq12,_fjsp_msub_v2r8(rinv12,rinvsq12,krf2));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq12,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+             
+             fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq13,rcutoff2))
+             {
+             /* REACTION-FIELD ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq13,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq13,rinv13),crf));
+             felec            = _fjsp_mul_v2r8(qq13,_fjsp_msub_v2r8(rinv13,rinvsq13,krf2));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq13,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx13,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy13,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz13,fscal,fiz1);
+             
+             fjx3             = _fjsp_madd_v2r8(dx13,fscal,fjx3);
+             fjy3             = _fjsp_madd_v2r8(dy13,fscal,fjy3);
+             fjz3             = _fjsp_madd_v2r8(dz13,fscal,fjz3);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq21,rcutoff2))
+             {
+             /* REACTION-FIELD ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq21,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq21,rinv21),crf));
+             felec            = _fjsp_mul_v2r8(qq21,_fjsp_msub_v2r8(rinv21,rinvsq21,krf2));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq21,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+             
+             fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq22,rcutoff2))
+             {
+             /* REACTION-FIELD ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq22,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq22,rinv22),crf));
+             felec            = _fjsp_mul_v2r8(qq22,_fjsp_msub_v2r8(rinv22,rinvsq22,krf2));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq22,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+             
+             fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq23,rcutoff2))
+             {
+             /* REACTION-FIELD ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq23,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq23,rinv23),crf));
+             felec            = _fjsp_mul_v2r8(qq23,_fjsp_msub_v2r8(rinv23,rinvsq23,krf2));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq23,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx23,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy23,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz23,fscal,fiz2);
+             
+             fjx3             = _fjsp_madd_v2r8(dx23,fscal,fjx3);
+             fjy3             = _fjsp_madd_v2r8(dy23,fscal,fjy3);
+             fjz3             = _fjsp_madd_v2r8(dz23,fscal,fjz3);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq31,rcutoff2))
+             {
+             /* REACTION-FIELD ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq31,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq31,rinv31),crf));
+             felec            = _fjsp_mul_v2r8(qq31,_fjsp_msub_v2r8(rinv31,rinvsq31,krf2));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq31,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix3             = _fjsp_madd_v2r8(dx31,fscal,fix3);
+             fiy3             = _fjsp_madd_v2r8(dy31,fscal,fiy3);
+             fiz3             = _fjsp_madd_v2r8(dz31,fscal,fiz3);
+             
+             fjx1             = _fjsp_madd_v2r8(dx31,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy31,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz31,fscal,fjz1);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq32,rcutoff2))
+             {
+             /* REACTION-FIELD ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq32,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq32,rinv32),crf));
+             felec            = _fjsp_mul_v2r8(qq32,_fjsp_msub_v2r8(rinv32,rinvsq32,krf2));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq32,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix3             = _fjsp_madd_v2r8(dx32,fscal,fix3);
+             fiy3             = _fjsp_madd_v2r8(dy32,fscal,fiy3);
+             fiz3             = _fjsp_madd_v2r8(dz32,fscal,fiz3);
+             
+             fjx2             = _fjsp_madd_v2r8(dx32,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy32,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz32,fscal,fjz2);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq33,rcutoff2))
+             {
+             /* REACTION-FIELD ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq33,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq33,rinv33),crf));
+             felec            = _fjsp_mul_v2r8(qq33,_fjsp_msub_v2r8(rinv33,rinvsq33,krf2));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq33,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix3             = _fjsp_madd_v2r8(dx33,fscal,fix3);
+             fiy3             = _fjsp_madd_v2r8(dy33,fscal,fiy3);
+             fiz3             = _fjsp_madd_v2r8(dz33,fscal,fiz3);
+             
+             fjx3             = _fjsp_madd_v2r8(dx33,fscal,fjx3);
+             fjy3             = _fjsp_madd_v2r8(dy33,fscal,fjy3);
+             fjz3             = _fjsp_madd_v2r8(dz33,fscal,fjz3);
+             }
+             gmx_fjsp_decrement_4rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+             /* Inner loop uses 398 flops */
+         }
+         if(jidx<j_index_end)
+         {
+             jnrA             = jjnr[jidx];
+             j_coord_offsetA  = DIM*jnrA;
+             /* load j atom coordinates */
+             gmx_fjsp_load_4rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
+                                               &jy2,&jz2,&jx3,&jy3,&jz3);
+             /* Calculate displacement vector */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             dx11             = _fjsp_sub_v2r8(ix1,jx1);
+             dy11             = _fjsp_sub_v2r8(iy1,jy1);
+             dz11             = _fjsp_sub_v2r8(iz1,jz1);
+             dx12             = _fjsp_sub_v2r8(ix1,jx2);
+             dy12             = _fjsp_sub_v2r8(iy1,jy2);
+             dz12             = _fjsp_sub_v2r8(iz1,jz2);
+             dx13             = _fjsp_sub_v2r8(ix1,jx3);
+             dy13             = _fjsp_sub_v2r8(iy1,jy3);
+             dz13             = _fjsp_sub_v2r8(iz1,jz3);
+             dx21             = _fjsp_sub_v2r8(ix2,jx1);
+             dy21             = _fjsp_sub_v2r8(iy2,jy1);
+             dz21             = _fjsp_sub_v2r8(iz2,jz1);
+             dx22             = _fjsp_sub_v2r8(ix2,jx2);
+             dy22             = _fjsp_sub_v2r8(iy2,jy2);
+             dz22             = _fjsp_sub_v2r8(iz2,jz2);
+             dx23             = _fjsp_sub_v2r8(ix2,jx3);
+             dy23             = _fjsp_sub_v2r8(iy2,jy3);
+             dz23             = _fjsp_sub_v2r8(iz2,jz3);
+             dx31             = _fjsp_sub_v2r8(ix3,jx1);
+             dy31             = _fjsp_sub_v2r8(iy3,jy1);
+             dz31             = _fjsp_sub_v2r8(iz3,jz1);
+             dx32             = _fjsp_sub_v2r8(ix3,jx2);
+             dy32             = _fjsp_sub_v2r8(iy3,jy2);
+             dz32             = _fjsp_sub_v2r8(iz3,jz2);
+             dx33             = _fjsp_sub_v2r8(ix3,jx3);
+             dy33             = _fjsp_sub_v2r8(iy3,jy3);
+             dz33             = _fjsp_sub_v2r8(iz3,jz3);
+             /* Calculate squared distance and things based on it */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+             rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+             rsq13            = gmx_fjsp_calc_rsq_v2r8(dx13,dy13,dz13);
+             rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+             rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+             rsq23            = gmx_fjsp_calc_rsq_v2r8(dx23,dy23,dz23);
+             rsq31            = gmx_fjsp_calc_rsq_v2r8(dx31,dy31,dz31);
+             rsq32            = gmx_fjsp_calc_rsq_v2r8(dx32,dy32,dz32);
+             rsq33            = gmx_fjsp_calc_rsq_v2r8(dx33,dy33,dz33);
+             rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+             rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+             rinv13           = gmx_fjsp_invsqrt_v2r8(rsq13);
+             rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+             rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+             rinv23           = gmx_fjsp_invsqrt_v2r8(rsq23);
+             rinv31           = gmx_fjsp_invsqrt_v2r8(rsq31);
+             rinv32           = gmx_fjsp_invsqrt_v2r8(rsq32);
+             rinv33           = gmx_fjsp_invsqrt_v2r8(rsq33);
+             rinvsq00         = gmx_fjsp_inv_v2r8(rsq00);
+             rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+             rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+             rinvsq13         = _fjsp_mul_v2r8(rinv13,rinv13);
+             rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+             rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+             rinvsq23         = _fjsp_mul_v2r8(rinv23,rinv23);
+             rinvsq31         = _fjsp_mul_v2r8(rinv31,rinv31);
+             rinvsq32         = _fjsp_mul_v2r8(rinv32,rinv32);
+             rinvsq33         = _fjsp_mul_v2r8(rinv33,rinv33);
+             fjx0             = _fjsp_setzero_v2r8();
+             fjy0             = _fjsp_setzero_v2r8();
+             fjz0             = _fjsp_setzero_v2r8();
+             fjx1             = _fjsp_setzero_v2r8();
+             fjy1             = _fjsp_setzero_v2r8();
+             fjz1             = _fjsp_setzero_v2r8();
+             fjx2             = _fjsp_setzero_v2r8();
+             fjy2             = _fjsp_setzero_v2r8();
+             fjz2             = _fjsp_setzero_v2r8();
+             fjx3             = _fjsp_setzero_v2r8();
+             fjy3             = _fjsp_setzero_v2r8();
+             fjz3             = _fjsp_setzero_v2r8();
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+             {
+             /* LENNARD-JONES DISPERSION/REPULSION */
+             rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+             vvdw6            = _fjsp_mul_v2r8(c6_00,rinvsix);
+             vvdw12           = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
+             vvdw             = _fjsp_msub_v2r8(_fjsp_nmsub_v2r8(c12_00,_fjsp_mul_v2r8(sh_vdw_invrcut6,sh_vdw_invrcut6),vvdw12),one_twelfth,
+                                            _fjsp_mul_v2r8(_fjsp_nmsub_v2r8( c6_00,sh_vdw_invrcut6,vvdw6),one_sixth));
+             fvdw             = _fjsp_mul_v2r8(_fjsp_sub_v2r8(vvdw12,vvdw6),rinvsq00);
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             vvdw             = _fjsp_and_v2r8(vvdw,cutoff_mask);
+             vvdw             = _fjsp_unpacklo_v2r8(vvdw,_fjsp_setzero_v2r8());
+             vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+             fscal            = fvdw;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             
+             fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq11,rcutoff2))
+             {
+             /* REACTION-FIELD ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq11,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq11,rinv11),crf));
+             felec            = _fjsp_mul_v2r8(qq11,_fjsp_msub_v2r8(rinv11,rinvsq11,krf2));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq11,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+             
+             fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq12,rcutoff2))
+             {
+             /* REACTION-FIELD ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq12,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq12,rinv12),crf));
+             felec            = _fjsp_mul_v2r8(qq12,_fjsp_msub_v2r8(rinv12,rinvsq12,krf2));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq12,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+             
+             fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq13,rcutoff2))
+             {
+             /* REACTION-FIELD ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq13,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq13,rinv13),crf));
+             felec            = _fjsp_mul_v2r8(qq13,_fjsp_msub_v2r8(rinv13,rinvsq13,krf2));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq13,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx13,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy13,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz13,fscal,fiz1);
+             
+             fjx3             = _fjsp_madd_v2r8(dx13,fscal,fjx3);
+             fjy3             = _fjsp_madd_v2r8(dy13,fscal,fjy3);
+             fjz3             = _fjsp_madd_v2r8(dz13,fscal,fjz3);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq21,rcutoff2))
+             {
+             /* REACTION-FIELD ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq21,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq21,rinv21),crf));
+             felec            = _fjsp_mul_v2r8(qq21,_fjsp_msub_v2r8(rinv21,rinvsq21,krf2));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq21,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+             
+             fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq22,rcutoff2))
+             {
+             /* REACTION-FIELD ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq22,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq22,rinv22),crf));
+             felec            = _fjsp_mul_v2r8(qq22,_fjsp_msub_v2r8(rinv22,rinvsq22,krf2));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq22,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+             
+             fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq23,rcutoff2))
+             {
+             /* REACTION-FIELD ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq23,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq23,rinv23),crf));
+             felec            = _fjsp_mul_v2r8(qq23,_fjsp_msub_v2r8(rinv23,rinvsq23,krf2));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq23,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx23,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy23,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz23,fscal,fiz2);
+             
+             fjx3             = _fjsp_madd_v2r8(dx23,fscal,fjx3);
+             fjy3             = _fjsp_madd_v2r8(dy23,fscal,fjy3);
+             fjz3             = _fjsp_madd_v2r8(dz23,fscal,fjz3);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq31,rcutoff2))
+             {
+             /* REACTION-FIELD ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq31,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq31,rinv31),crf));
+             felec            = _fjsp_mul_v2r8(qq31,_fjsp_msub_v2r8(rinv31,rinvsq31,krf2));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq31,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix3             = _fjsp_madd_v2r8(dx31,fscal,fix3);
+             fiy3             = _fjsp_madd_v2r8(dy31,fscal,fiy3);
+             fiz3             = _fjsp_madd_v2r8(dz31,fscal,fiz3);
+             
+             fjx1             = _fjsp_madd_v2r8(dx31,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy31,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz31,fscal,fjz1);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq32,rcutoff2))
+             {
+             /* REACTION-FIELD ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq32,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq32,rinv32),crf));
+             felec            = _fjsp_mul_v2r8(qq32,_fjsp_msub_v2r8(rinv32,rinvsq32,krf2));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq32,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix3             = _fjsp_madd_v2r8(dx32,fscal,fix3);
+             fiy3             = _fjsp_madd_v2r8(dy32,fscal,fiy3);
+             fiz3             = _fjsp_madd_v2r8(dz32,fscal,fiz3);
+             
+             fjx2             = _fjsp_madd_v2r8(dx32,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy32,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz32,fscal,fjz2);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq33,rcutoff2))
+             {
+             /* REACTION-FIELD ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq33,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq33,rinv33),crf));
+             felec            = _fjsp_mul_v2r8(qq33,_fjsp_msub_v2r8(rinv33,rinvsq33,krf2));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq33,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix3             = _fjsp_madd_v2r8(dx33,fscal,fix3);
+             fiy3             = _fjsp_madd_v2r8(dy33,fscal,fiy3);
+             fiz3             = _fjsp_madd_v2r8(dz33,fscal,fiz3);
+             
+             fjx3             = _fjsp_madd_v2r8(dx33,fscal,fjx3);
+             fjy3             = _fjsp_madd_v2r8(dy33,fscal,fjy3);
+             fjz3             = _fjsp_madd_v2r8(dz33,fscal,fjz3);
+             }
+             gmx_fjsp_decrement_4rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+             /* Inner loop uses 398 flops */
+         }
+         /* End of innermost loop */
+         gmx_fjsp_update_iforce_4atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                               f+i_coord_offset,fshift+i_shift_offset);
+         ggid                        = gid[iidx];
+         /* Update potential energies */
+         gmx_fjsp_update_1pot_v2r8(velecsum,kernel_data->energygrp_elec+ggid);
+         gmx_fjsp_update_1pot_v2r8(vvdwsum,kernel_data->energygrp_vdw+ggid);
+         /* Increment number of inner iterations */
+         inneriter                  += j_index_end - j_index_start;
+         /* Outer loop uses 26 flops */
+     }
+     /* Increment number of outer iterations */
+     outeriter        += nri;
+     /* Update outer/inner flops */
+     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_VF,outeriter*26 + inneriter*398);
+ }
+ /*
+  * Gromacs nonbonded kernel:   nb_kernel_ElecRFCut_VdwLJSh_GeomW4W4_F_sparc64_hpc_ace_double
+  * Electrostatics interaction: ReactionField
+  * VdW interaction:            LennardJones
+  * Geometry:                   Water4-Water4
+  * Calculate force/pot:        Force
+  */
+ void
+ nb_kernel_ElecRFCut_VdwLJSh_GeomW4W4_F_sparc64_hpc_ace_double
+                     (t_nblist * gmx_restrict                nlist,
+                      rvec * gmx_restrict                    xx,
+                      rvec * gmx_restrict                    ff,
+                      t_forcerec * gmx_restrict              fr,
+                      t_mdatoms * gmx_restrict               mdatoms,
+                      nb_kernel_data_t * gmx_restrict        kernel_data,
+                      t_nrnb * gmx_restrict                  nrnb)
+ {
+     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+      * just 0 for non-waters.
+      * Suffixes A,B refer to j loop unrolling done with double precision SIMD, i.e. for the two different
+      * jnr indices corresponding to data put in the two positions in the SIMD register.
+      */
+     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+     int              jnrA,jnrB;
+     int              j_coord_offsetA,j_coord_offsetB;
+     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+     real             rcutoff_scalar;
+     real             *shiftvec,*fshift,*x,*f;
+     _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+     int              vdwioffset0;
+     _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+     int              vdwioffset1;
+     _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+     int              vdwioffset2;
+     _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+     int              vdwioffset3;
+     _fjsp_v2r8       ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+     int              vdwjidx0A,vdwjidx0B;
+     _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+     int              vdwjidx1A,vdwjidx1B;
+     _fjsp_v2r8       jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+     int              vdwjidx2A,vdwjidx2B;
+     _fjsp_v2r8       jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+     int              vdwjidx3A,vdwjidx3B;
+     _fjsp_v2r8       jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
+     _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+     _fjsp_v2r8       dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+     _fjsp_v2r8       dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+     _fjsp_v2r8       dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
+     _fjsp_v2r8       dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+     _fjsp_v2r8       dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+     _fjsp_v2r8       dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
+     _fjsp_v2r8       dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
+     _fjsp_v2r8       dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
+     _fjsp_v2r8       dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
+     _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+     real             *charge;
+     int              nvdwtype;
+     _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+     int              *vdwtype;
+     real             *vdwparam;
+     _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+     _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+     _fjsp_v2r8       itab_tmp;
+     _fjsp_v2r8       dummy_mask,cutoff_mask;
+     _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+     _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+     union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+     x                = xx[0];
+     f                = ff[0];
+     nri              = nlist->nri;
+     iinr             = nlist->iinr;
+     jindex           = nlist->jindex;
+     jjnr             = nlist->jjnr;
+     shiftidx         = nlist->shift;
+     gid              = nlist->gid;
+     shiftvec         = fr->shift_vec[0];
+     fshift           = fr->fshift[0];
+     facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+     charge           = mdatoms->chargeA;
+     krf              = gmx_fjsp_set1_v2r8(fr->ic->k_rf);
+     krf2             = gmx_fjsp_set1_v2r8(fr->ic->k_rf*2.0);
+     crf              = gmx_fjsp_set1_v2r8(fr->ic->c_rf);
+     nvdwtype         = fr->ntype;
+     vdwparam         = fr->nbfp;
+     vdwtype          = mdatoms->typeA;
+     /* Setup water-specific parameters */
+     inr              = nlist->iinr[0];
+     iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+     iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+     iq3              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+3]));
+     vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+     jq1              = gmx_fjsp_set1_v2r8(charge[inr+1]);
+     jq2              = gmx_fjsp_set1_v2r8(charge[inr+2]);
+     jq3              = gmx_fjsp_set1_v2r8(charge[inr+3]);
+     vdwjidx0A        = 2*vdwtype[inr+0];
+     c6_00            = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset0+vdwjidx0A]);
+     c12_00           = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset0+vdwjidx0A+1]);
+     qq11             = _fjsp_mul_v2r8(iq1,jq1);
+     qq12             = _fjsp_mul_v2r8(iq1,jq2);
+     qq13             = _fjsp_mul_v2r8(iq1,jq3);
+     qq21             = _fjsp_mul_v2r8(iq2,jq1);
+     qq22             = _fjsp_mul_v2r8(iq2,jq2);
+     qq23             = _fjsp_mul_v2r8(iq2,jq3);
+     qq31             = _fjsp_mul_v2r8(iq3,jq1);
+     qq32             = _fjsp_mul_v2r8(iq3,jq2);
+     qq33             = _fjsp_mul_v2r8(iq3,jq3);
+     /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+     rcutoff_scalar   = fr->rcoulomb;
+     rcutoff          = gmx_fjsp_set1_v2r8(rcutoff_scalar);
+     rcutoff2         = _fjsp_mul_v2r8(rcutoff,rcutoff);
+     sh_vdw_invrcut6  = gmx_fjsp_set1_v2r8(fr->ic->sh_invrc6);
+     rvdw             = gmx_fjsp_set1_v2r8(fr->rvdw);
+     /* Avoid stupid compiler warnings */
+     jnrA = jnrB = 0;
+     j_coord_offsetA = 0;
+     j_coord_offsetB = 0;
+     outeriter        = 0;
+     inneriter        = 0;
+     /* Start outer loop over neighborlists */
+     for(iidx=0; iidx<nri; iidx++)
+     {
+         /* Load shift vector for this list */
+         i_shift_offset   = DIM*shiftidx[iidx];
+         /* Load limits for loop over neighbors */
+         j_index_start    = jindex[iidx];
+         j_index_end      = jindex[iidx+1];
+         /* Get outer coordinate index */
+         inr              = iinr[iidx];
+         i_coord_offset   = DIM*inr;
+         /* Load i particle coords and add shift vector */
+         gmx_fjsp_load_shift_and_4rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                  &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+         fix0             = _fjsp_setzero_v2r8();
+         fiy0             = _fjsp_setzero_v2r8();
+         fiz0             = _fjsp_setzero_v2r8();
+         fix1             = _fjsp_setzero_v2r8();
+         fiy1             = _fjsp_setzero_v2r8();
+         fiz1             = _fjsp_setzero_v2r8();
+         fix2             = _fjsp_setzero_v2r8();
+         fiy2             = _fjsp_setzero_v2r8();
+         fiz2             = _fjsp_setzero_v2r8();
+         fix3             = _fjsp_setzero_v2r8();
+         fiy3             = _fjsp_setzero_v2r8();
+         fiz3             = _fjsp_setzero_v2r8();
+         /* Start inner kernel loop */
+         for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+         {
+             /* Get j neighbor index, and coordinate index */
+             jnrA             = jjnr[jidx];
+             jnrB             = jjnr[jidx+1];
+             j_coord_offsetA  = DIM*jnrA;
+             j_coord_offsetB  = DIM*jnrB;
+             /* load j atom coordinates */
+             gmx_fjsp_load_4rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
+                                               &jy2,&jz2,&jx3,&jy3,&jz3);
+             /* Calculate displacement vector */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             dx11             = _fjsp_sub_v2r8(ix1,jx1);
+             dy11             = _fjsp_sub_v2r8(iy1,jy1);
+             dz11             = _fjsp_sub_v2r8(iz1,jz1);
+             dx12             = _fjsp_sub_v2r8(ix1,jx2);
+             dy12             = _fjsp_sub_v2r8(iy1,jy2);
+             dz12             = _fjsp_sub_v2r8(iz1,jz2);
+             dx13             = _fjsp_sub_v2r8(ix1,jx3);
+             dy13             = _fjsp_sub_v2r8(iy1,jy3);
+             dz13             = _fjsp_sub_v2r8(iz1,jz3);
+             dx21             = _fjsp_sub_v2r8(ix2,jx1);
+             dy21             = _fjsp_sub_v2r8(iy2,jy1);
+             dz21             = _fjsp_sub_v2r8(iz2,jz1);
+             dx22             = _fjsp_sub_v2r8(ix2,jx2);
+             dy22             = _fjsp_sub_v2r8(iy2,jy2);
+             dz22             = _fjsp_sub_v2r8(iz2,jz2);
+             dx23             = _fjsp_sub_v2r8(ix2,jx3);
+             dy23             = _fjsp_sub_v2r8(iy2,jy3);
+             dz23             = _fjsp_sub_v2r8(iz2,jz3);
+             dx31             = _fjsp_sub_v2r8(ix3,jx1);
+             dy31             = _fjsp_sub_v2r8(iy3,jy1);
+             dz31             = _fjsp_sub_v2r8(iz3,jz1);
+             dx32             = _fjsp_sub_v2r8(ix3,jx2);
+             dy32             = _fjsp_sub_v2r8(iy3,jy2);
+             dz32             = _fjsp_sub_v2r8(iz3,jz2);
+             dx33             = _fjsp_sub_v2r8(ix3,jx3);
+             dy33             = _fjsp_sub_v2r8(iy3,jy3);
+             dz33             = _fjsp_sub_v2r8(iz3,jz3);
+             /* Calculate squared distance and things based on it */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+             rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+             rsq13            = gmx_fjsp_calc_rsq_v2r8(dx13,dy13,dz13);
+             rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+             rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+             rsq23            = gmx_fjsp_calc_rsq_v2r8(dx23,dy23,dz23);
+             rsq31            = gmx_fjsp_calc_rsq_v2r8(dx31,dy31,dz31);
+             rsq32            = gmx_fjsp_calc_rsq_v2r8(dx32,dy32,dz32);
+             rsq33            = gmx_fjsp_calc_rsq_v2r8(dx33,dy33,dz33);
+             rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+             rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+             rinv13           = gmx_fjsp_invsqrt_v2r8(rsq13);
+             rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+             rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+             rinv23           = gmx_fjsp_invsqrt_v2r8(rsq23);
+             rinv31           = gmx_fjsp_invsqrt_v2r8(rsq31);
+             rinv32           = gmx_fjsp_invsqrt_v2r8(rsq32);
+             rinv33           = gmx_fjsp_invsqrt_v2r8(rsq33);
+             rinvsq00         = gmx_fjsp_inv_v2r8(rsq00);
+             rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+             rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+             rinvsq13         = _fjsp_mul_v2r8(rinv13,rinv13);
+             rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+             rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+             rinvsq23         = _fjsp_mul_v2r8(rinv23,rinv23);
+             rinvsq31         = _fjsp_mul_v2r8(rinv31,rinv31);
+             rinvsq32         = _fjsp_mul_v2r8(rinv32,rinv32);
+             rinvsq33         = _fjsp_mul_v2r8(rinv33,rinv33);
+             fjx0             = _fjsp_setzero_v2r8();
+             fjy0             = _fjsp_setzero_v2r8();
+             fjz0             = _fjsp_setzero_v2r8();
+             fjx1             = _fjsp_setzero_v2r8();
+             fjy1             = _fjsp_setzero_v2r8();
+             fjz1             = _fjsp_setzero_v2r8();
+             fjx2             = _fjsp_setzero_v2r8();
+             fjy2             = _fjsp_setzero_v2r8();
+             fjz2             = _fjsp_setzero_v2r8();
+             fjx3             = _fjsp_setzero_v2r8();
+             fjy3             = _fjsp_setzero_v2r8();
+             fjz3             = _fjsp_setzero_v2r8();
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+             {
+             /* LENNARD-JONES DISPERSION/REPULSION */
+             rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+             fvdw             = _fjsp_mul_v2r8(_fjsp_msub_v2r8(c12_00,rinvsix,c6_00),_fjsp_mul_v2r8(rinvsix,rinvsq00));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+             fscal            = fvdw;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             
+             fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq11,rcutoff2))
+             {
+             /* REACTION-FIELD ELECTROSTATICS */
+             felec            = _fjsp_mul_v2r8(qq11,_fjsp_msub_v2r8(rinv11,rinvsq11,krf2));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq11,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+             
+             fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq12,rcutoff2))
+             {
+             /* REACTION-FIELD ELECTROSTATICS */
+             felec            = _fjsp_mul_v2r8(qq12,_fjsp_msub_v2r8(rinv12,rinvsq12,krf2));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq12,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+             
+             fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq13,rcutoff2))
+             {
+             /* REACTION-FIELD ELECTROSTATICS */
+             felec            = _fjsp_mul_v2r8(qq13,_fjsp_msub_v2r8(rinv13,rinvsq13,krf2));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq13,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx13,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy13,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz13,fscal,fiz1);
+             
+             fjx3             = _fjsp_madd_v2r8(dx13,fscal,fjx3);
+             fjy3             = _fjsp_madd_v2r8(dy13,fscal,fjy3);
+             fjz3             = _fjsp_madd_v2r8(dz13,fscal,fjz3);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq21,rcutoff2))
+             {
+             /* REACTION-FIELD ELECTROSTATICS */
+             felec            = _fjsp_mul_v2r8(qq21,_fjsp_msub_v2r8(rinv21,rinvsq21,krf2));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq21,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+             
+             fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq22,rcutoff2))
+             {
+             /* REACTION-FIELD ELECTROSTATICS */
+             felec            = _fjsp_mul_v2r8(qq22,_fjsp_msub_v2r8(rinv22,rinvsq22,krf2));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq22,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+             
+             fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq23,rcutoff2))
+             {
+             /* REACTION-FIELD ELECTROSTATICS */
+             felec            = _fjsp_mul_v2r8(qq23,_fjsp_msub_v2r8(rinv23,rinvsq23,krf2));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq23,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx23,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy23,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz23,fscal,fiz2);
+             
+             fjx3             = _fjsp_madd_v2r8(dx23,fscal,fjx3);
+             fjy3             = _fjsp_madd_v2r8(dy23,fscal,fjy3);
+             fjz3             = _fjsp_madd_v2r8(dz23,fscal,fjz3);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq31,rcutoff2))
+             {
+             /* REACTION-FIELD ELECTROSTATICS */
+             felec            = _fjsp_mul_v2r8(qq31,_fjsp_msub_v2r8(rinv31,rinvsq31,krf2));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq31,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix3             = _fjsp_madd_v2r8(dx31,fscal,fix3);
+             fiy3             = _fjsp_madd_v2r8(dy31,fscal,fiy3);
+             fiz3             = _fjsp_madd_v2r8(dz31,fscal,fiz3);
+             
+             fjx1             = _fjsp_madd_v2r8(dx31,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy31,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz31,fscal,fjz1);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq32,rcutoff2))
+             {
+             /* REACTION-FIELD ELECTROSTATICS */
+             felec            = _fjsp_mul_v2r8(qq32,_fjsp_msub_v2r8(rinv32,rinvsq32,krf2));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq32,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix3             = _fjsp_madd_v2r8(dx32,fscal,fix3);
+             fiy3             = _fjsp_madd_v2r8(dy32,fscal,fiy3);
+             fiz3             = _fjsp_madd_v2r8(dz32,fscal,fiz3);
+             
+             fjx2             = _fjsp_madd_v2r8(dx32,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy32,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz32,fscal,fjz2);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq33,rcutoff2))
+             {
+             /* REACTION-FIELD ELECTROSTATICS */
+             felec            = _fjsp_mul_v2r8(qq33,_fjsp_msub_v2r8(rinv33,rinvsq33,krf2));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq33,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix3             = _fjsp_madd_v2r8(dx33,fscal,fix3);
+             fiy3             = _fjsp_madd_v2r8(dy33,fscal,fiy3);
+             fiz3             = _fjsp_madd_v2r8(dz33,fscal,fiz3);
+             
+             fjx3             = _fjsp_madd_v2r8(dx33,fscal,fjx3);
+             fjy3             = _fjsp_madd_v2r8(dy33,fscal,fjy3);
+             fjz3             = _fjsp_madd_v2r8(dz33,fscal,fjz3);
+             }
+             gmx_fjsp_decrement_4rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+             /* Inner loop uses 333 flops */
+         }
+         if(jidx<j_index_end)
+         {
+             jnrA             = jjnr[jidx];
+             j_coord_offsetA  = DIM*jnrA;
+             /* load j atom coordinates */
+             gmx_fjsp_load_4rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
+                                               &jy2,&jz2,&jx3,&jy3,&jz3);
+             /* Calculate displacement vector */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             dx11             = _fjsp_sub_v2r8(ix1,jx1);
+             dy11             = _fjsp_sub_v2r8(iy1,jy1);
+             dz11             = _fjsp_sub_v2r8(iz1,jz1);
+             dx12             = _fjsp_sub_v2r8(ix1,jx2);
+             dy12             = _fjsp_sub_v2r8(iy1,jy2);
+             dz12             = _fjsp_sub_v2r8(iz1,jz2);
+             dx13             = _fjsp_sub_v2r8(ix1,jx3);
+             dy13             = _fjsp_sub_v2r8(iy1,jy3);
+             dz13             = _fjsp_sub_v2r8(iz1,jz3);
+             dx21             = _fjsp_sub_v2r8(ix2,jx1);
+             dy21             = _fjsp_sub_v2r8(iy2,jy1);
+             dz21             = _fjsp_sub_v2r8(iz2,jz1);
+             dx22             = _fjsp_sub_v2r8(ix2,jx2);
+             dy22             = _fjsp_sub_v2r8(iy2,jy2);
+             dz22             = _fjsp_sub_v2r8(iz2,jz2);
+             dx23             = _fjsp_sub_v2r8(ix2,jx3);
+             dy23             = _fjsp_sub_v2r8(iy2,jy3);
+             dz23             = _fjsp_sub_v2r8(iz2,jz3);
+             dx31             = _fjsp_sub_v2r8(ix3,jx1);
+             dy31             = _fjsp_sub_v2r8(iy3,jy1);
+             dz31             = _fjsp_sub_v2r8(iz3,jz1);
+             dx32             = _fjsp_sub_v2r8(ix3,jx2);
+             dy32             = _fjsp_sub_v2r8(iy3,jy2);
+             dz32             = _fjsp_sub_v2r8(iz3,jz2);
+             dx33             = _fjsp_sub_v2r8(ix3,jx3);
+             dy33             = _fjsp_sub_v2r8(iy3,jy3);
+             dz33             = _fjsp_sub_v2r8(iz3,jz3);
+             /* Calculate squared distance and things based on it */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+             rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+             rsq13            = gmx_fjsp_calc_rsq_v2r8(dx13,dy13,dz13);
+             rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+             rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+             rsq23            = gmx_fjsp_calc_rsq_v2r8(dx23,dy23,dz23);
+             rsq31            = gmx_fjsp_calc_rsq_v2r8(dx31,dy31,dz31);
+             rsq32            = gmx_fjsp_calc_rsq_v2r8(dx32,dy32,dz32);
+             rsq33            = gmx_fjsp_calc_rsq_v2r8(dx33,dy33,dz33);
+             rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+             rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+             rinv13           = gmx_fjsp_invsqrt_v2r8(rsq13);
+             rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+             rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+             rinv23           = gmx_fjsp_invsqrt_v2r8(rsq23);
+             rinv31           = gmx_fjsp_invsqrt_v2r8(rsq31);
+             rinv32           = gmx_fjsp_invsqrt_v2r8(rsq32);
+             rinv33           = gmx_fjsp_invsqrt_v2r8(rsq33);
+             rinvsq00         = gmx_fjsp_inv_v2r8(rsq00);
+             rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+             rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+             rinvsq13         = _fjsp_mul_v2r8(rinv13,rinv13);
+             rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+             rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+             rinvsq23         = _fjsp_mul_v2r8(rinv23,rinv23);
+             rinvsq31         = _fjsp_mul_v2r8(rinv31,rinv31);
+             rinvsq32         = _fjsp_mul_v2r8(rinv32,rinv32);
+             rinvsq33         = _fjsp_mul_v2r8(rinv33,rinv33);
+             fjx0             = _fjsp_setzero_v2r8();
+             fjy0             = _fjsp_setzero_v2r8();
+             fjz0             = _fjsp_setzero_v2r8();
+             fjx1             = _fjsp_setzero_v2r8();
+             fjy1             = _fjsp_setzero_v2r8();
+             fjz1             = _fjsp_setzero_v2r8();
+             fjx2             = _fjsp_setzero_v2r8();
+             fjy2             = _fjsp_setzero_v2r8();
+             fjz2             = _fjsp_setzero_v2r8();
+             fjx3             = _fjsp_setzero_v2r8();
+             fjy3             = _fjsp_setzero_v2r8();
+             fjz3             = _fjsp_setzero_v2r8();
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+             {
+             /* LENNARD-JONES DISPERSION/REPULSION */
+             rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+             fvdw             = _fjsp_mul_v2r8(_fjsp_msub_v2r8(c12_00,rinvsix,c6_00),_fjsp_mul_v2r8(rinvsix,rinvsq00));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+             fscal            = fvdw;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             
+             fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq11,rcutoff2))
+             {
+             /* REACTION-FIELD ELECTROSTATICS */
+             felec            = _fjsp_mul_v2r8(qq11,_fjsp_msub_v2r8(rinv11,rinvsq11,krf2));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq11,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+             
+             fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq12,rcutoff2))
+             {
+             /* REACTION-FIELD ELECTROSTATICS */
+             felec            = _fjsp_mul_v2r8(qq12,_fjsp_msub_v2r8(rinv12,rinvsq12,krf2));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq12,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+             
+             fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq13,rcutoff2))
+             {
+             /* REACTION-FIELD ELECTROSTATICS */
+             felec            = _fjsp_mul_v2r8(qq13,_fjsp_msub_v2r8(rinv13,rinvsq13,krf2));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq13,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx13,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy13,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz13,fscal,fiz1);
+             
+             fjx3             = _fjsp_madd_v2r8(dx13,fscal,fjx3);
+             fjy3             = _fjsp_madd_v2r8(dy13,fscal,fjy3);
+             fjz3             = _fjsp_madd_v2r8(dz13,fscal,fjz3);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq21,rcutoff2))
+             {
+             /* REACTION-FIELD ELECTROSTATICS */
+             felec            = _fjsp_mul_v2r8(qq21,_fjsp_msub_v2r8(rinv21,rinvsq21,krf2));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq21,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+             
+             fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq22,rcutoff2))
+             {
+             /* REACTION-FIELD ELECTROSTATICS */
+             felec            = _fjsp_mul_v2r8(qq22,_fjsp_msub_v2r8(rinv22,rinvsq22,krf2));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq22,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+             
+             fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq23,rcutoff2))
+             {
+             /* REACTION-FIELD ELECTROSTATICS */
+             felec            = _fjsp_mul_v2r8(qq23,_fjsp_msub_v2r8(rinv23,rinvsq23,krf2));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq23,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx23,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy23,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz23,fscal,fiz2);
+             
+             fjx3             = _fjsp_madd_v2r8(dx23,fscal,fjx3);
+             fjy3             = _fjsp_madd_v2r8(dy23,fscal,fjy3);
+             fjz3             = _fjsp_madd_v2r8(dz23,fscal,fjz3);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq31,rcutoff2))
+             {
+             /* REACTION-FIELD ELECTROSTATICS */
+             felec            = _fjsp_mul_v2r8(qq31,_fjsp_msub_v2r8(rinv31,rinvsq31,krf2));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq31,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix3             = _fjsp_madd_v2r8(dx31,fscal,fix3);
+             fiy3             = _fjsp_madd_v2r8(dy31,fscal,fiy3);
+             fiz3             = _fjsp_madd_v2r8(dz31,fscal,fiz3);
+             
+             fjx1             = _fjsp_madd_v2r8(dx31,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy31,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz31,fscal,fjz1);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq32,rcutoff2))
+             {
+             /* REACTION-FIELD ELECTROSTATICS */
+             felec            = _fjsp_mul_v2r8(qq32,_fjsp_msub_v2r8(rinv32,rinvsq32,krf2));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq32,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix3             = _fjsp_madd_v2r8(dx32,fscal,fix3);
+             fiy3             = _fjsp_madd_v2r8(dy32,fscal,fiy3);
+             fiz3             = _fjsp_madd_v2r8(dz32,fscal,fiz3);
+             
+             fjx2             = _fjsp_madd_v2r8(dx32,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy32,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz32,fscal,fjz2);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq33,rcutoff2))
+             {
+             /* REACTION-FIELD ELECTROSTATICS */
+             felec            = _fjsp_mul_v2r8(qq33,_fjsp_msub_v2r8(rinv33,rinvsq33,krf2));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq33,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix3             = _fjsp_madd_v2r8(dx33,fscal,fix3);
+             fiy3             = _fjsp_madd_v2r8(dy33,fscal,fiy3);
+             fiz3             = _fjsp_madd_v2r8(dz33,fscal,fiz3);
+             
+             fjx3             = _fjsp_madd_v2r8(dx33,fscal,fjx3);
+             fjy3             = _fjsp_madd_v2r8(dy33,fscal,fjy3);
+             fjz3             = _fjsp_madd_v2r8(dz33,fscal,fjz3);
+             }
+             gmx_fjsp_decrement_4rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+             /* Inner loop uses 333 flops */
+         }
+         /* End of innermost loop */
+         gmx_fjsp_update_iforce_4atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                               f+i_coord_offset,fshift+i_shift_offset);
+         /* Increment number of inner iterations */
+         inneriter                  += j_index_end - j_index_start;
+         /* Outer loop uses 24 flops */
+     }
+     /* Increment number of outer iterations */
+     outeriter        += nri;
+     /* Update outer/inner flops */
+     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_F,outeriter*24 + inneriter*333);
+ }
index 0000000000000000000000000000000000000000,dd908d3059bddc8ca5cd81022c9cf4c04c1cfabc..dd908d3059bddc8ca5cd81022c9cf4c04c1cfabc
mode 000000,100644..100644
--- /dev/null
@@@ -1,0 -1,683 +1,683 @@@
+ /*
+  * This file is part of the GROMACS molecular simulation package.
+  *
+  * Copyright (c) 2012, by the GROMACS development team, led by
+  * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+  * others, as listed in the AUTHORS file in the top-level source
+  * directory and at http://www.gromacs.org.
+  *
+  * GROMACS is free software; you can redistribute it and/or
+  * modify it under the terms of the GNU Lesser General Public License
+  * as published by the Free Software Foundation; either version 2.1
+  * of the License, or (at your option) any later version.
+  *
+  * GROMACS is distributed in the hope that it will be useful,
+  * but WITHOUT ANY WARRANTY; without even the implied warranty of
+  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  * Lesser General Public License for more details.
+  *
+  * You should have received a copy of the GNU Lesser General Public
+  * License along with GROMACS; if not, see
+  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+  *
+  * If you want to redistribute modifications to GROMACS, please
+  * consider that scientific software is very special. Version
+  * control is crucial - bugs must be traceable. We will be happy to
+  * consider code for inclusion in the official distribution, but
+  * derived work must not be called official GROMACS. Details are found
+  * in the README & COPYING files - if they are missing, get the
+  * official version at http://www.gromacs.org.
+  *
+  * To help us fund GROMACS development, we humbly ask that you cite
+  * the research papers on the package. Check out http://www.gromacs.org.
+  */
+ /*
+  * Note: this file was generated by the GROMACS sparc64_hpc_ace_double kernel generator.
+  */
+ #ifdef HAVE_CONFIG_H
+ #include <config.h>
+ #endif
+ #include <math.h>
+ #include "../nb_kernel.h"
+ #include "types/simple.h"
+ #include "vec.h"
+ #include "nrnb.h"
+ #include "kernelutil_sparc64_hpc_ace_double.h"
+ /*
+  * Gromacs nonbonded kernel:   nb_kernel_ElecRFCut_VdwLJSw_GeomP1P1_VF_sparc64_hpc_ace_double
+  * Electrostatics interaction: ReactionField (contributions masked with rsq < rcutoff^2)
+  * VdW interaction:            LennardJones (multiplied by a switch function for r > rvdw_switch, masked at the cutoff)
+  * Geometry:                   Particle-Particle
+  * Calculate force/pot:        PotentialAndForce (forces go to ff and fr->fshift, energies to kernel_data->energygrp_elec/vdw)
+  */
+ void
+ nb_kernel_ElecRFCut_VdwLJSw_GeomP1P1_VF_sparc64_hpc_ace_double
+                     (t_nblist * gmx_restrict                nlist,
+                      rvec * gmx_restrict                    xx,
+                      rvec * gmx_restrict                    ff,
+                      t_forcerec * gmx_restrict              fr,
+                      t_mdatoms * gmx_restrict               mdatoms,
+                      nb_kernel_data_t * gmx_restrict        kernel_data,
+                      t_nrnb * gmx_restrict                  nrnb)
+ {
+     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+      * just 0 for non-waters (this particle-particle kernel only uses suffix 0).
+      * Suffixes A,B refer to j loop unrolling done with double precision SIMD, i.e. the two different
+      * jnr indices corresponding to data put in the two positions in the SIMD register.
+      */
+     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+     int              jnrA,jnrB;
+     int              j_coord_offsetA,j_coord_offsetB;
+     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+     real             rcutoff_scalar;
+     real             *shiftvec,*fshift,*x,*f;
+     _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall; /* tx,ty,tz,jidxall are not referenced in this kernel */
+     int              vdwioffset0;
+     _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0; /* isai0 is not referenced in this kernel */
+     int              vdwjidx0A,vdwjidx0B;
+     _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0; /* fjx0,fjy0,fjz0,isaj0 are not referenced in this kernel */
+     _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+     _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+     real             *charge;
+     int              nvdwtype;
+     _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6; /* rvdw,fvdw6,fvdw12,sh_vdw_invrcut6 are not referenced in this kernel */
+     int              *vdwtype;
+     real             *vdwparam;
+     _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+     _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+     _fjsp_v2r8       rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
+     real             rswitch_scalar,d_scalar;
+     _fjsp_v2r8       itab_tmp; /* not referenced in this kernel */
+     _fjsp_v2r8       dummy_mask,cutoff_mask; /* dummy_mask is not referenced in this kernel */
+     _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+     _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0); /* not referenced in this kernel */
+     union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv; /* not referenced in this kernel */
+     x                = xx[0];
+     f                = ff[0];
+     nri              = nlist->nri;
+     iinr             = nlist->iinr;
+     jindex           = nlist->jindex;
+     jjnr             = nlist->jjnr;
+     shiftidx         = nlist->shift;
+     gid              = nlist->gid;
+     shiftvec         = fr->shift_vec[0];
+     fshift           = fr->fshift[0];
+     facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+     charge           = mdatoms->chargeA;
+     krf              = gmx_fjsp_set1_v2r8(fr->ic->k_rf);
+     krf2             = gmx_fjsp_set1_v2r8(fr->ic->k_rf*2.0); /* 2*k_rf, used in the force expression */
+     crf              = gmx_fjsp_set1_v2r8(fr->ic->c_rf);
+     nvdwtype         = fr->ntype;
+     vdwparam         = fr->nbfp;
+     vdwtype          = mdatoms->typeA;
+     /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+     rcutoff_scalar   = fr->rcoulomb;
+     rcutoff          = gmx_fjsp_set1_v2r8(rcutoff_scalar);
+     rcutoff2         = _fjsp_mul_v2r8(rcutoff,rcutoff);
+     rswitch_scalar   = fr->rvdw_switch;
+     rswitch          = gmx_fjsp_set1_v2r8(rswitch_scalar);
+     /* Setup switch parameters: coefficients of sw(d) (swV*) and of dsw(d) (swF*), scaled by powers of the switch width d_scalar = rcutoff - rswitch */
+     d_scalar         = rcutoff_scalar-rswitch_scalar;
+     d                = gmx_fjsp_set1_v2r8(d_scalar); /* NOTE(review): this value is never read; d is reassigned in the j loops before use */
+     swV3             = gmx_fjsp_set1_v2r8(-10.0/(d_scalar*d_scalar*d_scalar));
+     swV4             = gmx_fjsp_set1_v2r8( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
+     swV5             = gmx_fjsp_set1_v2r8( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
+     swF2             = gmx_fjsp_set1_v2r8(-30.0/(d_scalar*d_scalar*d_scalar));
+     swF3             = gmx_fjsp_set1_v2r8( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
+     swF4             = gmx_fjsp_set1_v2r8(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
+     /* Give these a defined value up front to silence compiler warnings */
+     jnrA = jnrB = 0;
+     j_coord_offsetA = 0;
+     j_coord_offsetB = 0;
+     outeriter        = 0;
+     inneriter        = 0;
+     /* Start outer loop over neighborlists: one i particle per list entry */
+     for(iidx=0; iidx<nri; iidx++)
+     {
+         /* Load shift vector for this list */
+         i_shift_offset   = DIM*shiftidx[iidx];
+         /* Load limits for loop over neighbors */
+         j_index_start    = jindex[iidx];
+         j_index_end      = jindex[iidx+1];
+         /* Get outer coordinate index */
+         inr              = iinr[iidx];
+         i_coord_offset   = DIM*inr;
+         /* Load i particle coords and add shift vector */
+         gmx_fjsp_load_shift_and_1rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
+         fix0             = _fjsp_setzero_v2r8();
+         fiy0             = _fjsp_setzero_v2r8();
+         fiz0             = _fjsp_setzero_v2r8();
+         /* Load parameters for i particles (the charge is pre-multiplied by facel) */
+         iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_load1_v2r8(charge+inr+0));
+         vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+         /* Reset potential sums */
+         velecsum         = _fjsp_setzero_v2r8();
+         vvdwsum          = _fjsp_setzero_v2r8();
+         /* Start inner kernel loop: j neighbors are handled in pairs (A,B); a leftover single neighbor is handled after the loop */
+         for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+         {
+             /* Get j neighbor index, and coordinate index */
+             jnrA             = jjnr[jidx];
+             jnrB             = jjnr[jidx+1];
+             j_coord_offsetA  = DIM*jnrA;
+             j_coord_offsetB  = DIM*jnrB;
+             /* load j atom coordinates */
+             gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                               &jx0,&jy0,&jz0);
+             /* Calculate displacement vector */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             /* Calculate squared distance and things based on it */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+             rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+             /* Load parameters for j particles */
+             jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+             vdwjidx0A        = 2*vdwtype[jnrA+0];
+             vdwjidx0B        = 2*vdwtype[jnrB+0];
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2)) /* presumably true when at least one element is inside the cutoff -- confirm in kernelutil header */
+             {
+             r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+             /* Compute parameters for interactions between i and j atoms */
+             qq00             = _fjsp_mul_v2r8(iq0,jq0);
+             gmx_fjsp_load_2pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,
+                                          vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
+             /* REACTION-FIELD ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq00,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq00,rinv00),crf));
+             felec            = _fjsp_mul_v2r8(qq00,_fjsp_msub_v2r8(rinv00,rinvsq00,krf2));
+             /* LENNARD-JONES DISPERSION/REPULSION */
+             rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+             vvdw6            = _fjsp_mul_v2r8(c6_00,rinvsix);
+             vvdw12           = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
+             vvdw             = _fjsp_msub_v2r8( vvdw12,one_twelfth, _fjsp_mul_v2r8(vvdw6,one_sixth) );
+             fvdw             = _fjsp_mul_v2r8(_fjsp_sub_v2r8(vvdw12,vvdw6),rinvsq00);
+             d                = _fjsp_sub_v2r8(r00,rswitch);
+             d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8()); /* clamp at zero so that sw=1 and dsw=0 for r <= rswitch */
+             d2               = _fjsp_mul_v2r8(d,d);
+             sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3)))); /* 1 + d^3*(swV3 + d*swV4 + d^2*swV5), assuming madd(a,b,c)=a*b+c */
+             dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2)); /* d^2*(swF2 + d*swF3 + d^2*swF4), same assumption */
+             /* Evaluate switch function */
+             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+             fvdw             = _fjsp_msub_v2r8( fvdw,sw , _fjsp_mul_v2r8(rinv00,_fjsp_mul_v2r8(vvdw,dsw)) );
+             vvdw             = _fjsp_mul_v2r8(vvdw,sw);
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             vvdw             = _fjsp_and_v2r8(vvdw,cutoff_mask);
+             vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+             fscal            = _fjsp_add_v2r8(felec,fvdw);
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             
+             gmx_fjsp_decrement_fma_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fscal,dx00,dy00,dz00);
+             }
+             /* Inner loop uses 73 flops */
+         }
+         if(jidx<j_index_end) /* one j neighbor is left over when the neighbor count is odd */
+         {
+             jnrA             = jjnr[jidx];
+             j_coord_offsetA  = DIM*jnrA;
+             /* load j atom coordinates */
+             gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                               &jx0,&jy0,&jz0);
+             /* Calculate displacement vector */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             /* Calculate squared distance and things based on it */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+             rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+             /* Load parameters for j particles */
+             jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0); /* presumably fills only the low element, leaving the other zero -- confirm */
+             vdwjidx0A        = 2*vdwtype[jnrA+0];
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+             {
+             r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+             /* Compute parameters for interactions between i and j atoms */
+             qq00             = _fjsp_mul_v2r8(iq0,jq0);
+             gmx_fjsp_load_1pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
+             /* REACTION-FIELD ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq00,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq00,rinv00),crf));
+             felec            = _fjsp_mul_v2r8(qq00,_fjsp_msub_v2r8(rinv00,rinvsq00,krf2));
+             /* LENNARD-JONES DISPERSION/REPULSION */
+             rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+             vvdw6            = _fjsp_mul_v2r8(c6_00,rinvsix);
+             vvdw12           = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
+             vvdw             = _fjsp_msub_v2r8( vvdw12,one_twelfth, _fjsp_mul_v2r8(vvdw6,one_sixth) );
+             fvdw             = _fjsp_mul_v2r8(_fjsp_sub_v2r8(vvdw12,vvdw6),rinvsq00);
+             d                = _fjsp_sub_v2r8(r00,rswitch);
+             d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+             d2               = _fjsp_mul_v2r8(d,d);
+             sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+             dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+             /* Evaluate switch function */
+             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+             fvdw             = _fjsp_msub_v2r8( fvdw,sw , _fjsp_mul_v2r8(rinv00,_fjsp_mul_v2r8(vvdw,dsw)) );
+             vvdw             = _fjsp_mul_v2r8(vvdw,sw);
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8()); /* presumably keeps the low element and zeroes the unused one -- confirm against the intrinsic docs */
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             vvdw             = _fjsp_and_v2r8(vvdw,cutoff_mask);
+             vvdw             = _fjsp_unpacklo_v2r8(vvdw,_fjsp_setzero_v2r8());
+             vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+             fscal            = _fjsp_add_v2r8(felec,fvdw);
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             
+             gmx_fjsp_decrement_fma_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fscal,dx00,dy00,dz00);
+             }
+             /* Inner loop uses 73 flops */
+         }
+         /* End of innermost loop: hand the accumulated i force to f and fshift */
+         gmx_fjsp_update_iforce_1atom_swizzle_v2r8(fix0,fiy0,fiz0,
+                                               f+i_coord_offset,fshift+i_shift_offset);
+         ggid                        = gid[iidx];
+         /* Update potential energies for the energy group of this list entry */
+         gmx_fjsp_update_1pot_v2r8(velecsum,kernel_data->energygrp_elec+ggid);
+         gmx_fjsp_update_1pot_v2r8(vvdwsum,kernel_data->energygrp_vdw+ggid);
+         /* Increment number of inner iterations */
+         inneriter                  += j_index_end - j_index_start;
+         /* Outer loop uses 9 flops */
+     }
+     /* Increment number of outer iterations */
+     outeriter        += nri;
+     /* Update outer/inner flops: 9 per outer iteration plus 73 per j neighbor */
+     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_VF,outeriter*9 + inneriter*73);
+ }
+ /*
+  * Gromacs nonbonded kernel:   nb_kernel_ElecRFCut_VdwLJSw_GeomP1P1_F_sparc64_hpc_ace_double
+  * Electrostatics interaction: ReactionField (contributions masked with rsq < rcutoff^2)
+  * VdW interaction:            LennardJones (multiplied by a switch function for r > rvdw_switch, masked at the cutoff)
+  * Geometry:                   Particle-Particle
+  * Calculate force/pot:        Force (no energies are accumulated; kernel_data is not referenced)
+  */
+ void
+ nb_kernel_ElecRFCut_VdwLJSw_GeomP1P1_F_sparc64_hpc_ace_double
+                     (t_nblist * gmx_restrict                nlist,
+                      rvec * gmx_restrict                    xx,
+                      rvec * gmx_restrict                    ff,
+                      t_forcerec * gmx_restrict              fr,
+                      t_mdatoms * gmx_restrict               mdatoms,
+                      nb_kernel_data_t * gmx_restrict        kernel_data,
+                      t_nrnb * gmx_restrict                  nrnb)
+ {
+     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+      * just 0 for non-waters (this particle-particle kernel only uses suffix 0).
+      * Suffixes A,B refer to j loop unrolling done with double precision SIMD, i.e. the two different
+      * jnr indices corresponding to data put in the two positions in the SIMD register.
+      */
+     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx; /* ggid is not referenced in this force-only kernel */
+     int              jnrA,jnrB;
+     int              j_coord_offsetA,j_coord_offsetB;
+     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+     real             rcutoff_scalar;
+     real             *shiftvec,*fshift,*x,*f;
+     _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall; /* tx,ty,tz,jidxall are not referenced in this kernel */
+     int              vdwioffset0;
+     _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0; /* isai0 is not referenced in this kernel */
+     int              vdwjidx0A,vdwjidx0B;
+     _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0; /* fjx0,fjy0,fjz0,isaj0 are not referenced in this kernel */
+     _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+     _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2; /* velec,velecsum are not referenced in this force-only kernel */
+     real             *charge;
+     int              nvdwtype;
+     _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6; /* rvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6 are not referenced in this kernel */
+     int              *vdwtype;
+     real             *vdwparam;
+     _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+     _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+     _fjsp_v2r8       rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
+     real             rswitch_scalar,d_scalar;
+     _fjsp_v2r8       itab_tmp; /* not referenced in this kernel */
+     _fjsp_v2r8       dummy_mask,cutoff_mask; /* dummy_mask is not referenced in this kernel */
+     _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+     _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0); /* not referenced in this kernel */
+     union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv; /* not referenced in this kernel */
+     x                = xx[0];
+     f                = ff[0];
+     nri              = nlist->nri;
+     iinr             = nlist->iinr;
+     jindex           = nlist->jindex;
+     jjnr             = nlist->jjnr;
+     shiftidx         = nlist->shift;
+     gid              = nlist->gid; /* set but never read in this force-only kernel */
+     shiftvec         = fr->shift_vec[0];
+     fshift           = fr->fshift[0];
+     facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+     charge           = mdatoms->chargeA;
+     krf              = gmx_fjsp_set1_v2r8(fr->ic->k_rf); /* set but never read here; only krf2 enters the force */
+     krf2             = gmx_fjsp_set1_v2r8(fr->ic->k_rf*2.0); /* 2*k_rf, used in the force expression */
+     crf              = gmx_fjsp_set1_v2r8(fr->ic->c_rf); /* set but never read in this force-only kernel */
+     nvdwtype         = fr->ntype;
+     vdwparam         = fr->nbfp;
+     vdwtype          = mdatoms->typeA;
+     /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+     rcutoff_scalar   = fr->rcoulomb;
+     rcutoff          = gmx_fjsp_set1_v2r8(rcutoff_scalar);
+     rcutoff2         = _fjsp_mul_v2r8(rcutoff,rcutoff);
+     rswitch_scalar   = fr->rvdw_switch;
+     rswitch          = gmx_fjsp_set1_v2r8(rswitch_scalar);
+     /* Setup switch parameters: coefficients of sw(d) (swV*) and of dsw(d) (swF*), scaled by powers of the switch width d_scalar = rcutoff - rswitch */
+     d_scalar         = rcutoff_scalar-rswitch_scalar;
+     d                = gmx_fjsp_set1_v2r8(d_scalar); /* NOTE(review): this value is never read; d is reassigned in the j loops before use */
+     swV3             = gmx_fjsp_set1_v2r8(-10.0/(d_scalar*d_scalar*d_scalar));
+     swV4             = gmx_fjsp_set1_v2r8( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
+     swV5             = gmx_fjsp_set1_v2r8( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
+     swF2             = gmx_fjsp_set1_v2r8(-30.0/(d_scalar*d_scalar*d_scalar));
+     swF3             = gmx_fjsp_set1_v2r8( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
+     swF4             = gmx_fjsp_set1_v2r8(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
+     /* Give these a defined value up front to silence compiler warnings */
+     jnrA = jnrB = 0;
+     j_coord_offsetA = 0;
+     j_coord_offsetB = 0;
+     outeriter        = 0;
+     inneriter        = 0;
+     /* Start outer loop over neighborlists: one i particle per list entry */
+     for(iidx=0; iidx<nri; iidx++)
+     {
+         /* Load shift vector for this list */
+         i_shift_offset   = DIM*shiftidx[iidx];
+         /* Load limits for loop over neighbors */
+         j_index_start    = jindex[iidx];
+         j_index_end      = jindex[iidx+1];
+         /* Get outer coordinate index */
+         inr              = iinr[iidx];
+         i_coord_offset   = DIM*inr;
+         /* Load i particle coords and add shift vector */
+         gmx_fjsp_load_shift_and_1rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
+         fix0             = _fjsp_setzero_v2r8();
+         fiy0             = _fjsp_setzero_v2r8();
+         fiz0             = _fjsp_setzero_v2r8();
+         /* Load parameters for i particles (the charge is pre-multiplied by facel) */
+         iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_load1_v2r8(charge+inr+0));
+         vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+         /* Start inner kernel loop: j neighbors are handled in pairs (A,B); a leftover single neighbor is handled after the loop */
+         for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+         {
+             /* Get j neighbor index, and coordinate index */
+             jnrA             = jjnr[jidx];
+             jnrB             = jjnr[jidx+1];
+             j_coord_offsetA  = DIM*jnrA;
+             j_coord_offsetB  = DIM*jnrB;
+             /* load j atom coordinates */
+             gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                               &jx0,&jy0,&jz0);
+             /* Calculate displacement vector */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             /* Calculate squared distance and things based on it */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+             rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+             /* Load parameters for j particles */
+             jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+             vdwjidx0A        = 2*vdwtype[jnrA+0];
+             vdwjidx0B        = 2*vdwtype[jnrB+0];
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2)) /* presumably true when at least one element is inside the cutoff -- confirm in kernelutil header */
+             {
+             r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+             /* Compute parameters for interactions between i and j atoms */
+             qq00             = _fjsp_mul_v2r8(iq0,jq0);
+             gmx_fjsp_load_2pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,
+                                          vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
+             /* REACTION-FIELD ELECTROSTATICS */
+             felec            = _fjsp_mul_v2r8(qq00,_fjsp_msub_v2r8(rinv00,rinvsq00,krf2));
+             /* LENNARD-JONES DISPERSION/REPULSION */
+             rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+             vvdw6            = _fjsp_mul_v2r8(c6_00,rinvsix);
+             vvdw12           = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
+             vvdw             = _fjsp_msub_v2r8( vvdw12,one_twelfth, _fjsp_mul_v2r8(vvdw6,one_sixth) ); /* the unswitched potential is still needed for the v*dsw force term below */
+             fvdw             = _fjsp_mul_v2r8(_fjsp_sub_v2r8(vvdw12,vvdw6),rinvsq00);
+             d                = _fjsp_sub_v2r8(r00,rswitch);
+             d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8()); /* clamp at zero so that sw=1 and dsw=0 for r <= rswitch */
+             d2               = _fjsp_mul_v2r8(d,d);
+             sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3)))); /* 1 + d^3*(swV3 + d*swV4 + d^2*swV5), assuming madd(a,b,c)=a*b+c */
+             dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2)); /* d^2*(swF2 + d*swF3 + d^2*swF4), same assumption */
+             /* Evaluate switch function */
+             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+             fvdw             = _fjsp_msub_v2r8( fvdw,sw , _fjsp_mul_v2r8(rinv00,_fjsp_mul_v2r8(vvdw,dsw)) );
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+             fscal            = _fjsp_add_v2r8(felec,fvdw);
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             
+             gmx_fjsp_decrement_fma_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fscal,dx00,dy00,dz00);
+             }
+             /* Inner loop uses 64 flops */
+         }
+         if(jidx<j_index_end) /* one j neighbor is left over when the neighbor count is odd */
+         {
+             jnrA             = jjnr[jidx];
+             j_coord_offsetA  = DIM*jnrA;
+             /* load j atom coordinates */
+             gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                               &jx0,&jy0,&jz0);
+             /* Calculate displacement vector */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             /* Calculate squared distance and things based on it */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+             rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+             /* Load parameters for j particles */
+             jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0); /* presumably fills only the low element, leaving the other zero -- confirm */
+             vdwjidx0A        = 2*vdwtype[jnrA+0];
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+             {
+             r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+             /* Compute parameters for interactions between i and j atoms */
+             qq00             = _fjsp_mul_v2r8(iq0,jq0);
+             gmx_fjsp_load_1pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
+             /* REACTION-FIELD ELECTROSTATICS */
+             felec            = _fjsp_mul_v2r8(qq00,_fjsp_msub_v2r8(rinv00,rinvsq00,krf2));
+             /* LENNARD-JONES DISPERSION/REPULSION */
+             rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+             vvdw6            = _fjsp_mul_v2r8(c6_00,rinvsix);
+             vvdw12           = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
+             vvdw             = _fjsp_msub_v2r8( vvdw12,one_twelfth, _fjsp_mul_v2r8(vvdw6,one_sixth) );
+             fvdw             = _fjsp_mul_v2r8(_fjsp_sub_v2r8(vvdw12,vvdw6),rinvsq00);
+             d                = _fjsp_sub_v2r8(r00,rswitch);
+             d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+             d2               = _fjsp_mul_v2r8(d,d);
+             sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+             dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+             /* Evaluate switch function */
+             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+             fvdw             = _fjsp_msub_v2r8( fvdw,sw , _fjsp_mul_v2r8(rinv00,_fjsp_mul_v2r8(vvdw,dsw)) );
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+             fscal            = _fjsp_add_v2r8(felec,fvdw);
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8()); /* presumably keeps the low element and zeroes the unused one -- confirm against the intrinsic docs */
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             
+             gmx_fjsp_decrement_fma_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fscal,dx00,dy00,dz00);
+             }
+             /* Inner loop uses 64 flops */
+         }
+         /* End of innermost loop: hand the accumulated i force to f and fshift */
+         gmx_fjsp_update_iforce_1atom_swizzle_v2r8(fix0,fiy0,fiz0,
+                                               f+i_coord_offset,fshift+i_shift_offset);
+         /* Increment number of inner iterations */
+         inneriter                  += j_index_end - j_index_start;
+         /* Outer loop uses 7 flops */
+     }
+     /* Increment number of outer iterations */
+     outeriter        += nri;
+     /* Update outer/inner flops: 7 per outer iteration plus 64 per j neighbor */
+     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_F,outeriter*7 + inneriter*64);
+ }
index 0000000000000000000000000000000000000000,36c26d1bb954f981ae9460ef1703217615715055..36c26d1bb954f981ae9460ef1703217615715055
mode 000000,100644..100644
--- /dev/null
@@@ -1,0 -1,1065 +1,1065 @@@
+ /*
+  * This file is part of the GROMACS molecular simulation package.
+  *
+  * Copyright (c) 2012, by the GROMACS development team, led by
+  * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+  * others, as listed in the AUTHORS file in the top-level source
+  * directory and at http://www.gromacs.org.
+  *
+  * GROMACS is free software; you can redistribute it and/or
+  * modify it under the terms of the GNU Lesser General Public License
+  * as published by the Free Software Foundation; either version 2.1
+  * of the License, or (at your option) any later version.
+  *
+  * GROMACS is distributed in the hope that it will be useful,
+  * but WITHOUT ANY WARRANTY; without even the implied warranty of
+  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  * Lesser General Public License for more details.
+  *
+  * You should have received a copy of the GNU Lesser General Public
+  * License along with GROMACS; if not, see
+  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+  *
+  * If you want to redistribute modifications to GROMACS, please
+  * consider that scientific software is very special. Version
+  * control is crucial - bugs must be traceable. We will be happy to
+  * consider code for inclusion in the official distribution, but
+  * derived work must not be called official GROMACS. Details are found
+  * in the README & COPYING files - if they are missing, get the
+  * official version at http://www.gromacs.org.
+  *
+  * To help us fund GROMACS development, we humbly ask that you cite
+  * the research papers on the package. Check out http://www.gromacs.org.
+  */
+ /*
+  * Note: this file was generated by the GROMACS sparc64_hpc_ace_double kernel generator.
+  */
+ #ifdef HAVE_CONFIG_H
+ #include <config.h>
+ #endif
+ #include <math.h>
+ #include "../nb_kernel.h"
+ #include "types/simple.h"
+ #include "vec.h"
+ #include "nrnb.h"
+ #include "kernelutil_sparc64_hpc_ace_double.h"
+ /*
+  * Gromacs nonbonded kernel:   nb_kernel_ElecRFCut_VdwLJSw_GeomW3P1_VF_sparc64_hpc_ace_double
+  * Electrostatics interaction: ReactionField
+  * VdW interaction:            LennardJones
+  * Geometry:                   Water3-Particle
+  * Calculate force/pot:        PotentialAndForce
+  *
+  * For every i entry of the neighborlist (three sites, suffixes 0,1,2) the kernel loops over
+  * the listed j particles, two per iteration. Site 0 interacts through reaction-field
+  * electrostatics plus Lennard-Jones multiplied by a switch function (sw/dsw below);
+  * sites 1 and 2 interact through reaction-field electrostatics only. Contributions of
+  * pairs that fail the rsq < rcutoff2 comparison are masked with cutoff_mask.
+  *
+  * nlist       - neighborlist; nri, iinr, jindex, jjnr, shift and gid are read
+  * xx          - coordinates (read)
+  * ff          - forces; modified through the gmx_fjsp_decrement_* and
+  *               gmx_fjsp_update_iforce_3atom_swizzle_v2r8 helpers
+  * fr          - supplies shift_vec, fshift, epsfac, ic->k_rf, ic->c_rf, ntype, nbfp,
+  *               rcoulomb and rvdw_switch
+  * mdatoms     - supplies chargeA and typeA
+  * kernel_data - energygrp_elec and energygrp_vdw are passed to gmx_fjsp_update_1pot_v2r8
+  *               together with the summed potentials
+  * nrnb        - flop accounting, updated through inc_nrnb()
+  */
+ void
+ nb_kernel_ElecRFCut_VdwLJSw_GeomW3P1_VF_sparc64_hpc_ace_double
+                     (t_nblist * gmx_restrict                nlist,
+                      rvec * gmx_restrict                    xx,
+                      rvec * gmx_restrict                    ff,
+                      t_forcerec * gmx_restrict              fr,
+                      t_mdatoms * gmx_restrict               mdatoms,
+                      nb_kernel_data_t * gmx_restrict        kernel_data,
+                      t_nrnb * gmx_restrict                  nrnb)
+ {
+     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+      * just 0 for non-waters.
+      * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+      * jnr indices corresponding to data put in the two positions in the SIMD register.
+      * Several of the variables declared below are never referenced in this kernel; the
+      * declarations are left untouched.
+      */
+     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+     int              jnrA,jnrB;
+     int              j_coord_offsetA,j_coord_offsetB;
+     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+     real             rcutoff_scalar;
+     real             *shiftvec,*fshift,*x,*f;
+     _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+     int              vdwioffset0;
+     _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+     int              vdwioffset1;
+     _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+     int              vdwioffset2;
+     _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+     int              vdwjidx0A,vdwjidx0B;
+     _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+     _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+     _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+     _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+     _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+     real             *charge;
+     int              nvdwtype;
+     _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+     int              *vdwtype;
+     real             *vdwparam;
+     _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+     _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+     _fjsp_v2r8       rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
+     real             rswitch_scalar,d_scalar;
+     _fjsp_v2r8       itab_tmp;
+     _fjsp_v2r8       dummy_mask,cutoff_mask;
+     _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+     _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+     union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+     x                = xx[0];
+     f                = ff[0];
+     nri              = nlist->nri;
+     iinr             = nlist->iinr;
+     jindex           = nlist->jindex;
+     jjnr             = nlist->jjnr;
+     shiftidx         = nlist->shift;
+     gid              = nlist->gid;
+     shiftvec         = fr->shift_vec[0];
+     fshift           = fr->fshift[0];
+     facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+     charge           = mdatoms->chargeA;
+     krf              = gmx_fjsp_set1_v2r8(fr->ic->k_rf);
+     krf2             = gmx_fjsp_set1_v2r8(fr->ic->k_rf*2.0);
+     crf              = gmx_fjsp_set1_v2r8(fr->ic->c_rf);
+     nvdwtype         = fr->ntype;
+     vdwparam         = fr->nbfp;
+     vdwtype          = mdatoms->typeA;
+     /* Setup water-specific parameters.
+      * The three site charges (pre-multiplied by facel) and the VdW parameter offset of site 0
+      * are read once, from the first i entry (iinr[0]), and reused for every i entry below.
+      * NOTE(review): presumably all i molecules in this list share charges and VdW type, and
+      * the list is non-empty when this runs - confirm with the caller.
+      */
+     inr              = nlist->iinr[0];
+     iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+0]));
+     iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+     iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+     vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+     /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+     rcutoff_scalar   = fr->rcoulomb;
+     rcutoff          = gmx_fjsp_set1_v2r8(rcutoff_scalar);
+     rcutoff2         = _fjsp_mul_v2r8(rcutoff,rcutoff);
+     rswitch_scalar   = fr->rvdw_switch;
+     rswitch          = gmx_fjsp_set1_v2r8(rswitch_scalar);
+     /* Setup switch parameters.
+      * d_scalar is the width of the switching region (cutoff minus switch distance).
+      * swV3..swV5 are the coefficients used to build sw, and swF2..swF4 those used to build
+      * dsw, in the interaction blocks below.
+      * NOTE(review): every coefficient divides by a power of d_scalar, so
+      * rvdw_switch == rcoulomb would divide by zero here - confirm this is excluded upstream.
+      */
+     d_scalar         = rcutoff_scalar-rswitch_scalar;
+     d                = gmx_fjsp_set1_v2r8(d_scalar);
+     swV3             = gmx_fjsp_set1_v2r8(-10.0/(d_scalar*d_scalar*d_scalar));
+     swV4             = gmx_fjsp_set1_v2r8( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
+     swV5             = gmx_fjsp_set1_v2r8( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
+     swF2             = gmx_fjsp_set1_v2r8(-30.0/(d_scalar*d_scalar*d_scalar));
+     swF3             = gmx_fjsp_set1_v2r8( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
+     swF4             = gmx_fjsp_set1_v2r8(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
+     /* Give these a defined value before the loops to avoid compiler warnings */
+     jnrA = jnrB = 0;
+     j_coord_offsetA = 0;
+     j_coord_offsetB = 0;
+     outeriter        = 0;
+     inneriter        = 0;
+     /* Start outer loop over neighborlists */
+     for(iidx=0; iidx<nri; iidx++)
+     {
+         /* Load shift vector for this list */
+         i_shift_offset   = DIM*shiftidx[iidx];
+         /* Load limits for loop over neighbors */
+         j_index_start    = jindex[iidx];
+         j_index_end      = jindex[iidx+1];
+         /* Get outer coordinate index */
+         inr              = iinr[iidx];
+         i_coord_offset   = DIM*inr;
+         /* Load i particle coords and add shift vector */
+         gmx_fjsp_load_shift_and_3rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                  &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+         fix0             = _fjsp_setzero_v2r8();
+         fiy0             = _fjsp_setzero_v2r8();
+         fiz0             = _fjsp_setzero_v2r8();
+         fix1             = _fjsp_setzero_v2r8();
+         fiy1             = _fjsp_setzero_v2r8();
+         fiz1             = _fjsp_setzero_v2r8();
+         fix2             = _fjsp_setzero_v2r8();
+         fiy2             = _fjsp_setzero_v2r8();
+         fiz2             = _fjsp_setzero_v2r8();
+         /* Reset potential sums */
+         velecsum         = _fjsp_setzero_v2r8();
+         vvdwsum          = _fjsp_setzero_v2r8();
+         /* Start inner kernel loop.
+          * Two j particles (A and B) are processed per iteration; when the number of
+          * neighbors is odd, the last one is handled by the block following this loop.
+          */
+         for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+         {
+             /* Get j neighbor index, and coordinate index */
+             jnrA             = jjnr[jidx];
+             jnrB             = jjnr[jidx+1];
+             j_coord_offsetA  = DIM*jnrA;
+             j_coord_offsetB  = DIM*jnrB;
+             /* load j atom coordinates */
+             gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                               &jx0,&jy0,&jz0);
+             /* Calculate displacement vector */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             dx10             = _fjsp_sub_v2r8(ix1,jx0);
+             dy10             = _fjsp_sub_v2r8(iy1,jy0);
+             dz10             = _fjsp_sub_v2r8(iz1,jz0);
+             dx20             = _fjsp_sub_v2r8(ix2,jx0);
+             dy20             = _fjsp_sub_v2r8(iy2,jy0);
+             dz20             = _fjsp_sub_v2r8(iz2,jz0);
+             /* Calculate squared distance and things based on it */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+             rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+             rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+             rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+             rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+             rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+             rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+             rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+             /* Load parameters for j particles */
+             jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+             vdwjidx0A        = 2*vdwtype[jnrA+0];
+             vdwjidx0B        = 2*vdwtype[jnrB+0];
+             fjx0             = _fjsp_setzero_v2r8();
+             fjy0             = _fjsp_setzero_v2r8();
+             fjz0             = _fjsp_setzero_v2r8();
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+             {
+             r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+             /* Compute parameters for interactions between i and j atoms */
+             qq00             = _fjsp_mul_v2r8(iq0,jq0);
+             gmx_fjsp_load_2pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,
+                                          vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
+             /* REACTION-FIELD ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq00,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq00,rinv00),crf));
+             felec            = _fjsp_mul_v2r8(qq00,_fjsp_msub_v2r8(rinv00,rinvsq00,krf2));
+             /* LENNARD-JONES DISPERSION/REPULSION */
+             rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+             vvdw6            = _fjsp_mul_v2r8(c6_00,rinvsix);
+             vvdw12           = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
+             vvdw             = _fjsp_msub_v2r8( vvdw12,one_twelfth, _fjsp_mul_v2r8(vvdw6,one_sixth) );
+             fvdw             = _fjsp_mul_v2r8(_fjsp_sub_v2r8(vvdw12,vvdw6),rinvsq00);
+             d                = _fjsp_sub_v2r8(r00,rswitch);
+             d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+             d2               = _fjsp_mul_v2r8(d,d);
+             sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+             dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+             /* Evaluate switch function */
+             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+             fvdw             = _fjsp_msub_v2r8( fvdw,sw , _fjsp_mul_v2r8(rinv00,_fjsp_mul_v2r8(vvdw,dsw)) );
+             vvdw             = _fjsp_mul_v2r8(vvdw,sw);
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             vvdw             = _fjsp_and_v2r8(vvdw,cutoff_mask);
+             vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+             fscal            = _fjsp_add_v2r8(felec,fvdw);
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             
+             fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq10,rcutoff2))
+             {
+             /* Compute parameters for interactions between i and j atoms */
+             qq10             = _fjsp_mul_v2r8(iq1,jq0);
+             /* REACTION-FIELD ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq10,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq10,rinv10),crf));
+             felec            = _fjsp_mul_v2r8(qq10,_fjsp_msub_v2r8(rinv10,rinvsq10,krf2));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq10,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+             
+             fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq20,rcutoff2))
+             {
+             /* Compute parameters for interactions between i and j atoms */
+             qq20             = _fjsp_mul_v2r8(iq2,jq0);
+             /* REACTION-FIELD ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq20,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq20,rinv20),crf));
+             felec            = _fjsp_mul_v2r8(qq20,_fjsp_msub_v2r8(rinv20,rinvsq20,krf2));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq20,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+             
+             fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+             }
+             gmx_fjsp_decrement_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0);
+             /* Inner loop uses 154 flops */
+         }
+         if(jidx<j_index_end)
+         {
+             jnrA             = jjnr[jidx];
+             j_coord_offsetA  = DIM*jnrA;
+             /* Remainder: a single j particle (A) is left over when the neighbor count is odd.
+              * load j atom coordinates */
+             gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                               &jx0,&jy0,&jz0);
+             /* Calculate displacement vector */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             dx10             = _fjsp_sub_v2r8(ix1,jx0);
+             dy10             = _fjsp_sub_v2r8(iy1,jy0);
+             dz10             = _fjsp_sub_v2r8(iz1,jz0);
+             dx20             = _fjsp_sub_v2r8(ix2,jx0);
+             dy20             = _fjsp_sub_v2r8(iy2,jy0);
+             dz20             = _fjsp_sub_v2r8(iz2,jz0);
+             /* Calculate squared distance and things based on it */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+             rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+             rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+             rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+             rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+             rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+             rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+             rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+             /* Load parameters for j particles */
+             jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
+             vdwjidx0A        = 2*vdwtype[jnrA+0];
+             fjx0             = _fjsp_setzero_v2r8();
+             fjy0             = _fjsp_setzero_v2r8();
+             fjz0             = _fjsp_setzero_v2r8();
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+             {
+             r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+             /* Compute parameters for interactions between i and j atoms */
+             qq00             = _fjsp_mul_v2r8(iq0,jq0);
+             gmx_fjsp_load_1pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
+             /* REACTION-FIELD ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq00,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq00,rinv00),crf));
+             felec            = _fjsp_mul_v2r8(qq00,_fjsp_msub_v2r8(rinv00,rinvsq00,krf2));
+             /* LENNARD-JONES DISPERSION/REPULSION */
+             rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+             vvdw6            = _fjsp_mul_v2r8(c6_00,rinvsix);
+             vvdw12           = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
+             vvdw             = _fjsp_msub_v2r8( vvdw12,one_twelfth, _fjsp_mul_v2r8(vvdw6,one_sixth) );
+             fvdw             = _fjsp_mul_v2r8(_fjsp_sub_v2r8(vvdw12,vvdw6),rinvsq00);
+             d                = _fjsp_sub_v2r8(r00,rswitch);
+             d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+             d2               = _fjsp_mul_v2r8(d,d);
+             sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+             dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+             /* Evaluate switch function */
+             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+             fvdw             = _fjsp_msub_v2r8( fvdw,sw , _fjsp_mul_v2r8(rinv00,_fjsp_mul_v2r8(vvdw,dsw)) );
+             vvdw             = _fjsp_mul_v2r8(vvdw,sw);
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom.
+              * Unlike the unrolled loop, each result is also combined with a zero vector through
+              * _fjsp_unpacklo_v2r8 before it is accumulated - presumably so that the SIMD position
+              * that carries no j particle contributes nothing (confirm against the intrinsic docs).
+              */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             vvdw             = _fjsp_and_v2r8(vvdw,cutoff_mask);
+             vvdw             = _fjsp_unpacklo_v2r8(vvdw,_fjsp_setzero_v2r8());
+             vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+             fscal            = _fjsp_add_v2r8(felec,fvdw);
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             
+             fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq10,rcutoff2))
+             {
+             /* Compute parameters for interactions between i and j atoms */
+             qq10             = _fjsp_mul_v2r8(iq1,jq0);
+             /* REACTION-FIELD ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq10,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq10,rinv10),crf));
+             felec            = _fjsp_mul_v2r8(qq10,_fjsp_msub_v2r8(rinv10,rinvsq10,krf2));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq10,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+             
+             fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq20,rcutoff2))
+             {
+             /* Compute parameters for interactions between i and j atoms */
+             qq20             = _fjsp_mul_v2r8(iq2,jq0);
+             /* REACTION-FIELD ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq20,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq20,rinv20),crf));
+             felec            = _fjsp_mul_v2r8(qq20,_fjsp_msub_v2r8(rinv20,rinvsq20,krf2));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq20,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+             
+             fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+             }
+             gmx_fjsp_decrement_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0);
+             /* Inner loop uses 154 flops */
+         }
+         /* End of innermost loop */
+         gmx_fjsp_update_iforce_3atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                               f+i_coord_offset,fshift+i_shift_offset);
+         ggid                        = gid[iidx];
+         /* Update potential energies: the sums for this i entry go to slot ggid of each energy-group array */
+         gmx_fjsp_update_1pot_v2r8(velecsum,kernel_data->energygrp_elec+ggid);
+         gmx_fjsp_update_1pot_v2r8(vvdwsum,kernel_data->energygrp_vdw+ggid);
+         /* Increment number of inner iterations */
+         inneriter                  += j_index_end - j_index_start;
+         /* Outer loop uses 20 flops */
+     }
+     /* Increment number of outer iterations */
+     outeriter        += nri;
+     /* Update outer/inner flops */
+     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3_VF,outeriter*20 + inneriter*154);
+ }
+ /*
+  * Gromacs nonbonded kernel:   nb_kernel_ElecRFCut_VdwLJSw_GeomW3P1_F_sparc64_hpc_ace_double
+  * Electrostatics interaction: ReactionField
+  * VdW interaction:            LennardJones
+  * Geometry:                   Water3-Particle
+  * Calculate force/pot:        Force
+  */
+ void
+ nb_kernel_ElecRFCut_VdwLJSw_GeomW3P1_F_sparc64_hpc_ace_double
+                     (t_nblist * gmx_restrict                nlist,
+                      rvec * gmx_restrict                    xx,
+                      rvec * gmx_restrict                    ff,
+                      t_forcerec * gmx_restrict              fr,
+                      t_mdatoms * gmx_restrict               mdatoms,
+                      nb_kernel_data_t * gmx_restrict        kernel_data,
+                      t_nrnb * gmx_restrict                  nrnb)
+ {
+     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+      * just 0 for non-waters.
+      * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+      * jnr indices corresponding to data put in the four positions in the SIMD register.
+      */
+     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+     int              jnrA,jnrB;
+     int              j_coord_offsetA,j_coord_offsetB;
+     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+     real             rcutoff_scalar;
+     real             *shiftvec,*fshift,*x,*f;
+     _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+     int              vdwioffset0;
+     _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+     int              vdwioffset1;
+     _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+     int              vdwioffset2;
+     _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+     int              vdwjidx0A,vdwjidx0B;
+     _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+     _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+     _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+     _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+     _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+     real             *charge;
+     int              nvdwtype;
+     _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+     int              *vdwtype;
+     real             *vdwparam;
+     _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+     _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+     _fjsp_v2r8       rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
+     real             rswitch_scalar,d_scalar;
+     _fjsp_v2r8       itab_tmp;
+     _fjsp_v2r8       dummy_mask,cutoff_mask;
+     _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+     _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+     union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+     x                = xx[0];
+     f                = ff[0];
+     nri              = nlist->nri;
+     iinr             = nlist->iinr;
+     jindex           = nlist->jindex;
+     jjnr             = nlist->jjnr;
+     shiftidx         = nlist->shift;
+     gid              = nlist->gid;
+     shiftvec         = fr->shift_vec[0];
+     fshift           = fr->fshift[0];
+     facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+     charge           = mdatoms->chargeA;
+     krf              = gmx_fjsp_set1_v2r8(fr->ic->k_rf);
+     krf2             = gmx_fjsp_set1_v2r8(fr->ic->k_rf*2.0);
+     crf              = gmx_fjsp_set1_v2r8(fr->ic->c_rf);
+     nvdwtype         = fr->ntype;
+     vdwparam         = fr->nbfp;
+     vdwtype          = mdatoms->typeA;
+     /* Setup water-specific parameters */
+     inr              = nlist->iinr[0];
+     iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+0]));
+     iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+     iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+     vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+     /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+     rcutoff_scalar   = fr->rcoulomb;
+     rcutoff          = gmx_fjsp_set1_v2r8(rcutoff_scalar);
+     rcutoff2         = _fjsp_mul_v2r8(rcutoff,rcutoff);
+     rswitch_scalar   = fr->rvdw_switch;
+     rswitch          = gmx_fjsp_set1_v2r8(rswitch_scalar);
+     /* Setup switch parameters */
+     d_scalar         = rcutoff_scalar-rswitch_scalar;
+     d                = gmx_fjsp_set1_v2r8(d_scalar);
+     swV3             = gmx_fjsp_set1_v2r8(-10.0/(d_scalar*d_scalar*d_scalar));
+     swV4             = gmx_fjsp_set1_v2r8( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
+     swV5             = gmx_fjsp_set1_v2r8( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
+     swF2             = gmx_fjsp_set1_v2r8(-30.0/(d_scalar*d_scalar*d_scalar));
+     swF3             = gmx_fjsp_set1_v2r8( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
+     swF4             = gmx_fjsp_set1_v2r8(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
+     /* Avoid stupid compiler warnings */
+     jnrA = jnrB = 0;
+     j_coord_offsetA = 0;
+     j_coord_offsetB = 0;
+     outeriter        = 0;
+     inneriter        = 0;
+     /* Start outer loop over neighborlists */
+     for(iidx=0; iidx<nri; iidx++)
+     {
+         /* Load shift vector for this list */
+         i_shift_offset   = DIM*shiftidx[iidx];
+         /* Load limits for loop over neighbors */
+         j_index_start    = jindex[iidx];
+         j_index_end      = jindex[iidx+1];
+         /* Get outer coordinate index */
+         inr              = iinr[iidx];
+         i_coord_offset   = DIM*inr;
+         /* Load i particle coords and add shift vector */
+         gmx_fjsp_load_shift_and_3rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                  &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+         fix0             = _fjsp_setzero_v2r8();
+         fiy0             = _fjsp_setzero_v2r8();
+         fiz0             = _fjsp_setzero_v2r8();
+         fix1             = _fjsp_setzero_v2r8();
+         fiy1             = _fjsp_setzero_v2r8();
+         fiz1             = _fjsp_setzero_v2r8();
+         fix2             = _fjsp_setzero_v2r8();
+         fiy2             = _fjsp_setzero_v2r8();
+         fiz2             = _fjsp_setzero_v2r8();
+         /* Start inner kernel loop */
+         for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+         {
+             /* Get j neighbor index, and coordinate index */
+             jnrA             = jjnr[jidx];
+             jnrB             = jjnr[jidx+1];
+             j_coord_offsetA  = DIM*jnrA;
+             j_coord_offsetB  = DIM*jnrB;
+             /* load j atom coordinates */
+             gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                               &jx0,&jy0,&jz0);
+             /* Calculate displacement vector */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             dx10             = _fjsp_sub_v2r8(ix1,jx0);
+             dy10             = _fjsp_sub_v2r8(iy1,jy0);
+             dz10             = _fjsp_sub_v2r8(iz1,jz0);
+             dx20             = _fjsp_sub_v2r8(ix2,jx0);
+             dy20             = _fjsp_sub_v2r8(iy2,jy0);
+             dz20             = _fjsp_sub_v2r8(iz2,jz0);
+             /* Calculate squared distance and things based on it */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+             rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+             rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+             rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+             rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+             rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+             rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+             rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+             /* Load parameters for j particles */
+             jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+             vdwjidx0A        = 2*vdwtype[jnrA+0];
+             vdwjidx0B        = 2*vdwtype[jnrB+0];
+             fjx0             = _fjsp_setzero_v2r8();
+             fjy0             = _fjsp_setzero_v2r8();
+             fjz0             = _fjsp_setzero_v2r8();
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+             {
+             r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+             /* Compute parameters for interactions between i and j atoms */
+             qq00             = _fjsp_mul_v2r8(iq0,jq0);
+             gmx_fjsp_load_2pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,
+                                          vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
+             /* REACTION-FIELD ELECTROSTATICS */
+             felec            = _fjsp_mul_v2r8(qq00,_fjsp_msub_v2r8(rinv00,rinvsq00,krf2));
+             /* LENNARD-JONES DISPERSION/REPULSION */
+             rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+             vvdw6            = _fjsp_mul_v2r8(c6_00,rinvsix);
+             vvdw12           = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
+             vvdw             = _fjsp_msub_v2r8( vvdw12,one_twelfth, _fjsp_mul_v2r8(vvdw6,one_sixth) );
+             fvdw             = _fjsp_mul_v2r8(_fjsp_sub_v2r8(vvdw12,vvdw6),rinvsq00);
+             d                = _fjsp_sub_v2r8(r00,rswitch);
+             d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+             d2               = _fjsp_mul_v2r8(d,d);
+             sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+             dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+             /* Evaluate switch function */
+             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+             fvdw             = _fjsp_msub_v2r8( fvdw,sw , _fjsp_mul_v2r8(rinv00,_fjsp_mul_v2r8(vvdw,dsw)) );
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+             fscal            = _fjsp_add_v2r8(felec,fvdw);
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             
+             fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq10,rcutoff2))
+             {
+             /* Compute parameters for interactions between i and j atoms */
+             qq10             = _fjsp_mul_v2r8(iq1,jq0);
+             /* REACTION-FIELD ELECTROSTATICS */
+             felec            = _fjsp_mul_v2r8(qq10,_fjsp_msub_v2r8(rinv10,rinvsq10,krf2));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq10,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+             
+             fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq20,rcutoff2))
+             {
+             /* Compute parameters for interactions between i and j atoms */
+             qq20             = _fjsp_mul_v2r8(iq2,jq0);
+             /* REACTION-FIELD ELECTROSTATICS */
+             felec            = _fjsp_mul_v2r8(qq20,_fjsp_msub_v2r8(rinv20,rinvsq20,krf2));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq20,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+             
+             fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+             }
+             gmx_fjsp_decrement_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0);
+             /* Inner loop uses 133 flops */
+         }
+         if(jidx<j_index_end)
+         {
+             jnrA             = jjnr[jidx];
+             j_coord_offsetA  = DIM*jnrA;
+             /* load j atom coordinates */
+             gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                               &jx0,&jy0,&jz0);
+             /* Calculate displacement vector */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             dx10             = _fjsp_sub_v2r8(ix1,jx0);
+             dy10             = _fjsp_sub_v2r8(iy1,jy0);
+             dz10             = _fjsp_sub_v2r8(iz1,jz0);
+             dx20             = _fjsp_sub_v2r8(ix2,jx0);
+             dy20             = _fjsp_sub_v2r8(iy2,jy0);
+             dz20             = _fjsp_sub_v2r8(iz2,jz0);
+             /* Calculate squared distance and things based on it */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+             rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+             rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+             rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+             rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+             rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+             rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+             rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+             /* Load parameters for j particles */
+             jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
+             vdwjidx0A        = 2*vdwtype[jnrA+0];
+             fjx0             = _fjsp_setzero_v2r8();
+             fjy0             = _fjsp_setzero_v2r8();
+             fjz0             = _fjsp_setzero_v2r8();
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+             {
+             r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+             /* Compute parameters for interactions between i and j atoms */
+             qq00             = _fjsp_mul_v2r8(iq0,jq0);
+             gmx_fjsp_load_1pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
+             /* REACTION-FIELD ELECTROSTATICS */
+             felec            = _fjsp_mul_v2r8(qq00,_fjsp_msub_v2r8(rinv00,rinvsq00,krf2));
+             /* LENNARD-JONES DISPERSION/REPULSION */
+             rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+             vvdw6            = _fjsp_mul_v2r8(c6_00,rinvsix);
+             vvdw12           = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
+             vvdw             = _fjsp_msub_v2r8( vvdw12,one_twelfth, _fjsp_mul_v2r8(vvdw6,one_sixth) );
+             fvdw             = _fjsp_mul_v2r8(_fjsp_sub_v2r8(vvdw12,vvdw6),rinvsq00);
+             d                = _fjsp_sub_v2r8(r00,rswitch);
+             d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+             d2               = _fjsp_mul_v2r8(d,d);
+             sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+             dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+             /* Evaluate switch function */
+             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+             fvdw             = _fjsp_msub_v2r8( fvdw,sw , _fjsp_mul_v2r8(rinv00,_fjsp_mul_v2r8(vvdw,dsw)) );
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+             fscal            = _fjsp_add_v2r8(felec,fvdw);
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             
+             fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq10,rcutoff2))
+             {
+             /* Compute parameters for interactions between i and j atoms */
+             qq10             = _fjsp_mul_v2r8(iq1,jq0);
+             /* REACTION-FIELD ELECTROSTATICS */
+             felec            = _fjsp_mul_v2r8(qq10,_fjsp_msub_v2r8(rinv10,rinvsq10,krf2));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq10,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+             
+             fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq20,rcutoff2))
+             {
+             /* Compute parameters for interactions between i and j atoms */
+             qq20             = _fjsp_mul_v2r8(iq2,jq0);
+             /* REACTION-FIELD ELECTROSTATICS */
+             felec            = _fjsp_mul_v2r8(qq20,_fjsp_msub_v2r8(rinv20,rinvsq20,krf2));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq20,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+             
+             fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+             }
+             gmx_fjsp_decrement_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0);
+             /* Inner loop uses 133 flops */
+         }
+         /* End of innermost loop */
+         gmx_fjsp_update_iforce_3atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                               f+i_coord_offset,fshift+i_shift_offset);
+         /* Increment number of inner iterations */
+         inneriter                  += j_index_end - j_index_start;
+         /* Outer loop uses 18 flops */
+     }
+     /* Increment number of outer iterations */
+     outeriter        += nri;
+     /* Update outer/inner flops */
+     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3_F,outeriter*18 + inneriter*133);
+ }
index 0000000000000000000000000000000000000000,9a66a46b5572ad2b394d539d4f26836310fb0273..9a66a46b5572ad2b394d539d4f26836310fb0273
mode 000000,100644..100644
--- /dev/null
@@@ -1,0 -1,1963 +1,1963 @@@
+ /*
+  * This file is part of the GROMACS molecular simulation package.
+  *
+  * Copyright (c) 2012, by the GROMACS development team, led by
+  * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+  * others, as listed in the AUTHORS file in the top-level source
+  * directory and at http://www.gromacs.org.
+  *
+  * GROMACS is free software; you can redistribute it and/or
+  * modify it under the terms of the GNU Lesser General Public License
+  * as published by the Free Software Foundation; either version 2.1
+  * of the License, or (at your option) any later version.
+  *
+  * GROMACS is distributed in the hope that it will be useful,
+  * but WITHOUT ANY WARRANTY; without even the implied warranty of
+  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  * Lesser General Public License for more details.
+  *
+  * You should have received a copy of the GNU Lesser General Public
+  * License along with GROMACS; if not, see
+  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+  *
+  * If you want to redistribute modifications to GROMACS, please
+  * consider that scientific software is very special. Version
+  * control is crucial - bugs must be traceable. We will be happy to
+  * consider code for inclusion in the official distribution, but
+  * derived work must not be called official GROMACS. Details are found
+  * in the README & COPYING files - if they are missing, get the
+  * official version at http://www.gromacs.org.
+  *
+  * To help us fund GROMACS development, we humbly ask that you cite
+  * the research papers on the package. Check out http://www.gromacs.org.
+  */
+ /*
+  * Note: this file was generated by the GROMACS sparc64_hpc_ace_double kernel generator.
+  */
+ #ifdef HAVE_CONFIG_H
+ #include <config.h>
+ #endif
+ #include <math.h>
+ #include "../nb_kernel.h"
+ #include "types/simple.h"
+ #include "vec.h"
+ #include "nrnb.h"
+ #include "kernelutil_sparc64_hpc_ace_double.h"
+ /*
+  * Gromacs nonbonded kernel:   nb_kernel_ElecRFCut_VdwLJSw_GeomW3W3_VF_sparc64_hpc_ace_double
+  * Electrostatics interaction: ReactionField
+  * VdW interaction:            LennardJones
+  * Geometry:                   Water3-Water3
+  * Calculate force/pot:        PotentialAndForce
+  */
+ void
+ nb_kernel_ElecRFCut_VdwLJSw_GeomW3W3_VF_sparc64_hpc_ace_double
+                     (t_nblist * gmx_restrict                nlist,
+                      rvec * gmx_restrict                    xx,
+                      rvec * gmx_restrict                    ff,
+                      t_forcerec * gmx_restrict              fr,
+                      t_mdatoms * gmx_restrict               mdatoms,
+                      nb_kernel_data_t * gmx_restrict        kernel_data,
+                      t_nrnb * gmx_restrict                  nrnb)
+ {
+     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+      * just 0 for non-waters.
+      * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+      * jnr indices corresponding to data put in the two positions in the SIMD register.
+      */
+     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+     int              jnrA,jnrB;
+     int              j_coord_offsetA,j_coord_offsetB;
+     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+     real             rcutoff_scalar;
+     real             *shiftvec,*fshift,*x,*f;
+     _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+     int              vdwioffset0;
+     _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+     int              vdwioffset1;
+     _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+     int              vdwioffset2;
+     _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+     int              vdwjidx0A,vdwjidx0B;
+     _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+     int              vdwjidx1A,vdwjidx1B;
+     _fjsp_v2r8       jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+     int              vdwjidx2A,vdwjidx2B;
+     _fjsp_v2r8       jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+     _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+     _fjsp_v2r8       dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
+     _fjsp_v2r8       dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
+     _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+     _fjsp_v2r8       dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+     _fjsp_v2r8       dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+     _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+     _fjsp_v2r8       dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+     _fjsp_v2r8       dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+     _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+     real             *charge;
+     int              nvdwtype;
+     _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+     int              *vdwtype;
+     real             *vdwparam;
+     _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+     _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+     _fjsp_v2r8       rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
+     real             rswitch_scalar,d_scalar;
+     _fjsp_v2r8       itab_tmp;
+     _fjsp_v2r8       dummy_mask,cutoff_mask;
+     _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+     _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+     union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+     x                = xx[0];
+     f                = ff[0];
+     nri              = nlist->nri;
+     iinr             = nlist->iinr;
+     jindex           = nlist->jindex;
+     jjnr             = nlist->jjnr;
+     shiftidx         = nlist->shift;
+     gid              = nlist->gid;
+     shiftvec         = fr->shift_vec[0];
+     fshift           = fr->fshift[0];
+     facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+     charge           = mdatoms->chargeA;
+     krf              = gmx_fjsp_set1_v2r8(fr->ic->k_rf);
+     krf2             = gmx_fjsp_set1_v2r8(fr->ic->k_rf*2.0);
+     crf              = gmx_fjsp_set1_v2r8(fr->ic->c_rf);
+     nvdwtype         = fr->ntype;
+     vdwparam         = fr->nbfp;
+     vdwtype          = mdatoms->typeA;
+     /* Setup water-specific parameters */
+     inr              = nlist->iinr[0];
+     iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+0]));
+     iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+     iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+     vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+     jq0              = gmx_fjsp_set1_v2r8(charge[inr+0]);
+     jq1              = gmx_fjsp_set1_v2r8(charge[inr+1]);
+     jq2              = gmx_fjsp_set1_v2r8(charge[inr+2]);
+     vdwjidx0A        = 2*vdwtype[inr+0];
+     qq00             = _fjsp_mul_v2r8(iq0,jq0);
+     c6_00            = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset0+vdwjidx0A]);
+     c12_00           = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset0+vdwjidx0A+1]);
+     qq01             = _fjsp_mul_v2r8(iq0,jq1);
+     qq02             = _fjsp_mul_v2r8(iq0,jq2);
+     qq10             = _fjsp_mul_v2r8(iq1,jq0);
+     qq11             = _fjsp_mul_v2r8(iq1,jq1);
+     qq12             = _fjsp_mul_v2r8(iq1,jq2);
+     qq20             = _fjsp_mul_v2r8(iq2,jq0);
+     qq21             = _fjsp_mul_v2r8(iq2,jq1);
+     qq22             = _fjsp_mul_v2r8(iq2,jq2);
+     /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+     rcutoff_scalar   = fr->rcoulomb;
+     rcutoff          = gmx_fjsp_set1_v2r8(rcutoff_scalar);
+     rcutoff2         = _fjsp_mul_v2r8(rcutoff,rcutoff);
+     rswitch_scalar   = fr->rvdw_switch;
+     rswitch          = gmx_fjsp_set1_v2r8(rswitch_scalar);
+     /* Setup switch parameters */
+     d_scalar         = rcutoff_scalar-rswitch_scalar;
+     d                = gmx_fjsp_set1_v2r8(d_scalar);
+     swV3             = gmx_fjsp_set1_v2r8(-10.0/(d_scalar*d_scalar*d_scalar));
+     swV4             = gmx_fjsp_set1_v2r8( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
+     swV5             = gmx_fjsp_set1_v2r8( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
+     swF2             = gmx_fjsp_set1_v2r8(-30.0/(d_scalar*d_scalar*d_scalar));
+     swF3             = gmx_fjsp_set1_v2r8( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
+     swF4             = gmx_fjsp_set1_v2r8(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
+     /* Avoid stupid compiler warnings */
+     jnrA = jnrB = 0;
+     j_coord_offsetA = 0;
+     j_coord_offsetB = 0;
+     outeriter        = 0;
+     inneriter        = 0;
+     /* Start outer loop over neighborlists */
+     for(iidx=0; iidx<nri; iidx++)
+     {
+         /* Load shift vector for this list */
+         i_shift_offset   = DIM*shiftidx[iidx];
+         /* Load limits for loop over neighbors */
+         j_index_start    = jindex[iidx];
+         j_index_end      = jindex[iidx+1];
+         /* Get outer coordinate index */
+         inr              = iinr[iidx];
+         i_coord_offset   = DIM*inr;
+         /* Load i particle coords and add shift vector */
+         gmx_fjsp_load_shift_and_3rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                  &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+         fix0             = _fjsp_setzero_v2r8();
+         fiy0             = _fjsp_setzero_v2r8();
+         fiz0             = _fjsp_setzero_v2r8();
+         fix1             = _fjsp_setzero_v2r8();
+         fiy1             = _fjsp_setzero_v2r8();
+         fiz1             = _fjsp_setzero_v2r8();
+         fix2             = _fjsp_setzero_v2r8();
+         fiy2             = _fjsp_setzero_v2r8();
+         fiz2             = _fjsp_setzero_v2r8();
+         /* Reset potential sums */
+         velecsum         = _fjsp_setzero_v2r8();
+         vvdwsum          = _fjsp_setzero_v2r8();
+         /* Start inner kernel loop */
+         for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+         {
+             /* Get j neighbor index, and coordinate index */
+             jnrA             = jjnr[jidx];
+             jnrB             = jjnr[jidx+1];
+             j_coord_offsetA  = DIM*jnrA;
+             j_coord_offsetB  = DIM*jnrB;
+             /* load j atom coordinates */
+             gmx_fjsp_load_3rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+             /* Calculate displacement vector */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             dx01             = _fjsp_sub_v2r8(ix0,jx1);
+             dy01             = _fjsp_sub_v2r8(iy0,jy1);
+             dz01             = _fjsp_sub_v2r8(iz0,jz1);
+             dx02             = _fjsp_sub_v2r8(ix0,jx2);
+             dy02             = _fjsp_sub_v2r8(iy0,jy2);
+             dz02             = _fjsp_sub_v2r8(iz0,jz2);
+             dx10             = _fjsp_sub_v2r8(ix1,jx0);
+             dy10             = _fjsp_sub_v2r8(iy1,jy0);
+             dz10             = _fjsp_sub_v2r8(iz1,jz0);
+             dx11             = _fjsp_sub_v2r8(ix1,jx1);
+             dy11             = _fjsp_sub_v2r8(iy1,jy1);
+             dz11             = _fjsp_sub_v2r8(iz1,jz1);
+             dx12             = _fjsp_sub_v2r8(ix1,jx2);
+             dy12             = _fjsp_sub_v2r8(iy1,jy2);
+             dz12             = _fjsp_sub_v2r8(iz1,jz2);
+             dx20             = _fjsp_sub_v2r8(ix2,jx0);
+             dy20             = _fjsp_sub_v2r8(iy2,jy0);
+             dz20             = _fjsp_sub_v2r8(iz2,jz0);
+             dx21             = _fjsp_sub_v2r8(ix2,jx1);
+             dy21             = _fjsp_sub_v2r8(iy2,jy1);
+             dz21             = _fjsp_sub_v2r8(iz2,jz1);
+             dx22             = _fjsp_sub_v2r8(ix2,jx2);
+             dy22             = _fjsp_sub_v2r8(iy2,jy2);
+             dz22             = _fjsp_sub_v2r8(iz2,jz2);
+             /* Calculate squared distance and things based on it */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rsq01            = gmx_fjsp_calc_rsq_v2r8(dx01,dy01,dz01);
+             rsq02            = gmx_fjsp_calc_rsq_v2r8(dx02,dy02,dz02);
+             rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+             rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+             rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+             rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+             rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+             rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+             rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+             rinv01           = gmx_fjsp_invsqrt_v2r8(rsq01);
+             rinv02           = gmx_fjsp_invsqrt_v2r8(rsq02);
+             rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+             rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+             rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+             rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+             rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+             rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+             rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+             rinvsq01         = _fjsp_mul_v2r8(rinv01,rinv01);
+             rinvsq02         = _fjsp_mul_v2r8(rinv02,rinv02);
+             rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+             rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+             rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+             rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+             rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+             rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+             fjx0             = _fjsp_setzero_v2r8();
+             fjy0             = _fjsp_setzero_v2r8();
+             fjz0             = _fjsp_setzero_v2r8();
+             fjx1             = _fjsp_setzero_v2r8();
+             fjy1             = _fjsp_setzero_v2r8();
+             fjz1             = _fjsp_setzero_v2r8();
+             fjx2             = _fjsp_setzero_v2r8();
+             fjy2             = _fjsp_setzero_v2r8();
+             fjz2             = _fjsp_setzero_v2r8();
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+             {
+             r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+             /* REACTION-FIELD ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq00,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq00,rinv00),crf));
+             felec            = _fjsp_mul_v2r8(qq00,_fjsp_msub_v2r8(rinv00,rinvsq00,krf2));
+             /* LENNARD-JONES DISPERSION/REPULSION */
+             rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+             vvdw6            = _fjsp_mul_v2r8(c6_00,rinvsix);
+             vvdw12           = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
+             vvdw             = _fjsp_msub_v2r8( vvdw12,one_twelfth, _fjsp_mul_v2r8(vvdw6,one_sixth) );
+             fvdw             = _fjsp_mul_v2r8(_fjsp_sub_v2r8(vvdw12,vvdw6),rinvsq00);
+             d                = _fjsp_sub_v2r8(r00,rswitch);
+             d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+             d2               = _fjsp_mul_v2r8(d,d);
+             sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+             dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+             /* Evaluate switch function */
+             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+             fvdw             = _fjsp_msub_v2r8( fvdw,sw , _fjsp_mul_v2r8(rinv00,_fjsp_mul_v2r8(vvdw,dsw)) );
+             vvdw             = _fjsp_mul_v2r8(vvdw,sw);
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             vvdw             = _fjsp_and_v2r8(vvdw,cutoff_mask);
+             vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+             fscal            = _fjsp_add_v2r8(felec,fvdw);
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             
+             fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq01,rcutoff2))
+             {
+             /* REACTION-FIELD ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq01,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq01,rinv01),crf));
+             felec            = _fjsp_mul_v2r8(qq01,_fjsp_msub_v2r8(rinv01,rinvsq01,krf2));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq01,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx01,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy01,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz01,fscal,fiz0);
+             
+             fjx1             = _fjsp_madd_v2r8(dx01,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy01,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz01,fscal,fjz1);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq02,rcutoff2))
+             {
+             /* REACTION-FIELD ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq02,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq02,rinv02),crf));
+             felec            = _fjsp_mul_v2r8(qq02,_fjsp_msub_v2r8(rinv02,rinvsq02,krf2));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq02,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx02,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy02,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz02,fscal,fiz0);
+             
+             fjx2             = _fjsp_madd_v2r8(dx02,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy02,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz02,fscal,fjz2);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq10,rcutoff2))
+             {
+             /* REACTION-FIELD ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq10,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq10,rinv10),crf));
+             felec            = _fjsp_mul_v2r8(qq10,_fjsp_msub_v2r8(rinv10,rinvsq10,krf2));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq10,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+             
+             fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq11,rcutoff2))
+             {
+             /* REACTION-FIELD ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq11,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq11,rinv11),crf));
+             felec            = _fjsp_mul_v2r8(qq11,_fjsp_msub_v2r8(rinv11,rinvsq11,krf2));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq11,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+             
+             fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq12,rcutoff2))
+             {
+             /* REACTION-FIELD ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq12,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq12,rinv12),crf));
+             felec            = _fjsp_mul_v2r8(qq12,_fjsp_msub_v2r8(rinv12,rinvsq12,krf2));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq12,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+             
+             fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq20,rcutoff2))
+             {
+             /* REACTION-FIELD ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq20,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq20,rinv20),crf));
+             felec            = _fjsp_mul_v2r8(qq20,_fjsp_msub_v2r8(rinv20,rinvsq20,krf2));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq20,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+             
+             fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq21,rcutoff2))
+             {
+             /* REACTION-FIELD ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq21,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq21,rinv21),crf));
+             felec            = _fjsp_mul_v2r8(qq21,_fjsp_msub_v2r8(rinv21,rinvsq21,krf2));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq21,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+             
+             fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq22,rcutoff2))
+             {
+             /* REACTION-FIELD ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq22,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq22,rinv22),crf));
+             felec            = _fjsp_mul_v2r8(qq22,_fjsp_msub_v2r8(rinv22,rinvsq22,krf2));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq22,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+             
+             fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+             }
+             gmx_fjsp_decrement_3rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+             /* Inner loop uses 385 flops */
+         }
+         if(jidx<j_index_end)
+         {
+             jnrA             = jjnr[jidx];
+             j_coord_offsetA  = DIM*jnrA;
+             /* load j atom coordinates */
+             gmx_fjsp_load_3rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+             /* Calculate displacement vector */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             dx01             = _fjsp_sub_v2r8(ix0,jx1);
+             dy01             = _fjsp_sub_v2r8(iy0,jy1);
+             dz01             = _fjsp_sub_v2r8(iz0,jz1);
+             dx02             = _fjsp_sub_v2r8(ix0,jx2);
+             dy02             = _fjsp_sub_v2r8(iy0,jy2);
+             dz02             = _fjsp_sub_v2r8(iz0,jz2);
+             dx10             = _fjsp_sub_v2r8(ix1,jx0);
+             dy10             = _fjsp_sub_v2r8(iy1,jy0);
+             dz10             = _fjsp_sub_v2r8(iz1,jz0);
+             dx11             = _fjsp_sub_v2r8(ix1,jx1);
+             dy11             = _fjsp_sub_v2r8(iy1,jy1);
+             dz11             = _fjsp_sub_v2r8(iz1,jz1);
+             dx12             = _fjsp_sub_v2r8(ix1,jx2);
+             dy12             = _fjsp_sub_v2r8(iy1,jy2);
+             dz12             = _fjsp_sub_v2r8(iz1,jz2);
+             dx20             = _fjsp_sub_v2r8(ix2,jx0);
+             dy20             = _fjsp_sub_v2r8(iy2,jy0);
+             dz20             = _fjsp_sub_v2r8(iz2,jz0);
+             dx21             = _fjsp_sub_v2r8(ix2,jx1);
+             dy21             = _fjsp_sub_v2r8(iy2,jy1);
+             dz21             = _fjsp_sub_v2r8(iz2,jz1);
+             dx22             = _fjsp_sub_v2r8(ix2,jx2);
+             dy22             = _fjsp_sub_v2r8(iy2,jy2);
+             dz22             = _fjsp_sub_v2r8(iz2,jz2);
+             /* Calculate squared distance and things based on it */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rsq01            = gmx_fjsp_calc_rsq_v2r8(dx01,dy01,dz01);
+             rsq02            = gmx_fjsp_calc_rsq_v2r8(dx02,dy02,dz02);
+             rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+             rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+             rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+             rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+             rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+             rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+             rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+             rinv01           = gmx_fjsp_invsqrt_v2r8(rsq01);
+             rinv02           = gmx_fjsp_invsqrt_v2r8(rsq02);
+             rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+             rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+             rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+             rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+             rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+             rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+             rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+             rinvsq01         = _fjsp_mul_v2r8(rinv01,rinv01);
+             rinvsq02         = _fjsp_mul_v2r8(rinv02,rinv02);
+             rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+             rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+             rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+             rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+             rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+             rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+             fjx0             = _fjsp_setzero_v2r8();
+             fjy0             = _fjsp_setzero_v2r8();
+             fjz0             = _fjsp_setzero_v2r8();
+             fjx1             = _fjsp_setzero_v2r8();
+             fjy1             = _fjsp_setzero_v2r8();
+             fjz1             = _fjsp_setzero_v2r8();
+             fjx2             = _fjsp_setzero_v2r8();
+             fjy2             = _fjsp_setzero_v2r8();
+             fjz2             = _fjsp_setzero_v2r8();
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+             {
+             r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+             /* REACTION-FIELD ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq00,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq00,rinv00),crf));
+             felec            = _fjsp_mul_v2r8(qq00,_fjsp_msub_v2r8(rinv00,rinvsq00,krf2));
+             /* LENNARD-JONES DISPERSION/REPULSION */
+             rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+             vvdw6            = _fjsp_mul_v2r8(c6_00,rinvsix);
+             vvdw12           = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
+             vvdw             = _fjsp_msub_v2r8( vvdw12,one_twelfth, _fjsp_mul_v2r8(vvdw6,one_sixth) );
+             fvdw             = _fjsp_mul_v2r8(_fjsp_sub_v2r8(vvdw12,vvdw6),rinvsq00);
+             d                = _fjsp_sub_v2r8(r00,rswitch);
+             d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+             d2               = _fjsp_mul_v2r8(d,d);
+             sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+             dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+             /* Evaluate switch function */
+             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+             fvdw             = _fjsp_msub_v2r8( fvdw,sw , _fjsp_mul_v2r8(rinv00,_fjsp_mul_v2r8(vvdw,dsw)) );
+             vvdw             = _fjsp_mul_v2r8(vvdw,sw);
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             vvdw             = _fjsp_and_v2r8(vvdw,cutoff_mask);
+             vvdw             = _fjsp_unpacklo_v2r8(vvdw,_fjsp_setzero_v2r8());
+             vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+             fscal            = _fjsp_add_v2r8(felec,fvdw);
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             
+             fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq01,rcutoff2))
+             {
+             /* REACTION-FIELD ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq01,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq01,rinv01),crf));
+             felec            = _fjsp_mul_v2r8(qq01,_fjsp_msub_v2r8(rinv01,rinvsq01,krf2));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq01,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx01,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy01,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz01,fscal,fiz0);
+             
+             fjx1             = _fjsp_madd_v2r8(dx01,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy01,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz01,fscal,fjz1);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq02,rcutoff2))
+             {
+             /* REACTION-FIELD ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq02,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq02,rinv02),crf));
+             felec            = _fjsp_mul_v2r8(qq02,_fjsp_msub_v2r8(rinv02,rinvsq02,krf2));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq02,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx02,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy02,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz02,fscal,fiz0);
+             
+             fjx2             = _fjsp_madd_v2r8(dx02,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy02,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz02,fscal,fjz2);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq10,rcutoff2))
+             {
+             /* REACTION-FIELD ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq10,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq10,rinv10),crf));
+             felec            = _fjsp_mul_v2r8(qq10,_fjsp_msub_v2r8(rinv10,rinvsq10,krf2));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq10,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+             
+             fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq11,rcutoff2))
+             {
+             /* REACTION-FIELD ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq11,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq11,rinv11),crf));
+             felec            = _fjsp_mul_v2r8(qq11,_fjsp_msub_v2r8(rinv11,rinvsq11,krf2));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq11,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+             
+             fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq12,rcutoff2))
+             {
+             /* REACTION-FIELD ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq12,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq12,rinv12),crf));
+             felec            = _fjsp_mul_v2r8(qq12,_fjsp_msub_v2r8(rinv12,rinvsq12,krf2));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq12,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+             
+             fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq20,rcutoff2))
+             {
+             /* REACTION-FIELD ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq20,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq20,rinv20),crf));
+             felec            = _fjsp_mul_v2r8(qq20,_fjsp_msub_v2r8(rinv20,rinvsq20,krf2));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq20,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+             
+             fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq21,rcutoff2))
+             {
+             /* REACTION-FIELD ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq21,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq21,rinv21),crf));
+             felec            = _fjsp_mul_v2r8(qq21,_fjsp_msub_v2r8(rinv21,rinvsq21,krf2));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq21,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+             
+             fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq22,rcutoff2))
+             {
+             /* REACTION-FIELD ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq22,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq22,rinv22),crf));
+             felec            = _fjsp_mul_v2r8(qq22,_fjsp_msub_v2r8(rinv22,rinvsq22,krf2));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq22,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+             
+             fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+             }
+             gmx_fjsp_decrement_3rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+             /* Inner loop uses 385 flops */
+         }
+         /* End of innermost loop */
+         gmx_fjsp_update_iforce_3atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                               f+i_coord_offset,fshift+i_shift_offset);
+         ggid                        = gid[iidx];
+         /* Update potential energies */
+         gmx_fjsp_update_1pot_v2r8(velecsum,kernel_data->energygrp_elec+ggid);
+         gmx_fjsp_update_1pot_v2r8(vvdwsum,kernel_data->energygrp_vdw+ggid);
+         /* Increment number of inner iterations */
+         inneriter                  += j_index_end - j_index_start;
+         /* Outer loop uses 20 flops */
+     }
+     /* Increment number of outer iterations */
+     outeriter        += nri;
+     /* Update outer/inner flops */
+     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_VF,outeriter*20 + inneriter*385);
+ }
+ /*
+  * Gromacs nonbonded kernel:   nb_kernel_ElecRFCut_VdwLJSw_GeomW3W3_F_sparc64_hpc_ace_double
+  * Electrostatics interaction: ReactionField
+  * VdW interaction:            LennardJones
+  * Geometry:                   Water3-Water3
+  * Calculate force/pot:        Force
+  */
+ void
+ nb_kernel_ElecRFCut_VdwLJSw_GeomW3W3_F_sparc64_hpc_ace_double
+                     (t_nblist * gmx_restrict                nlist,
+                      rvec * gmx_restrict                    xx,
+                      rvec * gmx_restrict                    ff,
+                      t_forcerec * gmx_restrict              fr,
+                      t_mdatoms * gmx_restrict               mdatoms,
+                      nb_kernel_data_t * gmx_restrict        kernel_data,
+                      t_nrnb * gmx_restrict                  nrnb)
+ {
+     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+      * just 0 for non-waters.
+      * Suffixes A,B refer to j loop unrolling done with double precision SIMD, i.e. the two different
+      * jnr indices corresponding to data put in the two positions in the SIMD register.
+      */
+     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+     int              jnrA,jnrB;
+     int              j_coord_offsetA,j_coord_offsetB;
+     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+     real             rcutoff_scalar;
+     real             *shiftvec,*fshift,*x,*f;
+     _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+     int              vdwioffset0;
+     _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+     int              vdwioffset1;
+     _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+     int              vdwioffset2;
+     _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+     int              vdwjidx0A,vdwjidx0B;
+     _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+     int              vdwjidx1A,vdwjidx1B;
+     _fjsp_v2r8       jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+     int              vdwjidx2A,vdwjidx2B;
+     _fjsp_v2r8       jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+     _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+     _fjsp_v2r8       dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
+     _fjsp_v2r8       dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
+     _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+     _fjsp_v2r8       dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+     _fjsp_v2r8       dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+     _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+     _fjsp_v2r8       dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+     _fjsp_v2r8       dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+     _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+     real             *charge;
+     int              nvdwtype;
+     _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+     int              *vdwtype;
+     real             *vdwparam;
+     _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+     _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+     _fjsp_v2r8       rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
+     real             rswitch_scalar,d_scalar;
+     _fjsp_v2r8       itab_tmp;
+     _fjsp_v2r8       dummy_mask,cutoff_mask;
+     _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+     _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+     union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+     x                = xx[0];
+     f                = ff[0];
+     nri              = nlist->nri;
+     iinr             = nlist->iinr;
+     jindex           = nlist->jindex;
+     jjnr             = nlist->jjnr;
+     shiftidx         = nlist->shift;
+     gid              = nlist->gid;
+     shiftvec         = fr->shift_vec[0];
+     fshift           = fr->fshift[0];
+     facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+     charge           = mdatoms->chargeA;
+     krf              = gmx_fjsp_set1_v2r8(fr->ic->k_rf);
+     krf2             = gmx_fjsp_set1_v2r8(fr->ic->k_rf*2.0);
+     crf              = gmx_fjsp_set1_v2r8(fr->ic->c_rf);
+     nvdwtype         = fr->ntype;
+     vdwparam         = fr->nbfp;
+     vdwtype          = mdatoms->typeA;
+     /* Setup water-specific parameters */
+     inr              = nlist->iinr[0];
+     iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+0]));
+     iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+     iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+     vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+     jq0              = gmx_fjsp_set1_v2r8(charge[inr+0]);
+     jq1              = gmx_fjsp_set1_v2r8(charge[inr+1]);
+     jq2              = gmx_fjsp_set1_v2r8(charge[inr+2]);
+     vdwjidx0A        = 2*vdwtype[inr+0];
+     qq00             = _fjsp_mul_v2r8(iq0,jq0);
+     c6_00            = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset0+vdwjidx0A]);
+     c12_00           = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset0+vdwjidx0A+1]);
+     qq01             = _fjsp_mul_v2r8(iq0,jq1);
+     qq02             = _fjsp_mul_v2r8(iq0,jq2);
+     qq10             = _fjsp_mul_v2r8(iq1,jq0);
+     qq11             = _fjsp_mul_v2r8(iq1,jq1);
+     qq12             = _fjsp_mul_v2r8(iq1,jq2);
+     qq20             = _fjsp_mul_v2r8(iq2,jq0);
+     qq21             = _fjsp_mul_v2r8(iq2,jq1);
+     qq22             = _fjsp_mul_v2r8(iq2,jq2);
+     /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+     rcutoff_scalar   = fr->rcoulomb;
+     rcutoff          = gmx_fjsp_set1_v2r8(rcutoff_scalar);
+     rcutoff2         = _fjsp_mul_v2r8(rcutoff,rcutoff);
+     rswitch_scalar   = fr->rvdw_switch;
+     rswitch          = gmx_fjsp_set1_v2r8(rswitch_scalar);
+     /* Setup switch parameters */
+     d_scalar         = rcutoff_scalar-rswitch_scalar;
+     d                = gmx_fjsp_set1_v2r8(d_scalar);
+     swV3             = gmx_fjsp_set1_v2r8(-10.0/(d_scalar*d_scalar*d_scalar));
+     swV4             = gmx_fjsp_set1_v2r8( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
+     swV5             = gmx_fjsp_set1_v2r8( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
+     swF2             = gmx_fjsp_set1_v2r8(-30.0/(d_scalar*d_scalar*d_scalar));
+     swF3             = gmx_fjsp_set1_v2r8( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
+     swF4             = gmx_fjsp_set1_v2r8(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
+     /* Avoid stupid compiler warnings */
+     jnrA = jnrB = 0;
+     j_coord_offsetA = 0;
+     j_coord_offsetB = 0;
+     outeriter        = 0;
+     inneriter        = 0;
+     /* Start outer loop over neighborlists */
+     for(iidx=0; iidx<nri; iidx++)
+     {
+         /* Load shift vector for this list */
+         i_shift_offset   = DIM*shiftidx[iidx];
+         /* Load limits for loop over neighbors */
+         j_index_start    = jindex[iidx];
+         j_index_end      = jindex[iidx+1];
+         /* Get outer coordinate index */
+         inr              = iinr[iidx];
+         i_coord_offset   = DIM*inr;
+         /* Load i particle coords and add shift vector */
+         gmx_fjsp_load_shift_and_3rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                  &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+         fix0             = _fjsp_setzero_v2r8();
+         fiy0             = _fjsp_setzero_v2r8();
+         fiz0             = _fjsp_setzero_v2r8();
+         fix1             = _fjsp_setzero_v2r8();
+         fiy1             = _fjsp_setzero_v2r8();
+         fiz1             = _fjsp_setzero_v2r8();
+         fix2             = _fjsp_setzero_v2r8();
+         fiy2             = _fjsp_setzero_v2r8();
+         fiz2             = _fjsp_setzero_v2r8();
+         /* Start inner kernel loop */
+         for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+         {
+             /* Get j neighbor index, and coordinate index */
+             jnrA             = jjnr[jidx];
+             jnrB             = jjnr[jidx+1];
+             j_coord_offsetA  = DIM*jnrA;
+             j_coord_offsetB  = DIM*jnrB;
+             /* load j atom coordinates */
+             gmx_fjsp_load_3rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+             /* Calculate displacement vector */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             dx01             = _fjsp_sub_v2r8(ix0,jx1);
+             dy01             = _fjsp_sub_v2r8(iy0,jy1);
+             dz01             = _fjsp_sub_v2r8(iz0,jz1);
+             dx02             = _fjsp_sub_v2r8(ix0,jx2);
+             dy02             = _fjsp_sub_v2r8(iy0,jy2);
+             dz02             = _fjsp_sub_v2r8(iz0,jz2);
+             dx10             = _fjsp_sub_v2r8(ix1,jx0);
+             dy10             = _fjsp_sub_v2r8(iy1,jy0);
+             dz10             = _fjsp_sub_v2r8(iz1,jz0);
+             dx11             = _fjsp_sub_v2r8(ix1,jx1);
+             dy11             = _fjsp_sub_v2r8(iy1,jy1);
+             dz11             = _fjsp_sub_v2r8(iz1,jz1);
+             dx12             = _fjsp_sub_v2r8(ix1,jx2);
+             dy12             = _fjsp_sub_v2r8(iy1,jy2);
+             dz12             = _fjsp_sub_v2r8(iz1,jz2);
+             dx20             = _fjsp_sub_v2r8(ix2,jx0);
+             dy20             = _fjsp_sub_v2r8(iy2,jy0);
+             dz20             = _fjsp_sub_v2r8(iz2,jz0);
+             dx21             = _fjsp_sub_v2r8(ix2,jx1);
+             dy21             = _fjsp_sub_v2r8(iy2,jy1);
+             dz21             = _fjsp_sub_v2r8(iz2,jz1);
+             dx22             = _fjsp_sub_v2r8(ix2,jx2);
+             dy22             = _fjsp_sub_v2r8(iy2,jy2);
+             dz22             = _fjsp_sub_v2r8(iz2,jz2);
+             /* Calculate squared distance and things based on it */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rsq01            = gmx_fjsp_calc_rsq_v2r8(dx01,dy01,dz01);
+             rsq02            = gmx_fjsp_calc_rsq_v2r8(dx02,dy02,dz02);
+             rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+             rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+             rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+             rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+             rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+             rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+             rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+             rinv01           = gmx_fjsp_invsqrt_v2r8(rsq01);
+             rinv02           = gmx_fjsp_invsqrt_v2r8(rsq02);
+             rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+             rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+             rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+             rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+             rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+             rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+             rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+             rinvsq01         = _fjsp_mul_v2r8(rinv01,rinv01);
+             rinvsq02         = _fjsp_mul_v2r8(rinv02,rinv02);
+             rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+             rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+             rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+             rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+             rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+             rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+             fjx0             = _fjsp_setzero_v2r8();
+             fjy0             = _fjsp_setzero_v2r8();
+             fjz0             = _fjsp_setzero_v2r8();
+             fjx1             = _fjsp_setzero_v2r8();
+             fjy1             = _fjsp_setzero_v2r8();
+             fjz1             = _fjsp_setzero_v2r8();
+             fjx2             = _fjsp_setzero_v2r8();
+             fjy2             = _fjsp_setzero_v2r8();
+             fjz2             = _fjsp_setzero_v2r8();
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+             {
+             r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+             /* REACTION-FIELD ELECTROSTATICS */
+             felec            = _fjsp_mul_v2r8(qq00,_fjsp_msub_v2r8(rinv00,rinvsq00,krf2));
+             /* LENNARD-JONES DISPERSION/REPULSION */
+             rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+             vvdw6            = _fjsp_mul_v2r8(c6_00,rinvsix);
+             vvdw12           = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
+             vvdw             = _fjsp_msub_v2r8( vvdw12,one_twelfth, _fjsp_mul_v2r8(vvdw6,one_sixth) );
+             fvdw             = _fjsp_mul_v2r8(_fjsp_sub_v2r8(vvdw12,vvdw6),rinvsq00);
+             d                = _fjsp_sub_v2r8(r00,rswitch);
+             d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+             d2               = _fjsp_mul_v2r8(d,d);
+             sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+             dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+             /* Evaluate switch function */
+             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+             fvdw             = _fjsp_msub_v2r8( fvdw,sw , _fjsp_mul_v2r8(rinv00,_fjsp_mul_v2r8(vvdw,dsw)) );
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+             fscal            = _fjsp_add_v2r8(felec,fvdw);
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             
+             fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq01,rcutoff2))
+             {
+             /* REACTION-FIELD ELECTROSTATICS */
+             felec            = _fjsp_mul_v2r8(qq01,_fjsp_msub_v2r8(rinv01,rinvsq01,krf2));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq01,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx01,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy01,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz01,fscal,fiz0);
+             
+             fjx1             = _fjsp_madd_v2r8(dx01,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy01,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz01,fscal,fjz1);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq02,rcutoff2))
+             {
+             /* REACTION-FIELD ELECTROSTATICS */
+             felec            = _fjsp_mul_v2r8(qq02,_fjsp_msub_v2r8(rinv02,rinvsq02,krf2));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq02,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx02,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy02,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz02,fscal,fiz0);
+             
+             fjx2             = _fjsp_madd_v2r8(dx02,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy02,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz02,fscal,fjz2);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq10,rcutoff2))
+             {
+             /* REACTION-FIELD ELECTROSTATICS */
+             felec            = _fjsp_mul_v2r8(qq10,_fjsp_msub_v2r8(rinv10,rinvsq10,krf2));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq10,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+             
+             fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq11,rcutoff2))
+             {
+             /* REACTION-FIELD ELECTROSTATICS */
+             felec            = _fjsp_mul_v2r8(qq11,_fjsp_msub_v2r8(rinv11,rinvsq11,krf2));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq11,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+             
+             fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq12,rcutoff2))
+             {
+             /* REACTION-FIELD ELECTROSTATICS */
+             felec            = _fjsp_mul_v2r8(qq12,_fjsp_msub_v2r8(rinv12,rinvsq12,krf2));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq12,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+             
+             fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq20,rcutoff2))
+             {
+             /* REACTION-FIELD ELECTROSTATICS */
+             felec            = _fjsp_mul_v2r8(qq20,_fjsp_msub_v2r8(rinv20,rinvsq20,krf2));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq20,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+             
+             fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq21,rcutoff2))
+             {
+             /* REACTION-FIELD ELECTROSTATICS */
+             felec            = _fjsp_mul_v2r8(qq21,_fjsp_msub_v2r8(rinv21,rinvsq21,krf2));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq21,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+             
+             fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq22,rcutoff2))
+             {
+             /* REACTION-FIELD ELECTROSTATICS */
+             felec            = _fjsp_mul_v2r8(qq22,_fjsp_msub_v2r8(rinv22,rinvsq22,krf2));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq22,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+             
+             fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+             }
+             gmx_fjsp_decrement_3rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+             /* Inner loop uses 328 flops */
+         }
+         if(jidx<j_index_end)
+         {
+             jnrA             = jjnr[jidx];
+             j_coord_offsetA  = DIM*jnrA;
+             /* load j atom coordinates */
+             gmx_fjsp_load_3rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+             /* Calculate displacement vector */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             dx01             = _fjsp_sub_v2r8(ix0,jx1);
+             dy01             = _fjsp_sub_v2r8(iy0,jy1);
+             dz01             = _fjsp_sub_v2r8(iz0,jz1);
+             dx02             = _fjsp_sub_v2r8(ix0,jx2);
+             dy02             = _fjsp_sub_v2r8(iy0,jy2);
+             dz02             = _fjsp_sub_v2r8(iz0,jz2);
+             dx10             = _fjsp_sub_v2r8(ix1,jx0);
+             dy10             = _fjsp_sub_v2r8(iy1,jy0);
+             dz10             = _fjsp_sub_v2r8(iz1,jz0);
+             dx11             = _fjsp_sub_v2r8(ix1,jx1);
+             dy11             = _fjsp_sub_v2r8(iy1,jy1);
+             dz11             = _fjsp_sub_v2r8(iz1,jz1);
+             dx12             = _fjsp_sub_v2r8(ix1,jx2);
+             dy12             = _fjsp_sub_v2r8(iy1,jy2);
+             dz12             = _fjsp_sub_v2r8(iz1,jz2);
+             dx20             = _fjsp_sub_v2r8(ix2,jx0);
+             dy20             = _fjsp_sub_v2r8(iy2,jy0);
+             dz20             = _fjsp_sub_v2r8(iz2,jz0);
+             dx21             = _fjsp_sub_v2r8(ix2,jx1);
+             dy21             = _fjsp_sub_v2r8(iy2,jy1);
+             dz21             = _fjsp_sub_v2r8(iz2,jz1);
+             dx22             = _fjsp_sub_v2r8(ix2,jx2);
+             dy22             = _fjsp_sub_v2r8(iy2,jy2);
+             dz22             = _fjsp_sub_v2r8(iz2,jz2);
+             /* Calculate squared distance and things based on it */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rsq01            = gmx_fjsp_calc_rsq_v2r8(dx01,dy01,dz01);
+             rsq02            = gmx_fjsp_calc_rsq_v2r8(dx02,dy02,dz02);
+             rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+             rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+             rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+             rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+             rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+             rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+             rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+             rinv01           = gmx_fjsp_invsqrt_v2r8(rsq01);
+             rinv02           = gmx_fjsp_invsqrt_v2r8(rsq02);
+             rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+             rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+             rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+             rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+             rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+             rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+             rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+             rinvsq01         = _fjsp_mul_v2r8(rinv01,rinv01);
+             rinvsq02         = _fjsp_mul_v2r8(rinv02,rinv02);
+             rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+             rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+             rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+             rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+             rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+             rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+             fjx0             = _fjsp_setzero_v2r8();
+             fjy0             = _fjsp_setzero_v2r8();
+             fjz0             = _fjsp_setzero_v2r8();
+             fjx1             = _fjsp_setzero_v2r8();
+             fjy1             = _fjsp_setzero_v2r8();
+             fjz1             = _fjsp_setzero_v2r8();
+             fjx2             = _fjsp_setzero_v2r8();
+             fjy2             = _fjsp_setzero_v2r8();
+             fjz2             = _fjsp_setzero_v2r8();
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+             {
+             r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+             /* REACTION-FIELD ELECTROSTATICS */
+             felec            = _fjsp_mul_v2r8(qq00,_fjsp_msub_v2r8(rinv00,rinvsq00,krf2));
+             /* LENNARD-JONES DISPERSION/REPULSION */
+             rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+             vvdw6            = _fjsp_mul_v2r8(c6_00,rinvsix);
+             vvdw12           = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
+             vvdw             = _fjsp_msub_v2r8( vvdw12,one_twelfth, _fjsp_mul_v2r8(vvdw6,one_sixth) );
+             fvdw             = _fjsp_mul_v2r8(_fjsp_sub_v2r8(vvdw12,vvdw6),rinvsq00);
+             d                = _fjsp_sub_v2r8(r00,rswitch);
+             d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+             d2               = _fjsp_mul_v2r8(d,d);
+             sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+             dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+             /* Evaluate switch function */
+             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+             fvdw             = _fjsp_msub_v2r8( fvdw,sw , _fjsp_mul_v2r8(rinv00,_fjsp_mul_v2r8(vvdw,dsw)) );
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+             fscal            = _fjsp_add_v2r8(felec,fvdw);
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             
+             fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq01,rcutoff2))
+             {
+             /* REACTION-FIELD ELECTROSTATICS */
+             felec            = _fjsp_mul_v2r8(qq01,_fjsp_msub_v2r8(rinv01,rinvsq01,krf2));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq01,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx01,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy01,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz01,fscal,fiz0);
+             
+             fjx1             = _fjsp_madd_v2r8(dx01,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy01,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz01,fscal,fjz1);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq02,rcutoff2))
+             {
+             /* REACTION-FIELD ELECTROSTATICS */
+             felec            = _fjsp_mul_v2r8(qq02,_fjsp_msub_v2r8(rinv02,rinvsq02,krf2));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq02,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx02,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy02,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz02,fscal,fiz0);
+             
+             fjx2             = _fjsp_madd_v2r8(dx02,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy02,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz02,fscal,fjz2);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq10,rcutoff2))
+             {
+             /* REACTION-FIELD ELECTROSTATICS */
+             felec            = _fjsp_mul_v2r8(qq10,_fjsp_msub_v2r8(rinv10,rinvsq10,krf2));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq10,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+             
+             fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq11,rcutoff2))
+             {
+             /* REACTION-FIELD ELECTROSTATICS */
+             felec            = _fjsp_mul_v2r8(qq11,_fjsp_msub_v2r8(rinv11,rinvsq11,krf2));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq11,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+             
+             fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq12,rcutoff2))
+             {
+             /* REACTION-FIELD ELECTROSTATICS */
+             felec            = _fjsp_mul_v2r8(qq12,_fjsp_msub_v2r8(rinv12,rinvsq12,krf2));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq12,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+             
+             fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq20,rcutoff2))
+             {
+             /* REACTION-FIELD ELECTROSTATICS */
+             felec            = _fjsp_mul_v2r8(qq20,_fjsp_msub_v2r8(rinv20,rinvsq20,krf2));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq20,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+             
+             fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq21,rcutoff2))
+             {
+             /* REACTION-FIELD ELECTROSTATICS */
+             felec            = _fjsp_mul_v2r8(qq21,_fjsp_msub_v2r8(rinv21,rinvsq21,krf2));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq21,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+             
+             fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq22,rcutoff2))
+             {
+             /* REACTION-FIELD ELECTROSTATICS */
+             felec            = _fjsp_mul_v2r8(qq22,_fjsp_msub_v2r8(rinv22,rinvsq22,krf2));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq22,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+             
+             fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+             }
+             gmx_fjsp_decrement_3rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+             /* Inner loop uses 328 flops */
+         }
+         /* End of innermost loop */
+         gmx_fjsp_update_iforce_3atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                               f+i_coord_offset,fshift+i_shift_offset);
+         /* Increment number of inner iterations */
+         inneriter                  += j_index_end - j_index_start;
+         /* Outer loop uses 18 flops */
+     }
+     /* Increment number of outer iterations */
+     outeriter        += nri;
+     /* Update outer/inner flops */
+     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_F,outeriter*18 + inneriter*328);
+ }
index 0000000000000000000000000000000000000000,7add7756bc63abd19db31ceb5531ca77bd9e6ba3..7add7756bc63abd19db31ceb5531ca77bd9e6ba3
mode 000000,100644..100644
--- /dev/null
@@@ -1,0 -1,1213 +1,1213 @@@
+ /*
+  * This file is part of the GROMACS molecular simulation package.
+  *
+  * Copyright (c) 2012, by the GROMACS development team, led by
+  * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+  * others, as listed in the AUTHORS file in the top-level source
+  * directory and at http://www.gromacs.org.
+  *
+  * GROMACS is free software; you can redistribute it and/or
+  * modify it under the terms of the GNU Lesser General Public License
+  * as published by the Free Software Foundation; either version 2.1
+  * of the License, or (at your option) any later version.
+  *
+  * GROMACS is distributed in the hope that it will be useful,
+  * but WITHOUT ANY WARRANTY; without even the implied warranty of
+  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  * Lesser General Public License for more details.
+  *
+  * You should have received a copy of the GNU Lesser General Public
+  * License along with GROMACS; if not, see
+  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+  *
+  * If you want to redistribute modifications to GROMACS, please
+  * consider that scientific software is very special. Version
+  * control is crucial - bugs must be traceable. We will be happy to
+  * consider code for inclusion in the official distribution, but
+  * derived work must not be called official GROMACS. Details are found
+  * in the README & COPYING files - if they are missing, get the
+  * official version at http://www.gromacs.org.
+  *
+  * To help us fund GROMACS development, we humbly ask that you cite
+  * the research papers on the package. Check out http://www.gromacs.org.
+  */
+ /*
+  * Note: this file was generated by the GROMACS sparc64_hpc_ace_double kernel generator.
+  */
+ #ifdef HAVE_CONFIG_H
+ #include <config.h>
+ #endif
+ #include <math.h>
+ #include "../nb_kernel.h"
+ #include "types/simple.h"
+ #include "vec.h"
+ #include "nrnb.h"
+ #include "kernelutil_sparc64_hpc_ace_double.h"
+ /*
+  * Gromacs nonbonded kernel:   nb_kernel_ElecRFCut_VdwLJSw_GeomW4P1_VF_sparc64_hpc_ace_double
+  * Electrostatics interaction: ReactionField
+  * VdW interaction:            LennardJones
+  * Geometry:                   Water4-Particle
+  * Calculate force/pot:        PotentialAndForce
+  */
+ void
+ nb_kernel_ElecRFCut_VdwLJSw_GeomW4P1_VF_sparc64_hpc_ace_double
+                     (t_nblist * gmx_restrict                nlist,
+                      rvec * gmx_restrict                    xx,
+                      rvec * gmx_restrict                    ff,
+                      t_forcerec * gmx_restrict              fr,
+                      t_mdatoms * gmx_restrict               mdatoms,
+                      nb_kernel_data_t * gmx_restrict        kernel_data,
+                      t_nrnb * gmx_restrict                  nrnb)
+ {
+     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+      * just 0 for non-waters.
+      * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+      * jnr indices corresponding to data put in the two positions in the SIMD register.
+      */
+     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+     int              jnrA,jnrB;
+     int              j_coord_offsetA,j_coord_offsetB;
+     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+     real             rcutoff_scalar;
+     real             *shiftvec,*fshift,*x,*f;
+     _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+     int              vdwioffset0;
+     _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+     int              vdwioffset1;
+     _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+     int              vdwioffset2;
+     _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+     int              vdwioffset3;
+     _fjsp_v2r8       ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+     int              vdwjidx0A,vdwjidx0B;
+     _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+     _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+     _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+     _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+     _fjsp_v2r8       dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30;
+     _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+     real             *charge;
+     int              nvdwtype;
+     _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+     int              *vdwtype;
+     real             *vdwparam;
+     _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+     _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+     _fjsp_v2r8       rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
+     real             rswitch_scalar,d_scalar;
+     _fjsp_v2r8       itab_tmp;
+     _fjsp_v2r8       dummy_mask,cutoff_mask;
+     _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+     _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+     union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+     x                = xx[0];
+     f                = ff[0];
+     nri              = nlist->nri;
+     iinr             = nlist->iinr;
+     jindex           = nlist->jindex;
+     jjnr             = nlist->jjnr;
+     shiftidx         = nlist->shift;
+     gid              = nlist->gid;
+     shiftvec         = fr->shift_vec[0];
+     fshift           = fr->fshift[0];
+     facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+     charge           = mdatoms->chargeA;
+     krf              = gmx_fjsp_set1_v2r8(fr->ic->k_rf);
+     krf2             = gmx_fjsp_set1_v2r8(fr->ic->k_rf*2.0);
+     crf              = gmx_fjsp_set1_v2r8(fr->ic->c_rf);
+     nvdwtype         = fr->ntype;
+     vdwparam         = fr->nbfp;
+     vdwtype          = mdatoms->typeA;
+     /* Setup water-specific parameters */
+     inr              = nlist->iinr[0];
+     iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+     iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+     iq3              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+3]));
+     vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+     /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+     rcutoff_scalar   = fr->rcoulomb;
+     rcutoff          = gmx_fjsp_set1_v2r8(rcutoff_scalar);
+     rcutoff2         = _fjsp_mul_v2r8(rcutoff,rcutoff);
+     rswitch_scalar   = fr->rvdw_switch;
+     rswitch          = gmx_fjsp_set1_v2r8(rswitch_scalar);
+     /* Setup switch parameters */
+     d_scalar         = rcutoff_scalar-rswitch_scalar;
+     d                = gmx_fjsp_set1_v2r8(d_scalar);
+     swV3             = gmx_fjsp_set1_v2r8(-10.0/(d_scalar*d_scalar*d_scalar));
+     swV4             = gmx_fjsp_set1_v2r8( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
+     swV5             = gmx_fjsp_set1_v2r8( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
+     swF2             = gmx_fjsp_set1_v2r8(-30.0/(d_scalar*d_scalar*d_scalar));
+     swF3             = gmx_fjsp_set1_v2r8( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
+     swF4             = gmx_fjsp_set1_v2r8(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
+     /* Avoid stupid compiler warnings */
+     jnrA = jnrB = 0;
+     j_coord_offsetA = 0;
+     j_coord_offsetB = 0;
+     outeriter        = 0;
+     inneriter        = 0;
+     /* Start outer loop over neighborlists */
+     for(iidx=0; iidx<nri; iidx++)
+     {
+         /* Load shift vector for this list */
+         i_shift_offset   = DIM*shiftidx[iidx];
+         /* Load limits for loop over neighbors */
+         j_index_start    = jindex[iidx];
+         j_index_end      = jindex[iidx+1];
+         /* Get outer coordinate index */
+         inr              = iinr[iidx];
+         i_coord_offset   = DIM*inr;
+         /* Load i particle coords and add shift vector */
+         gmx_fjsp_load_shift_and_4rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                  &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+         fix0             = _fjsp_setzero_v2r8();
+         fiy0             = _fjsp_setzero_v2r8();
+         fiz0             = _fjsp_setzero_v2r8();
+         fix1             = _fjsp_setzero_v2r8();
+         fiy1             = _fjsp_setzero_v2r8();
+         fiz1             = _fjsp_setzero_v2r8();
+         fix2             = _fjsp_setzero_v2r8();
+         fiy2             = _fjsp_setzero_v2r8();
+         fiz2             = _fjsp_setzero_v2r8();
+         fix3             = _fjsp_setzero_v2r8();
+         fiy3             = _fjsp_setzero_v2r8();
+         fiz3             = _fjsp_setzero_v2r8();
+         /* Reset potential sums */
+         velecsum         = _fjsp_setzero_v2r8();
+         vvdwsum          = _fjsp_setzero_v2r8();
+         /* Start inner kernel loop */
+         for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+         {
+             /* Get j neighbor index, and coordinate index */
+             jnrA             = jjnr[jidx];
+             jnrB             = jjnr[jidx+1];
+             j_coord_offsetA  = DIM*jnrA;
+             j_coord_offsetB  = DIM*jnrB;
+             /* load j atom coordinates */
+             gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                               &jx0,&jy0,&jz0);
+             /* Calculate displacement vector */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             dx10             = _fjsp_sub_v2r8(ix1,jx0);
+             dy10             = _fjsp_sub_v2r8(iy1,jy0);
+             dz10             = _fjsp_sub_v2r8(iz1,jz0);
+             dx20             = _fjsp_sub_v2r8(ix2,jx0);
+             dy20             = _fjsp_sub_v2r8(iy2,jy0);
+             dz20             = _fjsp_sub_v2r8(iz2,jz0);
+             dx30             = _fjsp_sub_v2r8(ix3,jx0);
+             dy30             = _fjsp_sub_v2r8(iy3,jy0);
+             dz30             = _fjsp_sub_v2r8(iz3,jz0);
+             /* Calculate squared distance and things based on it */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+             rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+             rsq30            = gmx_fjsp_calc_rsq_v2r8(dx30,dy30,dz30);
+             rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+             rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+             rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+             rinv30           = gmx_fjsp_invsqrt_v2r8(rsq30);
+             rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+             rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+             rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+             rinvsq30         = _fjsp_mul_v2r8(rinv30,rinv30);
+             /* Load parameters for j particles */
+             jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+             vdwjidx0A        = 2*vdwtype[jnrA+0];
+             vdwjidx0B        = 2*vdwtype[jnrB+0];
+             fjx0             = _fjsp_setzero_v2r8();
+             fjy0             = _fjsp_setzero_v2r8();
+             fjz0             = _fjsp_setzero_v2r8();
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+             {
+             r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+             /* Compute parameters for interactions between i and j atoms */
+             gmx_fjsp_load_2pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,
+                                          vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
+             /* LENNARD-JONES DISPERSION/REPULSION */
+             rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+             vvdw6            = _fjsp_mul_v2r8(c6_00,rinvsix);
+             vvdw12           = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
+             vvdw             = _fjsp_msub_v2r8( vvdw12,one_twelfth, _fjsp_mul_v2r8(vvdw6,one_sixth) );
+             fvdw             = _fjsp_mul_v2r8(_fjsp_sub_v2r8(vvdw12,vvdw6),rinvsq00);
+             d                = _fjsp_sub_v2r8(r00,rswitch);
+             d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+             d2               = _fjsp_mul_v2r8(d,d);
+             sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+             dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+             /* Evaluate switch function */
+             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+             fvdw             = _fjsp_msub_v2r8( fvdw,sw , _fjsp_mul_v2r8(rinv00,_fjsp_mul_v2r8(vvdw,dsw)) );
+             vvdw             = _fjsp_mul_v2r8(vvdw,sw);
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             vvdw             = _fjsp_and_v2r8(vvdw,cutoff_mask);
+             vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+             fscal            = fvdw;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             
+             fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq10,rcutoff2))
+             {
+             /* Compute parameters for interactions between i and j atoms */
+             qq10             = _fjsp_mul_v2r8(iq1,jq0);
+             /* REACTION-FIELD ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq10,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq10,rinv10),crf));
+             felec            = _fjsp_mul_v2r8(qq10,_fjsp_msub_v2r8(rinv10,rinvsq10,krf2));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq10,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+             
+             fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq20,rcutoff2))
+             {
+             /* Compute parameters for interactions between i and j atoms */
+             qq20             = _fjsp_mul_v2r8(iq2,jq0);
+             /* REACTION-FIELD ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq20,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq20,rinv20),crf));
+             felec            = _fjsp_mul_v2r8(qq20,_fjsp_msub_v2r8(rinv20,rinvsq20,krf2));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq20,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+             
+             fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq30,rcutoff2))
+             {
+             /* Compute parameters for interactions between i and j atoms */
+             qq30             = _fjsp_mul_v2r8(iq3,jq0);
+             /* REACTION-FIELD ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq30,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq30,rinv30),crf));
+             felec            = _fjsp_mul_v2r8(qq30,_fjsp_msub_v2r8(rinv30,rinvsq30,krf2));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq30,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix3             = _fjsp_madd_v2r8(dx30,fscal,fix3);
+             fiy3             = _fjsp_madd_v2r8(dy30,fscal,fiy3);
+             fiz3             = _fjsp_madd_v2r8(dz30,fscal,fiz3);
+             
+             fjx0             = _fjsp_madd_v2r8(dx30,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy30,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz30,fscal,fjz0);
+             }
+             gmx_fjsp_decrement_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0);
+             /* Inner loop uses 182 flops */
+         }
+         if(jidx<j_index_end)
+         {
+             jnrA             = jjnr[jidx];
+             j_coord_offsetA  = DIM*jnrA;
+             /* load j atom coordinates */
+             gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                               &jx0,&jy0,&jz0);
+             /* Calculate displacement vector */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             dx10             = _fjsp_sub_v2r8(ix1,jx0);
+             dy10             = _fjsp_sub_v2r8(iy1,jy0);
+             dz10             = _fjsp_sub_v2r8(iz1,jz0);
+             dx20             = _fjsp_sub_v2r8(ix2,jx0);
+             dy20             = _fjsp_sub_v2r8(iy2,jy0);
+             dz20             = _fjsp_sub_v2r8(iz2,jz0);
+             dx30             = _fjsp_sub_v2r8(ix3,jx0);
+             dy30             = _fjsp_sub_v2r8(iy3,jy0);
+             dz30             = _fjsp_sub_v2r8(iz3,jz0);
+             /* Calculate squared distance and things based on it */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+             rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+             rsq30            = gmx_fjsp_calc_rsq_v2r8(dx30,dy30,dz30);
+             rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+             rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+             rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+             rinv30           = gmx_fjsp_invsqrt_v2r8(rsq30);
+             rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+             rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+             rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+             rinvsq30         = _fjsp_mul_v2r8(rinv30,rinv30);
+             /* Load parameters for j particles */
+             jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
+             vdwjidx0A        = 2*vdwtype[jnrA+0];
+             fjx0             = _fjsp_setzero_v2r8();
+             fjy0             = _fjsp_setzero_v2r8();
+             fjz0             = _fjsp_setzero_v2r8();
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+             {
+             r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+             /* Compute parameters for interactions between i and j atoms */
+             gmx_fjsp_load_1pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
+             /* LENNARD-JONES DISPERSION/REPULSION */
+             rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+             vvdw6            = _fjsp_mul_v2r8(c6_00,rinvsix);
+             vvdw12           = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
+             vvdw             = _fjsp_msub_v2r8( vvdw12,one_twelfth, _fjsp_mul_v2r8(vvdw6,one_sixth) );
+             fvdw             = _fjsp_mul_v2r8(_fjsp_sub_v2r8(vvdw12,vvdw6),rinvsq00);
+             d                = _fjsp_sub_v2r8(r00,rswitch);
+             d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+             d2               = _fjsp_mul_v2r8(d,d);
+             sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+             dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+             /* Evaluate switch function */
+             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+             fvdw             = _fjsp_msub_v2r8( fvdw,sw , _fjsp_mul_v2r8(rinv00,_fjsp_mul_v2r8(vvdw,dsw)) );
+             vvdw             = _fjsp_mul_v2r8(vvdw,sw);
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             vvdw             = _fjsp_and_v2r8(vvdw,cutoff_mask);
+             vvdw             = _fjsp_unpacklo_v2r8(vvdw,_fjsp_setzero_v2r8());
+             vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+             fscal            = fvdw;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             
+             fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq10,rcutoff2))
+             {
+             /* Compute parameters for interactions between i and j atoms */
+             qq10             = _fjsp_mul_v2r8(iq1,jq0);
+             /* REACTION-FIELD ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq10,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq10,rinv10),crf));
+             felec            = _fjsp_mul_v2r8(qq10,_fjsp_msub_v2r8(rinv10,rinvsq10,krf2));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq10,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+             
+             fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq20,rcutoff2))
+             {
+             /* Compute parameters for interactions between i and j atoms */
+             qq20             = _fjsp_mul_v2r8(iq2,jq0);
+             /* REACTION-FIELD ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq20,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq20,rinv20),crf));
+             felec            = _fjsp_mul_v2r8(qq20,_fjsp_msub_v2r8(rinv20,rinvsq20,krf2));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq20,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+             
+             fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq30,rcutoff2))
+             {
+             /* Compute parameters for interactions between i and j atoms */
+             qq30             = _fjsp_mul_v2r8(iq3,jq0);
+             /* REACTION-FIELD ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq30,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq30,rinv30),crf));
+             felec            = _fjsp_mul_v2r8(qq30,_fjsp_msub_v2r8(rinv30,rinvsq30,krf2));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq30,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix3             = _fjsp_madd_v2r8(dx30,fscal,fix3);
+             fiy3             = _fjsp_madd_v2r8(dy30,fscal,fiy3);
+             fiz3             = _fjsp_madd_v2r8(dz30,fscal,fiz3);
+             
+             fjx0             = _fjsp_madd_v2r8(dx30,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy30,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz30,fscal,fjz0);
+             }
+             gmx_fjsp_decrement_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0);
+             /* Inner loop uses 182 flops */
+         }
+         /* End of innermost loop */
+         gmx_fjsp_update_iforce_4atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                               f+i_coord_offset,fshift+i_shift_offset);
+         ggid                        = gid[iidx];
+         /* Update potential energies */
+         gmx_fjsp_update_1pot_v2r8(velecsum,kernel_data->energygrp_elec+ggid);
+         gmx_fjsp_update_1pot_v2r8(vvdwsum,kernel_data->energygrp_vdw+ggid);
+         /* Increment number of inner iterations */
+         inneriter                  += j_index_end - j_index_start;
+         /* Outer loop uses 26 flops */
+     }
+     /* Increment number of outer iterations */
+     outeriter        += nri;
+     /* Update outer/inner flops */
+     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_VF,outeriter*26 + inneriter*182);
+ }
+ /*
+  * Gromacs nonbonded kernel:   nb_kernel_ElecRFCut_VdwLJSw_GeomW4P1_F_sparc64_hpc_ace_double
+  * Electrostatics interaction: ReactionField
+  * VdW interaction:            LennardJones
+  * Geometry:                   Water4-Particle
+  * Calculate force/pot:        Force
+  *
+  * Force-only kernel for a four-site i water interacting with single j particles.
+  *  - i atom 0 interacts with the j particle through Lennard-Jones only; the LJ
+  *    potential is multiplied by a switch function that depends on
+  *    max(r-rswitch,0), with rswitch taken from fr->rvdw_switch.
+  *  - i atoms 1,2,3 interact with the j particle through reaction-field
+  *    electrostatics only (their charges are pre-multiplied by fr->epsfac).
+  *  - Every pair force is masked with rsq < rcutoff^2, where rcutoff is fr->rcoulomb.
+  *
+  * Arguments as used in this function:
+  *   nlist       - neighbor list; nri, iinr, jindex, jjnr, shift and gid are read.
+  *   xx          - coordinates, accessed as a flat real array through xx[0].
+  *   ff          - forces, accessed as a flat real array through ff[0] and updated.
+  *   fr          - source of shift_vec, fshift, epsfac, ic->k_rf, ic->c_rf, ntype,
+  *                 nbfp, rcoulomb and rvdw_switch.
+  *   mdatoms     - source of chargeA and typeA.
+  *   kernel_data - not referenced in this force-only kernel.
+  *   nrnb        - flop accounting, updated once at the end through inc_nrnb().
+  */
+ void
+ nb_kernel_ElecRFCut_VdwLJSw_GeomW4P1_F_sparc64_hpc_ace_double
+                     (t_nblist * gmx_restrict                nlist,
+                      rvec * gmx_restrict                    xx,
+                      rvec * gmx_restrict                    ff,
+                      t_forcerec * gmx_restrict              fr,
+                      t_mdatoms * gmx_restrict               mdatoms,
+                      nb_kernel_data_t * gmx_restrict        kernel_data,
+                      t_nrnb * gmx_restrict                  nrnb)
+ {
+     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+      * just 0 for non-waters.
+      * Suffixes A,B refer to j loop unrolling done with double precision SIMD, i.e. the two different
+      * jnr indices corresponding to data put in the two positions in the SIMD register.
+      * Many of the locals declared below are never referenced in this particular kernel.
+      */
+     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+     int              jnrA,jnrB;
+     int              j_coord_offsetA,j_coord_offsetB;
+     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+     real             rcutoff_scalar;
+     real             *shiftvec,*fshift,*x,*f;
+     _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+     int              vdwioffset0;
+     _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+     int              vdwioffset1;
+     _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+     int              vdwioffset2;
+     _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+     int              vdwioffset3;
+     _fjsp_v2r8       ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+     int              vdwjidx0A,vdwjidx0B;
+     _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+     _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+     _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+     _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+     _fjsp_v2r8       dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30;
+     _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+     real             *charge;
+     int              nvdwtype;
+     _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+     int              *vdwtype;
+     real             *vdwparam;
+     _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+     _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+     _fjsp_v2r8       rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
+     real             rswitch_scalar,d_scalar;
+     _fjsp_v2r8       itab_tmp;
+     _fjsp_v2r8       dummy_mask,cutoff_mask;
+     _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+     _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+     union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+     x                = xx[0];
+     f                = ff[0];
+     nri              = nlist->nri;
+     iinr             = nlist->iinr;
+     jindex           = nlist->jindex;
+     jjnr             = nlist->jjnr;
+     shiftidx         = nlist->shift;
+     gid              = nlist->gid;
+     shiftvec         = fr->shift_vec[0];
+     fshift           = fr->fshift[0];
+     facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+     charge           = mdatoms->chargeA;
+     krf              = gmx_fjsp_set1_v2r8(fr->ic->k_rf);
+     krf2             = gmx_fjsp_set1_v2r8(fr->ic->k_rf*2.0);
+     crf              = gmx_fjsp_set1_v2r8(fr->ic->c_rf);
+     nvdwtype         = fr->ntype;
+     vdwparam         = fr->nbfp;
+     vdwtype          = mdatoms->typeA;
+     /* Setup water-specific parameters.
+      * These are read once, from the first i entry of the list (nlist->iinr[0]), and
+      * reused for every outer iteration; presumably all i waters in the list share
+      * the same charges and atom types (TODO confirm against the list setup code).
+      * Only atoms 1-3 get a charge (scaled by facel) and only atom 0 gets a VdW
+      * parameter offset, matching the interactions computed below.
+      */
+     inr              = nlist->iinr[0];
+     iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+     iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+     iq3              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+3]));
+     vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+     /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+     rcutoff_scalar   = fr->rcoulomb;
+     rcutoff          = gmx_fjsp_set1_v2r8(rcutoff_scalar);
+     rcutoff2         = _fjsp_mul_v2r8(rcutoff,rcutoff);
+     rswitch_scalar   = fr->rvdw_switch;
+     rswitch          = gmx_fjsp_set1_v2r8(rswitch_scalar);
+     /* Setup switch parameters.
+      * With D = rcutoff_scalar-rswitch_scalar the coefficients are
+      *   swV3 = -10/D^3, swV4 = 15/D^4, swV5 = -6/D^5   (switch value polynomial)
+      *   swF2 = -30/D^3, swF3 = 60/D^4, swF4 = -30/D^5  (its derivative)
+      * The SIMD value assigned to d here is not read: d is reassigned from r00-rswitch
+      * before each use in the interaction blocks below.
+      */
+     d_scalar         = rcutoff_scalar-rswitch_scalar;
+     d                = gmx_fjsp_set1_v2r8(d_scalar);
+     swV3             = gmx_fjsp_set1_v2r8(-10.0/(d_scalar*d_scalar*d_scalar));
+     swV4             = gmx_fjsp_set1_v2r8( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
+     swV5             = gmx_fjsp_set1_v2r8( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
+     swF2             = gmx_fjsp_set1_v2r8(-30.0/(d_scalar*d_scalar*d_scalar));
+     swF3             = gmx_fjsp_set1_v2r8( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
+     swF4             = gmx_fjsp_set1_v2r8(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
+     /* Initialize these to avoid compiler warnings about possibly uninitialized variables */
+     jnrA = jnrB = 0;
+     j_coord_offsetA = 0;
+     j_coord_offsetB = 0;
+     outeriter        = 0;
+     inneriter        = 0;
+     /* Start outer loop over neighborlists */
+     for(iidx=0; iidx<nri; iidx++)
+     {
+         /* Load shift vector for this list */
+         i_shift_offset   = DIM*shiftidx[iidx];
+         /* Load limits for loop over neighbors */
+         j_index_start    = jindex[iidx];
+         j_index_end      = jindex[iidx+1];
+         /* Get outer coordinate index */
+         inr              = iinr[iidx];
+         i_coord_offset   = DIM*inr;
+         /* Load i particle coords and add shift vector */
+         gmx_fjsp_load_shift_and_4rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                  &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+         fix0             = _fjsp_setzero_v2r8();
+         fiy0             = _fjsp_setzero_v2r8();
+         fiz0             = _fjsp_setzero_v2r8();
+         fix1             = _fjsp_setzero_v2r8();
+         fiy1             = _fjsp_setzero_v2r8();
+         fiz1             = _fjsp_setzero_v2r8();
+         fix2             = _fjsp_setzero_v2r8();
+         fiy2             = _fjsp_setzero_v2r8();
+         fiz2             = _fjsp_setzero_v2r8();
+         fix3             = _fjsp_setzero_v2r8();
+         fiy3             = _fjsp_setzero_v2r8();
+         fiz3             = _fjsp_setzero_v2r8();
+         /* Start inner kernel loop.
+          * Each iteration handles two j particles (A and B); if the number of j entries
+          * is odd, the last one is handled by the single-particle block after this loop.
+          */
+         for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+         {
+             /* Get j neighbor index, and coordinate index */
+             jnrA             = jjnr[jidx];
+             jnrB             = jjnr[jidx+1];
+             j_coord_offsetA  = DIM*jnrA;
+             j_coord_offsetB  = DIM*jnrB;
+             /* load j atom coordinates */
+             gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                               &jx0,&jy0,&jz0);
+             /* Calculate displacement vector */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             dx10             = _fjsp_sub_v2r8(ix1,jx0);
+             dy10             = _fjsp_sub_v2r8(iy1,jy0);
+             dz10             = _fjsp_sub_v2r8(iz1,jz0);
+             dx20             = _fjsp_sub_v2r8(ix2,jx0);
+             dy20             = _fjsp_sub_v2r8(iy2,jy0);
+             dz20             = _fjsp_sub_v2r8(iz2,jz0);
+             dx30             = _fjsp_sub_v2r8(ix3,jx0);
+             dy30             = _fjsp_sub_v2r8(iy3,jy0);
+             dz30             = _fjsp_sub_v2r8(iz3,jz0);
+             /* Calculate squared distance and things based on it */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+             rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+             rsq30            = gmx_fjsp_calc_rsq_v2r8(dx30,dy30,dz30);
+             rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+             rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+             rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+             rinv30           = gmx_fjsp_invsqrt_v2r8(rsq30);
+             rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+             rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+             rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+             rinvsq30         = _fjsp_mul_v2r8(rinv30,rinv30);
+             /* Load parameters for j particles */
+             jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+             vdwjidx0A        = 2*vdwtype[jnrA+0];
+             vdwjidx0B        = 2*vdwtype[jnrB+0];
+             fjx0             = _fjsp_setzero_v2r8();
+             fjy0             = _fjsp_setzero_v2r8();
+             fjz0             = _fjsp_setzero_v2r8();
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+             {
+             r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+             /* Compute parameters for interactions between i and j atoms */
+             gmx_fjsp_load_2pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,
+                                          vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
+             /* LENNARD-JONES DISPERSION/REPULSION
+              * Assuming _fjsp_madd_v2r8(a,b,c)=a*b+c and _fjsp_msub_v2r8(a,b,c)=a*b-c (TODO confirm):
+              *   vvdw = vvdw12/12 - vvdw6/6, with vvdw6=c6_00*rinv^6 and vvdw12=c12_00*rinv^12
+              *   d    = max(r00-rswitch,0)
+              *   sw   = 1 + d^3*(swV3 + d*(swV4 + d*swV5))
+              *   dsw  = d^2*(swF2 + d*(swF3 + d*swF4))
+              */
+             rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+             vvdw6            = _fjsp_mul_v2r8(c6_00,rinvsix);
+             vvdw12           = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
+             vvdw             = _fjsp_msub_v2r8( vvdw12,one_twelfth, _fjsp_mul_v2r8(vvdw6,one_sixth) );
+             fvdw             = _fjsp_mul_v2r8(_fjsp_sub_v2r8(vvdw12,vvdw6),rinvsq00);
+             d                = _fjsp_sub_v2r8(r00,rswitch);
+             d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+             d2               = _fjsp_mul_v2r8(d,d);
+             sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+             dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+             /* Evaluate switch function */
+             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+             fvdw             = _fjsp_msub_v2r8( fvdw,sw , _fjsp_mul_v2r8(rinv00,_fjsp_mul_v2r8(vvdw,dsw)) );
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+             fscal            = fvdw;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             
+             fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq10,rcutoff2))
+             {
+             /* Compute parameters for interactions between i and j atoms */
+             qq10             = _fjsp_mul_v2r8(iq1,jq0);
+             /* REACTION-FIELD ELECTROSTATICS */
+             felec            = _fjsp_mul_v2r8(qq10,_fjsp_msub_v2r8(rinv10,rinvsq10,krf2));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq10,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+             
+             fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq20,rcutoff2))
+             {
+             /* Compute parameters for interactions between i and j atoms */
+             qq20             = _fjsp_mul_v2r8(iq2,jq0);
+             /* REACTION-FIELD ELECTROSTATICS */
+             felec            = _fjsp_mul_v2r8(qq20,_fjsp_msub_v2r8(rinv20,rinvsq20,krf2));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq20,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+             
+             fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq30,rcutoff2))
+             {
+             /* Compute parameters for interactions between i and j atoms */
+             qq30             = _fjsp_mul_v2r8(iq3,jq0);
+             /* REACTION-FIELD ELECTROSTATICS */
+             felec            = _fjsp_mul_v2r8(qq30,_fjsp_msub_v2r8(rinv30,rinvsq30,krf2));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq30,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix3             = _fjsp_madd_v2r8(dx30,fscal,fix3);
+             fiy3             = _fjsp_madd_v2r8(dy30,fscal,fiy3);
+             fiz3             = _fjsp_madd_v2r8(dz30,fscal,fiz3);
+             
+             fjx0             = _fjsp_madd_v2r8(dx30,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy30,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz30,fscal,fjz0);
+             }
+             gmx_fjsp_decrement_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0);
+             /* Inner loop uses 161 flops */
+         }
+         if(jidx<j_index_end)
+         {
+             jnrA             = jjnr[jidx];
+             j_coord_offsetA  = DIM*jnrA;
+             /* load j atom coordinates.
+              * This block handles the one j particle (A) left over when the number of
+              * j entries is odd; it repeats the interactions of the loop above using
+              * the single-pointer load/store helpers.
+              */
+             gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                               &jx0,&jy0,&jz0);
+             /* Calculate displacement vector */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             dx10             = _fjsp_sub_v2r8(ix1,jx0);
+             dy10             = _fjsp_sub_v2r8(iy1,jy0);
+             dz10             = _fjsp_sub_v2r8(iz1,jz0);
+             dx20             = _fjsp_sub_v2r8(ix2,jx0);
+             dy20             = _fjsp_sub_v2r8(iy2,jy0);
+             dz20             = _fjsp_sub_v2r8(iz2,jz0);
+             dx30             = _fjsp_sub_v2r8(ix3,jx0);
+             dy30             = _fjsp_sub_v2r8(iy3,jy0);
+             dz30             = _fjsp_sub_v2r8(iz3,jz0);
+             /* Calculate squared distance and things based on it */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+             rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+             rsq30            = gmx_fjsp_calc_rsq_v2r8(dx30,dy30,dz30);
+             rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+             rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+             rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+             rinv30           = gmx_fjsp_invsqrt_v2r8(rsq30);
+             rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+             rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+             rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+             rinvsq30         = _fjsp_mul_v2r8(rinv30,rinv30);
+             /* Load parameters for j particles */
+             jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
+             vdwjidx0A        = 2*vdwtype[jnrA+0];
+             fjx0             = _fjsp_setzero_v2r8();
+             fjy0             = _fjsp_setzero_v2r8();
+             fjz0             = _fjsp_setzero_v2r8();
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+             {
+             r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+             /* Compute parameters for interactions between i and j atoms */
+             gmx_fjsp_load_1pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
+             /* LENNARD-JONES DISPERSION/REPULSION (same expressions as in the two-particle loop) */
+             rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+             vvdw6            = _fjsp_mul_v2r8(c6_00,rinvsix);
+             vvdw12           = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
+             vvdw             = _fjsp_msub_v2r8( vvdw12,one_twelfth, _fjsp_mul_v2r8(vvdw6,one_sixth) );
+             fvdw             = _fjsp_mul_v2r8(_fjsp_sub_v2r8(vvdw12,vvdw6),rinvsq00);
+             d                = _fjsp_sub_v2r8(r00,rswitch);
+             d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+             d2               = _fjsp_mul_v2r8(d,d);
+             sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+             dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+             /* Evaluate switch function */
+             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+             fvdw             = _fjsp_msub_v2r8( fvdw,sw , _fjsp_mul_v2r8(rinv00,_fjsp_mul_v2r8(vvdw,dsw)) );
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+             fscal            = fvdw;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force.
+              * In this single-particle block fscal is additionally combined with a zero
+              * vector through _fjsp_unpacklo_v2r8; presumably this clears the SIMD
+              * element that has no j particle so it adds nothing to the forces
+              * (TODO confirm against the intrinsic's definition).
+              */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             
+             fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq10,rcutoff2))
+             {
+             /* Compute parameters for interactions between i and j atoms */
+             qq10             = _fjsp_mul_v2r8(iq1,jq0);
+             /* REACTION-FIELD ELECTROSTATICS */
+             felec            = _fjsp_mul_v2r8(qq10,_fjsp_msub_v2r8(rinv10,rinvsq10,krf2));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq10,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+             
+             fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq20,rcutoff2))
+             {
+             /* Compute parameters for interactions between i and j atoms */
+             qq20             = _fjsp_mul_v2r8(iq2,jq0);
+             /* REACTION-FIELD ELECTROSTATICS */
+             felec            = _fjsp_mul_v2r8(qq20,_fjsp_msub_v2r8(rinv20,rinvsq20,krf2));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq20,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+             
+             fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq30,rcutoff2))
+             {
+             /* Compute parameters for interactions between i and j atoms */
+             qq30             = _fjsp_mul_v2r8(iq3,jq0);
+             /* REACTION-FIELD ELECTROSTATICS */
+             felec            = _fjsp_mul_v2r8(qq30,_fjsp_msub_v2r8(rinv30,rinvsq30,krf2));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq30,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix3             = _fjsp_madd_v2r8(dx30,fscal,fix3);
+             fiy3             = _fjsp_madd_v2r8(dy30,fscal,fiy3);
+             fiz3             = _fjsp_madd_v2r8(dz30,fscal,fiz3);
+             
+             fjx0             = _fjsp_madd_v2r8(dx30,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy30,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz30,fscal,fjz0);
+             }
+             gmx_fjsp_decrement_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0);
+             /* Inner loop uses 161 flops */
+         }
+         /* End of innermost loop */
+         gmx_fjsp_update_iforce_4atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                               f+i_coord_offset,fshift+i_shift_offset);
+         /* Increment number of inner iterations */
+         inneriter                  += j_index_end - j_index_start;
+         /* Outer loop uses 24 flops */
+     }
+     /* Increment number of outer iterations */
+     outeriter        += nri;
+     /* Update outer/inner flops */
+     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_F,outeriter*24 + inneriter*161);
+ }
index 0000000000000000000000000000000000000000,0d662508bf849576aa07f85f522dff50dbc2137c..0d662508bf849576aa07f85f522dff50dbc2137c
mode 000000,100644..100644
--- /dev/null
@@@ -1,0 -1,2123 +1,2123 @@@
+ /*
+  * This file is part of the GROMACS molecular simulation package.
+  *
+  * Copyright (c) 2012, by the GROMACS development team, led by
+  * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+  * others, as listed in the AUTHORS file in the top-level source
+  * directory and at http://www.gromacs.org.
+  *
+  * GROMACS is free software; you can redistribute it and/or
+  * modify it under the terms of the GNU Lesser General Public License
+  * as published by the Free Software Foundation; either version 2.1
+  * of the License, or (at your option) any later version.
+  *
+  * GROMACS is distributed in the hope that it will be useful,
+  * but WITHOUT ANY WARRANTY; without even the implied warranty of
+  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  * Lesser General Public License for more details.
+  *
+  * You should have received a copy of the GNU Lesser General Public
+  * License along with GROMACS; if not, see
+  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+  *
+  * If you want to redistribute modifications to GROMACS, please
+  * consider that scientific software is very special. Version
+  * control is crucial - bugs must be traceable. We will be happy to
+  * consider code for inclusion in the official distribution, but
+  * derived work must not be called official GROMACS. Details are found
+  * in the README & COPYING files - if they are missing, get the
+  * official version at http://www.gromacs.org.
+  *
+  * To help us fund GROMACS development, we humbly ask that you cite
+  * the research papers on the package. Check out http://www.gromacs.org.
+  */
+ /*
+  * Note: this file was generated by the GROMACS sparc64_hpc_ace_double kernel generator.
+  */
+ #ifdef HAVE_CONFIG_H
+ #include <config.h>
+ #endif
+ #include <math.h>
+ #include "../nb_kernel.h"
+ #include "types/simple.h"
+ #include "vec.h"
+ #include "nrnb.h"
+ #include "kernelutil_sparc64_hpc_ace_double.h"
+ /*
+  * Gromacs nonbonded kernel:   nb_kernel_ElecRFCut_VdwLJSw_GeomW4W4_VF_sparc64_hpc_ace_double
+  * Electrostatics interaction: ReactionField
+  * VdW interaction:            LennardJones
+  * Geometry:                   Water4-Water4
+  * Calculate force/pot:        PotentialAndForce
+  */
+ void
+ nb_kernel_ElecRFCut_VdwLJSw_GeomW4W4_VF_sparc64_hpc_ace_double
+                     (t_nblist * gmx_restrict                nlist,
+                      rvec * gmx_restrict                    xx,
+                      rvec * gmx_restrict                    ff,
+                      t_forcerec * gmx_restrict              fr,
+                      t_mdatoms * gmx_restrict               mdatoms,
+                      nb_kernel_data_t * gmx_restrict        kernel_data,
+                      t_nrnb * gmx_restrict                  nrnb)
+ {
+     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+      * just 0 for non-waters.
+      * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+      * jnr indices corresponding to data put in the two positions in the SIMD register.
+      */
+     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+     int              jnrA,jnrB;
+     int              j_coord_offsetA,j_coord_offsetB;
+     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+     real             rcutoff_scalar;
+     real             *shiftvec,*fshift,*x,*f;
+     _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+     int              vdwioffset0;
+     _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+     int              vdwioffset1;
+     _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+     int              vdwioffset2;
+     _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+     int              vdwioffset3;
+     _fjsp_v2r8       ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+     int              vdwjidx0A,vdwjidx0B;
+     _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+     int              vdwjidx1A,vdwjidx1B;
+     _fjsp_v2r8       jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+     int              vdwjidx2A,vdwjidx2B;
+     _fjsp_v2r8       jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+     int              vdwjidx3A,vdwjidx3B;
+     _fjsp_v2r8       jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
+     _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+     _fjsp_v2r8       dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+     _fjsp_v2r8       dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+     _fjsp_v2r8       dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
+     _fjsp_v2r8       dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+     _fjsp_v2r8       dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+     _fjsp_v2r8       dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
+     _fjsp_v2r8       dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
+     _fjsp_v2r8       dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
+     _fjsp_v2r8       dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
+     _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+     real             *charge;
+     int              nvdwtype;
+     _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+     int              *vdwtype;
+     real             *vdwparam;
+     _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+     _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+     _fjsp_v2r8       rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
+     real             rswitch_scalar,d_scalar;
+     _fjsp_v2r8       itab_tmp;
+     _fjsp_v2r8       dummy_mask,cutoff_mask;
+     _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+     _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+     union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+     x                = xx[0];
+     f                = ff[0];
+     nri              = nlist->nri;
+     iinr             = nlist->iinr;
+     jindex           = nlist->jindex;
+     jjnr             = nlist->jjnr;
+     shiftidx         = nlist->shift;
+     gid              = nlist->gid;
+     shiftvec         = fr->shift_vec[0];
+     fshift           = fr->fshift[0];
+     facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+     charge           = mdatoms->chargeA;
+     krf              = gmx_fjsp_set1_v2r8(fr->ic->k_rf);
+     krf2             = gmx_fjsp_set1_v2r8(fr->ic->k_rf*2.0);
+     crf              = gmx_fjsp_set1_v2r8(fr->ic->c_rf);
+     nvdwtype         = fr->ntype;
+     vdwparam         = fr->nbfp;
+     vdwtype          = mdatoms->typeA;
+     /* Setup water-specific parameters */
+     inr              = nlist->iinr[0];
+     iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+     iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+     iq3              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+3]));
+     vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+     jq1              = gmx_fjsp_set1_v2r8(charge[inr+1]);
+     jq2              = gmx_fjsp_set1_v2r8(charge[inr+2]);
+     jq3              = gmx_fjsp_set1_v2r8(charge[inr+3]);
+     vdwjidx0A        = 2*vdwtype[inr+0];
+     c6_00            = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset0+vdwjidx0A]);
+     c12_00           = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset0+vdwjidx0A+1]);
+     qq11             = _fjsp_mul_v2r8(iq1,jq1);
+     qq12             = _fjsp_mul_v2r8(iq1,jq2);
+     qq13             = _fjsp_mul_v2r8(iq1,jq3);
+     qq21             = _fjsp_mul_v2r8(iq2,jq1);
+     qq22             = _fjsp_mul_v2r8(iq2,jq2);
+     qq23             = _fjsp_mul_v2r8(iq2,jq3);
+     qq31             = _fjsp_mul_v2r8(iq3,jq1);
+     qq32             = _fjsp_mul_v2r8(iq3,jq2);
+     qq33             = _fjsp_mul_v2r8(iq3,jq3);
+     /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+     rcutoff_scalar   = fr->rcoulomb;
+     rcutoff          = gmx_fjsp_set1_v2r8(rcutoff_scalar);
+     rcutoff2         = _fjsp_mul_v2r8(rcutoff,rcutoff);
+     rswitch_scalar   = fr->rvdw_switch;
+     rswitch          = gmx_fjsp_set1_v2r8(rswitch_scalar);
+     /* Setup switch parameters */
+     d_scalar         = rcutoff_scalar-rswitch_scalar;
+     d                = gmx_fjsp_set1_v2r8(d_scalar);
+     swV3             = gmx_fjsp_set1_v2r8(-10.0/(d_scalar*d_scalar*d_scalar));
+     swV4             = gmx_fjsp_set1_v2r8( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
+     swV5             = gmx_fjsp_set1_v2r8( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
+     swF2             = gmx_fjsp_set1_v2r8(-30.0/(d_scalar*d_scalar*d_scalar));
+     swF3             = gmx_fjsp_set1_v2r8( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
+     swF4             = gmx_fjsp_set1_v2r8(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
+     /* Avoid stupid compiler warnings */
+     jnrA = jnrB = 0;
+     j_coord_offsetA = 0;
+     j_coord_offsetB = 0;
+     outeriter        = 0;
+     inneriter        = 0;
+     /* Start outer loop over neighborlists */
+     for(iidx=0; iidx<nri; iidx++)
+     {
+         /* Load shift vector for this list */
+         i_shift_offset   = DIM*shiftidx[iidx];
+         /* Load limits for loop over neighbors */
+         j_index_start    = jindex[iidx];
+         j_index_end      = jindex[iidx+1];
+         /* Get outer coordinate index */
+         inr              = iinr[iidx];
+         i_coord_offset   = DIM*inr;
+         /* Load i particle coords and add shift vector */
+         gmx_fjsp_load_shift_and_4rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                  &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+         fix0             = _fjsp_setzero_v2r8();
+         fiy0             = _fjsp_setzero_v2r8();
+         fiz0             = _fjsp_setzero_v2r8();
+         fix1             = _fjsp_setzero_v2r8();
+         fiy1             = _fjsp_setzero_v2r8();
+         fiz1             = _fjsp_setzero_v2r8();
+         fix2             = _fjsp_setzero_v2r8();
+         fiy2             = _fjsp_setzero_v2r8();
+         fiz2             = _fjsp_setzero_v2r8();
+         fix3             = _fjsp_setzero_v2r8();
+         fiy3             = _fjsp_setzero_v2r8();
+         fiz3             = _fjsp_setzero_v2r8();
+         /* Reset potential sums */
+         velecsum         = _fjsp_setzero_v2r8();
+         vvdwsum          = _fjsp_setzero_v2r8();
+         /* Start inner kernel loop */
+         for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+         {
+             /* Get j neighbor index, and coordinate index */
+             jnrA             = jjnr[jidx];
+             jnrB             = jjnr[jidx+1];
+             j_coord_offsetA  = DIM*jnrA;
+             j_coord_offsetB  = DIM*jnrB;
+             /* load j atom coordinates */
+             gmx_fjsp_load_4rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
+                                               &jy2,&jz2,&jx3,&jy3,&jz3);
+             /* Calculate displacement vector */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             dx11             = _fjsp_sub_v2r8(ix1,jx1);
+             dy11             = _fjsp_sub_v2r8(iy1,jy1);
+             dz11             = _fjsp_sub_v2r8(iz1,jz1);
+             dx12             = _fjsp_sub_v2r8(ix1,jx2);
+             dy12             = _fjsp_sub_v2r8(iy1,jy2);
+             dz12             = _fjsp_sub_v2r8(iz1,jz2);
+             dx13             = _fjsp_sub_v2r8(ix1,jx3);
+             dy13             = _fjsp_sub_v2r8(iy1,jy3);
+             dz13             = _fjsp_sub_v2r8(iz1,jz3);
+             dx21             = _fjsp_sub_v2r8(ix2,jx1);
+             dy21             = _fjsp_sub_v2r8(iy2,jy1);
+             dz21             = _fjsp_sub_v2r8(iz2,jz1);
+             dx22             = _fjsp_sub_v2r8(ix2,jx2);
+             dy22             = _fjsp_sub_v2r8(iy2,jy2);
+             dz22             = _fjsp_sub_v2r8(iz2,jz2);
+             dx23             = _fjsp_sub_v2r8(ix2,jx3);
+             dy23             = _fjsp_sub_v2r8(iy2,jy3);
+             dz23             = _fjsp_sub_v2r8(iz2,jz3);
+             dx31             = _fjsp_sub_v2r8(ix3,jx1);
+             dy31             = _fjsp_sub_v2r8(iy3,jy1);
+             dz31             = _fjsp_sub_v2r8(iz3,jz1);
+             dx32             = _fjsp_sub_v2r8(ix3,jx2);
+             dy32             = _fjsp_sub_v2r8(iy3,jy2);
+             dz32             = _fjsp_sub_v2r8(iz3,jz2);
+             dx33             = _fjsp_sub_v2r8(ix3,jx3);
+             dy33             = _fjsp_sub_v2r8(iy3,jy3);
+             dz33             = _fjsp_sub_v2r8(iz3,jz3);
+             /* Calculate squared distance and things based on it */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+             rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+             rsq13            = gmx_fjsp_calc_rsq_v2r8(dx13,dy13,dz13);
+             rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+             rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+             rsq23            = gmx_fjsp_calc_rsq_v2r8(dx23,dy23,dz23);
+             rsq31            = gmx_fjsp_calc_rsq_v2r8(dx31,dy31,dz31);
+             rsq32            = gmx_fjsp_calc_rsq_v2r8(dx32,dy32,dz32);
+             rsq33            = gmx_fjsp_calc_rsq_v2r8(dx33,dy33,dz33);
+             rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+             rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+             rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+             rinv13           = gmx_fjsp_invsqrt_v2r8(rsq13);
+             rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+             rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+             rinv23           = gmx_fjsp_invsqrt_v2r8(rsq23);
+             rinv31           = gmx_fjsp_invsqrt_v2r8(rsq31);
+             rinv32           = gmx_fjsp_invsqrt_v2r8(rsq32);
+             rinv33           = gmx_fjsp_invsqrt_v2r8(rsq33);
+             rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+             rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+             rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+             rinvsq13         = _fjsp_mul_v2r8(rinv13,rinv13);
+             rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+             rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+             rinvsq23         = _fjsp_mul_v2r8(rinv23,rinv23);
+             rinvsq31         = _fjsp_mul_v2r8(rinv31,rinv31);
+             rinvsq32         = _fjsp_mul_v2r8(rinv32,rinv32);
+             rinvsq33         = _fjsp_mul_v2r8(rinv33,rinv33);
+             fjx0             = _fjsp_setzero_v2r8();
+             fjy0             = _fjsp_setzero_v2r8();
+             fjz0             = _fjsp_setzero_v2r8();
+             fjx1             = _fjsp_setzero_v2r8();
+             fjy1             = _fjsp_setzero_v2r8();
+             fjz1             = _fjsp_setzero_v2r8();
+             fjx2             = _fjsp_setzero_v2r8();
+             fjy2             = _fjsp_setzero_v2r8();
+             fjz2             = _fjsp_setzero_v2r8();
+             fjx3             = _fjsp_setzero_v2r8();
+             fjy3             = _fjsp_setzero_v2r8();
+             fjz3             = _fjsp_setzero_v2r8();
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+             {
+             r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+             /* LENNARD-JONES DISPERSION/REPULSION */
+             rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+             vvdw6            = _fjsp_mul_v2r8(c6_00,rinvsix);
+             vvdw12           = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
+             vvdw             = _fjsp_msub_v2r8( vvdw12,one_twelfth, _fjsp_mul_v2r8(vvdw6,one_sixth) );
+             fvdw             = _fjsp_mul_v2r8(_fjsp_sub_v2r8(vvdw12,vvdw6),rinvsq00);
+             d                = _fjsp_sub_v2r8(r00,rswitch);
+             d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+             d2               = _fjsp_mul_v2r8(d,d);
+             sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+             dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+             /* Evaluate switch function */
+             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+             fvdw             = _fjsp_msub_v2r8( fvdw,sw , _fjsp_mul_v2r8(rinv00,_fjsp_mul_v2r8(vvdw,dsw)) );
+             vvdw             = _fjsp_mul_v2r8(vvdw,sw);
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             vvdw             = _fjsp_and_v2r8(vvdw,cutoff_mask);
+             vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+             fscal            = fvdw;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             
+             fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq11,rcutoff2))
+             {
+             /* REACTION-FIELD ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq11,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq11,rinv11),crf));
+             felec            = _fjsp_mul_v2r8(qq11,_fjsp_msub_v2r8(rinv11,rinvsq11,krf2));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq11,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+             
+             fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq12,rcutoff2))
+             {
+             /* REACTION-FIELD ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq12,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq12,rinv12),crf));
+             felec            = _fjsp_mul_v2r8(qq12,_fjsp_msub_v2r8(rinv12,rinvsq12,krf2));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq12,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+             
+             fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq13,rcutoff2))
+             {
+             /* REACTION-FIELD ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq13,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq13,rinv13),crf));
+             felec            = _fjsp_mul_v2r8(qq13,_fjsp_msub_v2r8(rinv13,rinvsq13,krf2));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq13,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx13,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy13,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz13,fscal,fiz1);
+             
+             fjx3             = _fjsp_madd_v2r8(dx13,fscal,fjx3);
+             fjy3             = _fjsp_madd_v2r8(dy13,fscal,fjy3);
+             fjz3             = _fjsp_madd_v2r8(dz13,fscal,fjz3);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq21,rcutoff2))
+             {
+             /* REACTION-FIELD ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq21,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq21,rinv21),crf));
+             felec            = _fjsp_mul_v2r8(qq21,_fjsp_msub_v2r8(rinv21,rinvsq21,krf2));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq21,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+             
+             fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq22,rcutoff2))
+             {
+             /* REACTION-FIELD ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq22,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq22,rinv22),crf));
+             felec            = _fjsp_mul_v2r8(qq22,_fjsp_msub_v2r8(rinv22,rinvsq22,krf2));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq22,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+             
+             fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq23,rcutoff2))
+             {
+             /* REACTION-FIELD ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq23,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq23,rinv23),crf));
+             felec            = _fjsp_mul_v2r8(qq23,_fjsp_msub_v2r8(rinv23,rinvsq23,krf2));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq23,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx23,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy23,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz23,fscal,fiz2);
+             
+             fjx3             = _fjsp_madd_v2r8(dx23,fscal,fjx3);
+             fjy3             = _fjsp_madd_v2r8(dy23,fscal,fjy3);
+             fjz3             = _fjsp_madd_v2r8(dz23,fscal,fjz3);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq31,rcutoff2))
+             {
+             /* REACTION-FIELD ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq31,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq31,rinv31),crf));
+             felec            = _fjsp_mul_v2r8(qq31,_fjsp_msub_v2r8(rinv31,rinvsq31,krf2));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq31,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix3             = _fjsp_madd_v2r8(dx31,fscal,fix3);
+             fiy3             = _fjsp_madd_v2r8(dy31,fscal,fiy3);
+             fiz3             = _fjsp_madd_v2r8(dz31,fscal,fiz3);
+             
+             fjx1             = _fjsp_madd_v2r8(dx31,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy31,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz31,fscal,fjz1);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq32,rcutoff2))
+             {
+             /* REACTION-FIELD ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq32,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq32,rinv32),crf));
+             felec            = _fjsp_mul_v2r8(qq32,_fjsp_msub_v2r8(rinv32,rinvsq32,krf2));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq32,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix3             = _fjsp_madd_v2r8(dx32,fscal,fix3);
+             fiy3             = _fjsp_madd_v2r8(dy32,fscal,fiy3);
+             fiz3             = _fjsp_madd_v2r8(dz32,fscal,fiz3);
+             
+             fjx2             = _fjsp_madd_v2r8(dx32,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy32,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz32,fscal,fjz2);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq33,rcutoff2))
+             {
+             /* REACTION-FIELD ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq33,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq33,rinv33),crf));
+             felec            = _fjsp_mul_v2r8(qq33,_fjsp_msub_v2r8(rinv33,rinvsq33,krf2));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq33,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix3             = _fjsp_madd_v2r8(dx33,fscal,fix3);
+             fiy3             = _fjsp_madd_v2r8(dy33,fscal,fiy3);
+             fiz3             = _fjsp_madd_v2r8(dz33,fscal,fiz3);
+             
+             fjx3             = _fjsp_madd_v2r8(dx33,fscal,fjx3);
+             fjy3             = _fjsp_madd_v2r8(dy33,fscal,fjy3);
+             fjz3             = _fjsp_madd_v2r8(dz33,fscal,fjz3);
+             }
+             gmx_fjsp_decrement_4rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+             /* Inner loop uses 416 flops */
+         }
+         if(jidx<j_index_end)
+         {
+             jnrA             = jjnr[jidx];
+             j_coord_offsetA  = DIM*jnrA;
+             /* load j atom coordinates */
+             gmx_fjsp_load_4rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
+                                               &jy2,&jz2,&jx3,&jy3,&jz3);
+             /* Calculate displacement vector */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             dx11             = _fjsp_sub_v2r8(ix1,jx1);
+             dy11             = _fjsp_sub_v2r8(iy1,jy1);
+             dz11             = _fjsp_sub_v2r8(iz1,jz1);
+             dx12             = _fjsp_sub_v2r8(ix1,jx2);
+             dy12             = _fjsp_sub_v2r8(iy1,jy2);
+             dz12             = _fjsp_sub_v2r8(iz1,jz2);
+             dx13             = _fjsp_sub_v2r8(ix1,jx3);
+             dy13             = _fjsp_sub_v2r8(iy1,jy3);
+             dz13             = _fjsp_sub_v2r8(iz1,jz3);
+             dx21             = _fjsp_sub_v2r8(ix2,jx1);
+             dy21             = _fjsp_sub_v2r8(iy2,jy1);
+             dz21             = _fjsp_sub_v2r8(iz2,jz1);
+             dx22             = _fjsp_sub_v2r8(ix2,jx2);
+             dy22             = _fjsp_sub_v2r8(iy2,jy2);
+             dz22             = _fjsp_sub_v2r8(iz2,jz2);
+             dx23             = _fjsp_sub_v2r8(ix2,jx3);
+             dy23             = _fjsp_sub_v2r8(iy2,jy3);
+             dz23             = _fjsp_sub_v2r8(iz2,jz3);
+             dx31             = _fjsp_sub_v2r8(ix3,jx1);
+             dy31             = _fjsp_sub_v2r8(iy3,jy1);
+             dz31             = _fjsp_sub_v2r8(iz3,jz1);
+             dx32             = _fjsp_sub_v2r8(ix3,jx2);
+             dy32             = _fjsp_sub_v2r8(iy3,jy2);
+             dz32             = _fjsp_sub_v2r8(iz3,jz2);
+             dx33             = _fjsp_sub_v2r8(ix3,jx3);
+             dy33             = _fjsp_sub_v2r8(iy3,jy3);
+             dz33             = _fjsp_sub_v2r8(iz3,jz3);
+             /* Calculate squared distance and things based on it */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+             rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+             rsq13            = gmx_fjsp_calc_rsq_v2r8(dx13,dy13,dz13);
+             rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+             rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+             rsq23            = gmx_fjsp_calc_rsq_v2r8(dx23,dy23,dz23);
+             rsq31            = gmx_fjsp_calc_rsq_v2r8(dx31,dy31,dz31);
+             rsq32            = gmx_fjsp_calc_rsq_v2r8(dx32,dy32,dz32);
+             rsq33            = gmx_fjsp_calc_rsq_v2r8(dx33,dy33,dz33);
+             rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+             rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+             rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+             rinv13           = gmx_fjsp_invsqrt_v2r8(rsq13);
+             rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+             rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+             rinv23           = gmx_fjsp_invsqrt_v2r8(rsq23);
+             rinv31           = gmx_fjsp_invsqrt_v2r8(rsq31);
+             rinv32           = gmx_fjsp_invsqrt_v2r8(rsq32);
+             rinv33           = gmx_fjsp_invsqrt_v2r8(rsq33);
+             rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+             rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+             rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+             rinvsq13         = _fjsp_mul_v2r8(rinv13,rinv13);
+             rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+             rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+             rinvsq23         = _fjsp_mul_v2r8(rinv23,rinv23);
+             rinvsq31         = _fjsp_mul_v2r8(rinv31,rinv31);
+             rinvsq32         = _fjsp_mul_v2r8(rinv32,rinv32);
+             rinvsq33         = _fjsp_mul_v2r8(rinv33,rinv33);
+             fjx0             = _fjsp_setzero_v2r8();
+             fjy0             = _fjsp_setzero_v2r8();
+             fjz0             = _fjsp_setzero_v2r8();
+             fjx1             = _fjsp_setzero_v2r8();
+             fjy1             = _fjsp_setzero_v2r8();
+             fjz1             = _fjsp_setzero_v2r8();
+             fjx2             = _fjsp_setzero_v2r8();
+             fjy2             = _fjsp_setzero_v2r8();
+             fjz2             = _fjsp_setzero_v2r8();
+             fjx3             = _fjsp_setzero_v2r8();
+             fjy3             = _fjsp_setzero_v2r8();
+             fjz3             = _fjsp_setzero_v2r8();
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+             {
+             r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+             /* LENNARD-JONES DISPERSION/REPULSION */
+             rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+             vvdw6            = _fjsp_mul_v2r8(c6_00,rinvsix);
+             vvdw12           = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
+             vvdw             = _fjsp_msub_v2r8( vvdw12,one_twelfth, _fjsp_mul_v2r8(vvdw6,one_sixth) );
+             fvdw             = _fjsp_mul_v2r8(_fjsp_sub_v2r8(vvdw12,vvdw6),rinvsq00);
+             d                = _fjsp_sub_v2r8(r00,rswitch);
+             d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+             d2               = _fjsp_mul_v2r8(d,d);
+             sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+             dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+             /* Evaluate switch function */
+             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+             fvdw             = _fjsp_msub_v2r8( fvdw,sw , _fjsp_mul_v2r8(rinv00,_fjsp_mul_v2r8(vvdw,dsw)) );
+             vvdw             = _fjsp_mul_v2r8(vvdw,sw);
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             vvdw             = _fjsp_and_v2r8(vvdw,cutoff_mask);
+             vvdw             = _fjsp_unpacklo_v2r8(vvdw,_fjsp_setzero_v2r8());
+             vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+             fscal            = fvdw;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             
+             fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq11,rcutoff2))
+             {
+             /* REACTION-FIELD ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq11,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq11,rinv11),crf));
+             felec            = _fjsp_mul_v2r8(qq11,_fjsp_msub_v2r8(rinv11,rinvsq11,krf2));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq11,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+             
+             fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq12,rcutoff2))
+             {
+             /* REACTION-FIELD ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq12,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq12,rinv12),crf));
+             felec            = _fjsp_mul_v2r8(qq12,_fjsp_msub_v2r8(rinv12,rinvsq12,krf2));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq12,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+             
+             fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq13,rcutoff2))
+             {
+             /* REACTION-FIELD ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq13,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq13,rinv13),crf));
+             felec            = _fjsp_mul_v2r8(qq13,_fjsp_msub_v2r8(rinv13,rinvsq13,krf2));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq13,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx13,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy13,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz13,fscal,fiz1);
+             
+             fjx3             = _fjsp_madd_v2r8(dx13,fscal,fjx3);
+             fjy3             = _fjsp_madd_v2r8(dy13,fscal,fjy3);
+             fjz3             = _fjsp_madd_v2r8(dz13,fscal,fjz3);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq21,rcutoff2))
+             {
+             /* REACTION-FIELD ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq21,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq21,rinv21),crf));
+             felec            = _fjsp_mul_v2r8(qq21,_fjsp_msub_v2r8(rinv21,rinvsq21,krf2));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq21,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+             
+             fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq22,rcutoff2))
+             {
+             /* REACTION-FIELD ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq22,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq22,rinv22),crf));
+             felec            = _fjsp_mul_v2r8(qq22,_fjsp_msub_v2r8(rinv22,rinvsq22,krf2));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq22,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+             
+             fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq23,rcutoff2))
+             {
+             /* REACTION-FIELD ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq23,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq23,rinv23),crf));
+             felec            = _fjsp_mul_v2r8(qq23,_fjsp_msub_v2r8(rinv23,rinvsq23,krf2));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq23,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx23,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy23,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz23,fscal,fiz2);
+             
+             fjx3             = _fjsp_madd_v2r8(dx23,fscal,fjx3);
+             fjy3             = _fjsp_madd_v2r8(dy23,fscal,fjy3);
+             fjz3             = _fjsp_madd_v2r8(dz23,fscal,fjz3);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq31,rcutoff2))
+             {
+             /* REACTION-FIELD ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq31,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq31,rinv31),crf));
+             felec            = _fjsp_mul_v2r8(qq31,_fjsp_msub_v2r8(rinv31,rinvsq31,krf2));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq31,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix3             = _fjsp_madd_v2r8(dx31,fscal,fix3);
+             fiy3             = _fjsp_madd_v2r8(dy31,fscal,fiy3);
+             fiz3             = _fjsp_madd_v2r8(dz31,fscal,fiz3);
+             
+             fjx1             = _fjsp_madd_v2r8(dx31,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy31,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz31,fscal,fjz1);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq32,rcutoff2))
+             {
+             /* REACTION-FIELD ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq32,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq32,rinv32),crf));
+             felec            = _fjsp_mul_v2r8(qq32,_fjsp_msub_v2r8(rinv32,rinvsq32,krf2));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq32,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix3             = _fjsp_madd_v2r8(dx32,fscal,fix3);
+             fiy3             = _fjsp_madd_v2r8(dy32,fscal,fiy3);
+             fiz3             = _fjsp_madd_v2r8(dz32,fscal,fiz3);
+             
+             fjx2             = _fjsp_madd_v2r8(dx32,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy32,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz32,fscal,fjz2);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq33,rcutoff2))
+             {
+             /* REACTION-FIELD ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq33,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq33,rinv33),crf));
+             felec            = _fjsp_mul_v2r8(qq33,_fjsp_msub_v2r8(rinv33,rinvsq33,krf2));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq33,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix3             = _fjsp_madd_v2r8(dx33,fscal,fix3);
+             fiy3             = _fjsp_madd_v2r8(dy33,fscal,fiy3);
+             fiz3             = _fjsp_madd_v2r8(dz33,fscal,fiz3);
+             
+             fjx3             = _fjsp_madd_v2r8(dx33,fscal,fjx3);
+             fjy3             = _fjsp_madd_v2r8(dy33,fscal,fjy3);
+             fjz3             = _fjsp_madd_v2r8(dz33,fscal,fjz3);
+             }
+             gmx_fjsp_decrement_4rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+             /* Inner loop uses 416 flops */
+         }
+         /* End of innermost loop */
+         gmx_fjsp_update_iforce_4atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                               f+i_coord_offset,fshift+i_shift_offset);
+         ggid                        = gid[iidx];
+         /* Update potential energies */
+         gmx_fjsp_update_1pot_v2r8(velecsum,kernel_data->energygrp_elec+ggid);
+         gmx_fjsp_update_1pot_v2r8(vvdwsum,kernel_data->energygrp_vdw+ggid);
+         /* Increment number of inner iterations */
+         inneriter                  += j_index_end - j_index_start;
+         /* Outer loop uses 26 flops */
+     }
+     /* Increment number of outer iterations */
+     outeriter        += nri;
+     /* Update outer/inner flops */
+     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_VF,outeriter*26 + inneriter*416);
+ }
+ /*
+  * Gromacs nonbonded kernel:   nb_kernel_ElecRFCut_VdwLJSw_GeomW4W4_F_sparc64_hpc_ace_double
+  * Electrostatics interaction: ReactionField
+  * VdW interaction:            LennardJones
+  * Geometry:                   Water4-Water4
+  * Calculate force/pot:        Force
+  */
+ void
+ nb_kernel_ElecRFCut_VdwLJSw_GeomW4W4_F_sparc64_hpc_ace_double
+                     (t_nblist * gmx_restrict                nlist,
+                      rvec * gmx_restrict                    xx,
+                      rvec * gmx_restrict                    ff,
+                      t_forcerec * gmx_restrict              fr,
+                      t_mdatoms * gmx_restrict               mdatoms,
+                      nb_kernel_data_t * gmx_restrict        kernel_data,
+                      t_nrnb * gmx_restrict                  nrnb)
+ {
+     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+      * just 0 for non-waters.
+      * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+      * jnr indices corresponding to data put in the two positions in the SIMD register.
+      */
+     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+     int              jnrA,jnrB;
+     int              j_coord_offsetA,j_coord_offsetB;
+     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+     real             rcutoff_scalar;
+     real             *shiftvec,*fshift,*x,*f;
+     _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+     int              vdwioffset0;
+     _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+     int              vdwioffset1;
+     _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+     int              vdwioffset2;
+     _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+     int              vdwioffset3;
+     _fjsp_v2r8       ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+     int              vdwjidx0A,vdwjidx0B;
+     _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+     int              vdwjidx1A,vdwjidx1B;
+     _fjsp_v2r8       jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+     int              vdwjidx2A,vdwjidx2B;
+     _fjsp_v2r8       jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+     int              vdwjidx3A,vdwjidx3B;
+     _fjsp_v2r8       jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
+     _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+     _fjsp_v2r8       dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+     _fjsp_v2r8       dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+     _fjsp_v2r8       dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
+     _fjsp_v2r8       dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+     _fjsp_v2r8       dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+     _fjsp_v2r8       dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
+     _fjsp_v2r8       dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
+     _fjsp_v2r8       dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
+     _fjsp_v2r8       dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
+     _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+     real             *charge;
+     int              nvdwtype;
+     _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+     int              *vdwtype;
+     real             *vdwparam;
+     _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+     _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+     _fjsp_v2r8       rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
+     real             rswitch_scalar,d_scalar;
+     _fjsp_v2r8       itab_tmp;
+     _fjsp_v2r8       dummy_mask,cutoff_mask;
+     _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+     _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+     union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+     x                = xx[0];
+     f                = ff[0];
+     nri              = nlist->nri;
+     iinr             = nlist->iinr;
+     jindex           = nlist->jindex;
+     jjnr             = nlist->jjnr;
+     shiftidx         = nlist->shift;
+     gid              = nlist->gid;
+     shiftvec         = fr->shift_vec[0];
+     fshift           = fr->fshift[0];
+     facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+     charge           = mdatoms->chargeA;
+     krf              = gmx_fjsp_set1_v2r8(fr->ic->k_rf);
+     krf2             = gmx_fjsp_set1_v2r8(fr->ic->k_rf*2.0);
+     crf              = gmx_fjsp_set1_v2r8(fr->ic->c_rf);
+     nvdwtype         = fr->ntype;
+     vdwparam         = fr->nbfp;
+     vdwtype          = mdatoms->typeA;
+     /* Setup water-specific parameters */
+     inr              = nlist->iinr[0];
+     iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+     iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+     iq3              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+3]));
+     vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+     jq1              = gmx_fjsp_set1_v2r8(charge[inr+1]);
+     jq2              = gmx_fjsp_set1_v2r8(charge[inr+2]);
+     jq3              = gmx_fjsp_set1_v2r8(charge[inr+3]);
+     vdwjidx0A        = 2*vdwtype[inr+0];
+     c6_00            = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset0+vdwjidx0A]);
+     c12_00           = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset0+vdwjidx0A+1]);
+     qq11             = _fjsp_mul_v2r8(iq1,jq1);
+     qq12             = _fjsp_mul_v2r8(iq1,jq2);
+     qq13             = _fjsp_mul_v2r8(iq1,jq3);
+     qq21             = _fjsp_mul_v2r8(iq2,jq1);
+     qq22             = _fjsp_mul_v2r8(iq2,jq2);
+     qq23             = _fjsp_mul_v2r8(iq2,jq3);
+     qq31             = _fjsp_mul_v2r8(iq3,jq1);
+     qq32             = _fjsp_mul_v2r8(iq3,jq2);
+     qq33             = _fjsp_mul_v2r8(iq3,jq3);
+     /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+     rcutoff_scalar   = fr->rcoulomb;
+     rcutoff          = gmx_fjsp_set1_v2r8(rcutoff_scalar);
+     rcutoff2         = _fjsp_mul_v2r8(rcutoff,rcutoff);
+     rswitch_scalar   = fr->rvdw_switch;
+     rswitch          = gmx_fjsp_set1_v2r8(rswitch_scalar);
+     /* Setup switch parameters */
+     d_scalar         = rcutoff_scalar-rswitch_scalar;
+     d                = gmx_fjsp_set1_v2r8(d_scalar);
+     swV3             = gmx_fjsp_set1_v2r8(-10.0/(d_scalar*d_scalar*d_scalar));
+     swV4             = gmx_fjsp_set1_v2r8( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
+     swV5             = gmx_fjsp_set1_v2r8( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
+     swF2             = gmx_fjsp_set1_v2r8(-30.0/(d_scalar*d_scalar*d_scalar));
+     swF3             = gmx_fjsp_set1_v2r8( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
+     swF4             = gmx_fjsp_set1_v2r8(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
+     /* Avoid stupid compiler warnings */
+     jnrA = jnrB = 0;
+     j_coord_offsetA = 0;
+     j_coord_offsetB = 0;
+     outeriter        = 0;
+     inneriter        = 0;
+     /* Start outer loop over neighborlists */
+     for(iidx=0; iidx<nri; iidx++)
+     {
+         /* Load shift vector for this list */
+         i_shift_offset   = DIM*shiftidx[iidx];
+         /* Load limits for loop over neighbors */
+         j_index_start    = jindex[iidx];
+         j_index_end      = jindex[iidx+1];
+         /* Get outer coordinate index */
+         inr              = iinr[iidx];
+         i_coord_offset   = DIM*inr;
+         /* Load i particle coords and add shift vector */
+         gmx_fjsp_load_shift_and_4rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                  &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+         fix0             = _fjsp_setzero_v2r8();
+         fiy0             = _fjsp_setzero_v2r8();
+         fiz0             = _fjsp_setzero_v2r8();
+         fix1             = _fjsp_setzero_v2r8();
+         fiy1             = _fjsp_setzero_v2r8();
+         fiz1             = _fjsp_setzero_v2r8();
+         fix2             = _fjsp_setzero_v2r8();
+         fiy2             = _fjsp_setzero_v2r8();
+         fiz2             = _fjsp_setzero_v2r8();
+         fix3             = _fjsp_setzero_v2r8();
+         fiy3             = _fjsp_setzero_v2r8();
+         fiz3             = _fjsp_setzero_v2r8();
+         /* Start inner kernel loop */
+         for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+         {
+             /* Get j neighbor index, and coordinate index */
+             jnrA             = jjnr[jidx];
+             jnrB             = jjnr[jidx+1];
+             j_coord_offsetA  = DIM*jnrA;
+             j_coord_offsetB  = DIM*jnrB;
+             /* load j atom coordinates */
+             gmx_fjsp_load_4rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
+                                               &jy2,&jz2,&jx3,&jy3,&jz3);
+             /* Calculate displacement vector */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             dx11             = _fjsp_sub_v2r8(ix1,jx1);
+             dy11             = _fjsp_sub_v2r8(iy1,jy1);
+             dz11             = _fjsp_sub_v2r8(iz1,jz1);
+             dx12             = _fjsp_sub_v2r8(ix1,jx2);
+             dy12             = _fjsp_sub_v2r8(iy1,jy2);
+             dz12             = _fjsp_sub_v2r8(iz1,jz2);
+             dx13             = _fjsp_sub_v2r8(ix1,jx3);
+             dy13             = _fjsp_sub_v2r8(iy1,jy3);
+             dz13             = _fjsp_sub_v2r8(iz1,jz3);
+             dx21             = _fjsp_sub_v2r8(ix2,jx1);
+             dy21             = _fjsp_sub_v2r8(iy2,jy1);
+             dz21             = _fjsp_sub_v2r8(iz2,jz1);
+             dx22             = _fjsp_sub_v2r8(ix2,jx2);
+             dy22             = _fjsp_sub_v2r8(iy2,jy2);
+             dz22             = _fjsp_sub_v2r8(iz2,jz2);
+             dx23             = _fjsp_sub_v2r8(ix2,jx3);
+             dy23             = _fjsp_sub_v2r8(iy2,jy3);
+             dz23             = _fjsp_sub_v2r8(iz2,jz3);
+             dx31             = _fjsp_sub_v2r8(ix3,jx1);
+             dy31             = _fjsp_sub_v2r8(iy3,jy1);
+             dz31             = _fjsp_sub_v2r8(iz3,jz1);
+             dx32             = _fjsp_sub_v2r8(ix3,jx2);
+             dy32             = _fjsp_sub_v2r8(iy3,jy2);
+             dz32             = _fjsp_sub_v2r8(iz3,jz2);
+             dx33             = _fjsp_sub_v2r8(ix3,jx3);
+             dy33             = _fjsp_sub_v2r8(iy3,jy3);
+             dz33             = _fjsp_sub_v2r8(iz3,jz3);
+             /* Calculate squared distance and things based on it */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+             rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+             rsq13            = gmx_fjsp_calc_rsq_v2r8(dx13,dy13,dz13);
+             rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+             rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+             rsq23            = gmx_fjsp_calc_rsq_v2r8(dx23,dy23,dz23);
+             rsq31            = gmx_fjsp_calc_rsq_v2r8(dx31,dy31,dz31);
+             rsq32            = gmx_fjsp_calc_rsq_v2r8(dx32,dy32,dz32);
+             rsq33            = gmx_fjsp_calc_rsq_v2r8(dx33,dy33,dz33);
+             rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+             rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+             rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+             rinv13           = gmx_fjsp_invsqrt_v2r8(rsq13);
+             rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+             rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+             rinv23           = gmx_fjsp_invsqrt_v2r8(rsq23);
+             rinv31           = gmx_fjsp_invsqrt_v2r8(rsq31);
+             rinv32           = gmx_fjsp_invsqrt_v2r8(rsq32);
+             rinv33           = gmx_fjsp_invsqrt_v2r8(rsq33);
+             rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+             rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+             rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+             rinvsq13         = _fjsp_mul_v2r8(rinv13,rinv13);
+             rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+             rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+             rinvsq23         = _fjsp_mul_v2r8(rinv23,rinv23);
+             rinvsq31         = _fjsp_mul_v2r8(rinv31,rinv31);
+             rinvsq32         = _fjsp_mul_v2r8(rinv32,rinv32);
+             rinvsq33         = _fjsp_mul_v2r8(rinv33,rinv33);
+             fjx0             = _fjsp_setzero_v2r8();
+             fjy0             = _fjsp_setzero_v2r8();
+             fjz0             = _fjsp_setzero_v2r8();
+             fjx1             = _fjsp_setzero_v2r8();
+             fjy1             = _fjsp_setzero_v2r8();
+             fjz1             = _fjsp_setzero_v2r8();
+             fjx2             = _fjsp_setzero_v2r8();
+             fjy2             = _fjsp_setzero_v2r8();
+             fjz2             = _fjsp_setzero_v2r8();
+             fjx3             = _fjsp_setzero_v2r8();
+             fjy3             = _fjsp_setzero_v2r8();
+             fjz3             = _fjsp_setzero_v2r8();
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+             {
+             r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+             /* LENNARD-JONES DISPERSION/REPULSION */
+             rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+             vvdw6            = _fjsp_mul_v2r8(c6_00,rinvsix);
+             vvdw12           = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
+             vvdw             = _fjsp_msub_v2r8( vvdw12,one_twelfth, _fjsp_mul_v2r8(vvdw6,one_sixth) );
+             fvdw             = _fjsp_mul_v2r8(_fjsp_sub_v2r8(vvdw12,vvdw6),rinvsq00);
+             d                = _fjsp_sub_v2r8(r00,rswitch);
+             d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+             d2               = _fjsp_mul_v2r8(d,d);
+             sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+             dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+             /* Evaluate switch function */
+             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+             fvdw             = _fjsp_msub_v2r8( fvdw,sw , _fjsp_mul_v2r8(rinv00,_fjsp_mul_v2r8(vvdw,dsw)) );
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+             fscal            = fvdw;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             
+             fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq11,rcutoff2))
+             {
+             /* REACTION-FIELD ELECTROSTATICS */
+             felec            = _fjsp_mul_v2r8(qq11,_fjsp_msub_v2r8(rinv11,rinvsq11,krf2));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq11,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+             
+             fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq12,rcutoff2))
+             {
+             /* REACTION-FIELD ELECTROSTATICS */
+             felec            = _fjsp_mul_v2r8(qq12,_fjsp_msub_v2r8(rinv12,rinvsq12,krf2));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq12,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+             
+             fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq13,rcutoff2))
+             {
+             /* REACTION-FIELD ELECTROSTATICS */
+             felec            = _fjsp_mul_v2r8(qq13,_fjsp_msub_v2r8(rinv13,rinvsq13,krf2));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq13,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx13,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy13,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz13,fscal,fiz1);
+             
+             fjx3             = _fjsp_madd_v2r8(dx13,fscal,fjx3);
+             fjy3             = _fjsp_madd_v2r8(dy13,fscal,fjy3);
+             fjz3             = _fjsp_madd_v2r8(dz13,fscal,fjz3);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq21,rcutoff2))
+             {
+             /* REACTION-FIELD ELECTROSTATICS */
+             felec            = _fjsp_mul_v2r8(qq21,_fjsp_msub_v2r8(rinv21,rinvsq21,krf2));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq21,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+             
+             fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq22,rcutoff2))
+             {
+             /* REACTION-FIELD ELECTROSTATICS */
+             felec            = _fjsp_mul_v2r8(qq22,_fjsp_msub_v2r8(rinv22,rinvsq22,krf2));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq22,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+             
+             fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq23,rcutoff2))
+             {
+             /* REACTION-FIELD ELECTROSTATICS */
+             felec            = _fjsp_mul_v2r8(qq23,_fjsp_msub_v2r8(rinv23,rinvsq23,krf2));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq23,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx23,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy23,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz23,fscal,fiz2);
+             
+             fjx3             = _fjsp_madd_v2r8(dx23,fscal,fjx3);
+             fjy3             = _fjsp_madd_v2r8(dy23,fscal,fjy3);
+             fjz3             = _fjsp_madd_v2r8(dz23,fscal,fjz3);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq31,rcutoff2))
+             {
+             /* REACTION-FIELD ELECTROSTATICS */
+             felec            = _fjsp_mul_v2r8(qq31,_fjsp_msub_v2r8(rinv31,rinvsq31,krf2));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq31,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix3             = _fjsp_madd_v2r8(dx31,fscal,fix3);
+             fiy3             = _fjsp_madd_v2r8(dy31,fscal,fiy3);
+             fiz3             = _fjsp_madd_v2r8(dz31,fscal,fiz3);
+             
+             fjx1             = _fjsp_madd_v2r8(dx31,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy31,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz31,fscal,fjz1);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq32,rcutoff2))
+             {
+             /* REACTION-FIELD ELECTROSTATICS */
+             felec            = _fjsp_mul_v2r8(qq32,_fjsp_msub_v2r8(rinv32,rinvsq32,krf2));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq32,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix3             = _fjsp_madd_v2r8(dx32,fscal,fix3);
+             fiy3             = _fjsp_madd_v2r8(dy32,fscal,fiy3);
+             fiz3             = _fjsp_madd_v2r8(dz32,fscal,fiz3);
+             
+             fjx2             = _fjsp_madd_v2r8(dx32,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy32,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz32,fscal,fjz2);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq33,rcutoff2))
+             {
+             /* REACTION-FIELD ELECTROSTATICS */
+             felec            = _fjsp_mul_v2r8(qq33,_fjsp_msub_v2r8(rinv33,rinvsq33,krf2));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq33,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix3             = _fjsp_madd_v2r8(dx33,fscal,fix3);
+             fiy3             = _fjsp_madd_v2r8(dy33,fscal,fiy3);
+             fiz3             = _fjsp_madd_v2r8(dz33,fscal,fiz3);
+             
+             fjx3             = _fjsp_madd_v2r8(dx33,fscal,fjx3);
+             fjy3             = _fjsp_madd_v2r8(dy33,fscal,fjy3);
+             fjz3             = _fjsp_madd_v2r8(dz33,fscal,fjz3);
+             }
+             gmx_fjsp_decrement_4rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+             /* Inner loop uses 359 flops */
+         }
+         if(jidx<j_index_end)
+         {
+             jnrA             = jjnr[jidx];
+             j_coord_offsetA  = DIM*jnrA;
+             /* load j atom coordinates */
+             gmx_fjsp_load_4rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
+                                               &jy2,&jz2,&jx3,&jy3,&jz3);
+             /* Calculate displacement vector */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             dx11             = _fjsp_sub_v2r8(ix1,jx1);
+             dy11             = _fjsp_sub_v2r8(iy1,jy1);
+             dz11             = _fjsp_sub_v2r8(iz1,jz1);
+             dx12             = _fjsp_sub_v2r8(ix1,jx2);
+             dy12             = _fjsp_sub_v2r8(iy1,jy2);
+             dz12             = _fjsp_sub_v2r8(iz1,jz2);
+             dx13             = _fjsp_sub_v2r8(ix1,jx3);
+             dy13             = _fjsp_sub_v2r8(iy1,jy3);
+             dz13             = _fjsp_sub_v2r8(iz1,jz3);
+             dx21             = _fjsp_sub_v2r8(ix2,jx1);
+             dy21             = _fjsp_sub_v2r8(iy2,jy1);
+             dz21             = _fjsp_sub_v2r8(iz2,jz1);
+             dx22             = _fjsp_sub_v2r8(ix2,jx2);
+             dy22             = _fjsp_sub_v2r8(iy2,jy2);
+             dz22             = _fjsp_sub_v2r8(iz2,jz2);
+             dx23             = _fjsp_sub_v2r8(ix2,jx3);
+             dy23             = _fjsp_sub_v2r8(iy2,jy3);
+             dz23             = _fjsp_sub_v2r8(iz2,jz3);
+             dx31             = _fjsp_sub_v2r8(ix3,jx1);
+             dy31             = _fjsp_sub_v2r8(iy3,jy1);
+             dz31             = _fjsp_sub_v2r8(iz3,jz1);
+             dx32             = _fjsp_sub_v2r8(ix3,jx2);
+             dy32             = _fjsp_sub_v2r8(iy3,jy2);
+             dz32             = _fjsp_sub_v2r8(iz3,jz2);
+             dx33             = _fjsp_sub_v2r8(ix3,jx3);
+             dy33             = _fjsp_sub_v2r8(iy3,jy3);
+             dz33             = _fjsp_sub_v2r8(iz3,jz3);
+             /* Calculate squared distance and things based on it */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+             rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+             rsq13            = gmx_fjsp_calc_rsq_v2r8(dx13,dy13,dz13);
+             rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+             rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+             rsq23            = gmx_fjsp_calc_rsq_v2r8(dx23,dy23,dz23);
+             rsq31            = gmx_fjsp_calc_rsq_v2r8(dx31,dy31,dz31);
+             rsq32            = gmx_fjsp_calc_rsq_v2r8(dx32,dy32,dz32);
+             rsq33            = gmx_fjsp_calc_rsq_v2r8(dx33,dy33,dz33);
+             rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+             rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+             rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+             rinv13           = gmx_fjsp_invsqrt_v2r8(rsq13);
+             rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+             rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+             rinv23           = gmx_fjsp_invsqrt_v2r8(rsq23);
+             rinv31           = gmx_fjsp_invsqrt_v2r8(rsq31);
+             rinv32           = gmx_fjsp_invsqrt_v2r8(rsq32);
+             rinv33           = gmx_fjsp_invsqrt_v2r8(rsq33);
+             rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+             rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+             rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+             rinvsq13         = _fjsp_mul_v2r8(rinv13,rinv13);
+             rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+             rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+             rinvsq23         = _fjsp_mul_v2r8(rinv23,rinv23);
+             rinvsq31         = _fjsp_mul_v2r8(rinv31,rinv31);
+             rinvsq32         = _fjsp_mul_v2r8(rinv32,rinv32);
+             rinvsq33         = _fjsp_mul_v2r8(rinv33,rinv33);
+             fjx0             = _fjsp_setzero_v2r8();
+             fjy0             = _fjsp_setzero_v2r8();
+             fjz0             = _fjsp_setzero_v2r8();
+             fjx1             = _fjsp_setzero_v2r8();
+             fjy1             = _fjsp_setzero_v2r8();
+             fjz1             = _fjsp_setzero_v2r8();
+             fjx2             = _fjsp_setzero_v2r8();
+             fjy2             = _fjsp_setzero_v2r8();
+             fjz2             = _fjsp_setzero_v2r8();
+             fjx3             = _fjsp_setzero_v2r8();
+             fjy3             = _fjsp_setzero_v2r8();
+             fjz3             = _fjsp_setzero_v2r8();
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+             {
+             r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+             /* LENNARD-JONES DISPERSION/REPULSION */
+             rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+             vvdw6            = _fjsp_mul_v2r8(c6_00,rinvsix);
+             vvdw12           = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
+             vvdw             = _fjsp_msub_v2r8( vvdw12,one_twelfth, _fjsp_mul_v2r8(vvdw6,one_sixth) );
+             fvdw             = _fjsp_mul_v2r8(_fjsp_sub_v2r8(vvdw12,vvdw6),rinvsq00);
+             d                = _fjsp_sub_v2r8(r00,rswitch);
+             d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+             d2               = _fjsp_mul_v2r8(d,d);
+             sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+             dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+             /* Evaluate switch function */
+             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+             fvdw             = _fjsp_msub_v2r8( fvdw,sw , _fjsp_mul_v2r8(rinv00,_fjsp_mul_v2r8(vvdw,dsw)) );
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+             fscal            = fvdw;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             
+             fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq11,rcutoff2))
+             {
+             /* REACTION-FIELD ELECTROSTATICS */
+             felec            = _fjsp_mul_v2r8(qq11,_fjsp_msub_v2r8(rinv11,rinvsq11,krf2));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq11,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+             
+             fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq12,rcutoff2))
+             {
+             /* REACTION-FIELD ELECTROSTATICS */
+             felec            = _fjsp_mul_v2r8(qq12,_fjsp_msub_v2r8(rinv12,rinvsq12,krf2));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq12,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+             
+             fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq13,rcutoff2))
+             {
+             /* REACTION-FIELD ELECTROSTATICS */
+             felec            = _fjsp_mul_v2r8(qq13,_fjsp_msub_v2r8(rinv13,rinvsq13,krf2));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq13,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx13,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy13,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz13,fscal,fiz1);
+             
+             fjx3             = _fjsp_madd_v2r8(dx13,fscal,fjx3);
+             fjy3             = _fjsp_madd_v2r8(dy13,fscal,fjy3);
+             fjz3             = _fjsp_madd_v2r8(dz13,fscal,fjz3);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq21,rcutoff2))
+             {
+             /* REACTION-FIELD ELECTROSTATICS */
+             felec            = _fjsp_mul_v2r8(qq21,_fjsp_msub_v2r8(rinv21,rinvsq21,krf2));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq21,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+             
+             fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq22,rcutoff2))
+             {
+             /* REACTION-FIELD ELECTROSTATICS */
+             felec            = _fjsp_mul_v2r8(qq22,_fjsp_msub_v2r8(rinv22,rinvsq22,krf2));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq22,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+             
+             fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq23,rcutoff2))
+             {
+             /* REACTION-FIELD ELECTROSTATICS */
+             felec            = _fjsp_mul_v2r8(qq23,_fjsp_msub_v2r8(rinv23,rinvsq23,krf2));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq23,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx23,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy23,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz23,fscal,fiz2);
+             
+             fjx3             = _fjsp_madd_v2r8(dx23,fscal,fjx3);
+             fjy3             = _fjsp_madd_v2r8(dy23,fscal,fjy3);
+             fjz3             = _fjsp_madd_v2r8(dz23,fscal,fjz3);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq31,rcutoff2))
+             {
+             /* REACTION-FIELD ELECTROSTATICS */
+             felec            = _fjsp_mul_v2r8(qq31,_fjsp_msub_v2r8(rinv31,rinvsq31,krf2));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq31,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix3             = _fjsp_madd_v2r8(dx31,fscal,fix3);
+             fiy3             = _fjsp_madd_v2r8(dy31,fscal,fiy3);
+             fiz3             = _fjsp_madd_v2r8(dz31,fscal,fiz3);
+             
+             fjx1             = _fjsp_madd_v2r8(dx31,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy31,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz31,fscal,fjz1);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq32,rcutoff2))
+             {
+             /* REACTION-FIELD ELECTROSTATICS */
+             felec            = _fjsp_mul_v2r8(qq32,_fjsp_msub_v2r8(rinv32,rinvsq32,krf2));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq32,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix3             = _fjsp_madd_v2r8(dx32,fscal,fix3);
+             fiy3             = _fjsp_madd_v2r8(dy32,fscal,fiy3);
+             fiz3             = _fjsp_madd_v2r8(dz32,fscal,fiz3);
+             
+             fjx2             = _fjsp_madd_v2r8(dx32,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy32,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz32,fscal,fjz2);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq33,rcutoff2))
+             {
+             /* REACTION-FIELD ELECTROSTATICS */
+             felec            = _fjsp_mul_v2r8(qq33,_fjsp_msub_v2r8(rinv33,rinvsq33,krf2));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq33,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix3             = _fjsp_madd_v2r8(dx33,fscal,fix3);
+             fiy3             = _fjsp_madd_v2r8(dy33,fscal,fiy3);
+             fiz3             = _fjsp_madd_v2r8(dz33,fscal,fiz3);
+             
+             fjx3             = _fjsp_madd_v2r8(dx33,fscal,fjx3);
+             fjy3             = _fjsp_madd_v2r8(dy33,fscal,fjy3);
+             fjz3             = _fjsp_madd_v2r8(dz33,fscal,fjz3);
+             }
+             gmx_fjsp_decrement_4rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+             /* Inner loop uses 359 flops */
+         }
+         /* End of innermost loop */
+         gmx_fjsp_update_iforce_4atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                               f+i_coord_offset,fshift+i_shift_offset);
+         /* Increment number of inner iterations */
+         inneriter                  += j_index_end - j_index_start;
+         /* Outer loop uses 24 flops */
+     }
+     /* Increment number of outer iterations */
+     outeriter        += nri;
+     /* Update outer/inner flops */
+     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_F,outeriter*24 + inneriter*359);
+ }
index 0000000000000000000000000000000000000000,da6aa2018f807d8e756a2d9fbf4e0c60b431fec7..da6aa2018f807d8e756a2d9fbf4e0c60b431fec7
mode 000000,100644..100644
--- /dev/null
@@@ -1,0 -1,534 +1,534 @@@
+ /*
+  * This file is part of the GROMACS molecular simulation package.
+  *
+  * Copyright (c) 2012, by the GROMACS development team, led by
+  * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+  * others, as listed in the AUTHORS file in the top-level source
+  * directory and at http://www.gromacs.org.
+  *
+  * GROMACS is free software; you can redistribute it and/or
+  * modify it under the terms of the GNU Lesser General Public License
+  * as published by the Free Software Foundation; either version 2.1
+  * of the License, or (at your option) any later version.
+  *
+  * GROMACS is distributed in the hope that it will be useful,
+  * but WITHOUT ANY WARRANTY; without even the implied warranty of
+  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  * Lesser General Public License for more details.
+  *
+  * You should have received a copy of the GNU Lesser General Public
+  * License along with GROMACS; if not, see
+  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+  *
+  * If you want to redistribute modifications to GROMACS, please
+  * consider that scientific software is very special. Version
+  * control is crucial - bugs must be traceable. We will be happy to
+  * consider code for inclusion in the official distribution, but
+  * derived work must not be called official GROMACS. Details are found
+  * in the README & COPYING files - if they are missing, get the
+  * official version at http://www.gromacs.org.
+  *
+  * To help us fund GROMACS development, we humbly ask that you cite
+  * the research papers on the package. Check out http://www.gromacs.org.
+  */
+ /*
+  * Note: this file was generated by the GROMACS sparc64_hpc_ace_double kernel generator.
+  */
+ #ifdef HAVE_CONFIG_H
+ #include <config.h>
+ #endif
+ #include <math.h>
+ #include "../nb_kernel.h"
+ #include "types/simple.h"
+ #include "vec.h"
+ #include "nrnb.h"
+ #include "kernelutil_sparc64_hpc_ace_double.h"
+ /*
+  * Gromacs nonbonded kernel:   nb_kernel_ElecRFCut_VdwNone_GeomP1P1_VF_sparc64_hpc_ace_double
+  * Electrostatics interaction: ReactionField
+  * VdW interaction:            None
+  * Geometry:                   Particle-Particle
+  * Calculate force/pot:        PotentialAndForce
+  *
+  * For each of the nlist->nri i entries, this kernel walks the j neighbors
+  * jjnr[jindex[iidx]] .. jjnr[jindex[iidx+1]-1] two at a time (SIMD suffixes A and B);
+  * an odd leftover j atom is handled by a separate single-atom epilogue.
+  * Contributions from pairs with rsq >= rcoulomb^2 are masked out.
+  *
+  * Parameters, as used by the code below:
+  *   nlist       - neighbor list; nri, iinr, jindex, jjnr, shift and gid are read.
+  *   xx          - coordinates, accessed as a flat real array (DIM reals per atom).
+  *   ff          - forces, accessed as a flat real array; i and j forces are updated here.
+  *   fr          - force record; shift_vec, fshift, epsfac, rcoulomb, ic->k_rf and
+  *                 ic->c_rf are read, and fshift is updated.
+  *   mdatoms     - chargeA supplies the per-atom charges.
+  *   kernel_data - energygrp_elec[gid[iidx]] receives the accumulated potential.
+  *   nrnb        - flop counter, incremented once at the end via inc_nrnb().
+  *
+  * NOTE(review): the _fjsp_* intrinsics and gmx_fjsp_* helpers are defined outside this
+  * file; comments describing what they compute are inferred from their names and usage
+  * and should be confirmed against kernelutil_sparc64_hpc_ace_double.h.
+  */
+ void
+ nb_kernel_ElecRFCut_VdwNone_GeomP1P1_VF_sparc64_hpc_ace_double
+                     (t_nblist * gmx_restrict                nlist,
+                      rvec * gmx_restrict                    xx,
+                      rvec * gmx_restrict                    ff,
+                      t_forcerec * gmx_restrict              fr,
+                      t_mdatoms * gmx_restrict               mdatoms,
+                      nb_kernel_data_t * gmx_restrict        kernel_data,
+                      t_nrnb * gmx_restrict                  nrnb)
+ {
+     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+      * just 0 for non-waters.
+      * Suffixes A,B refer to j loop unrolling done with double precision SIMD, i.e. the two different
+      * jnr indices whose data is put in the two positions of the SIMD register.
+      *
+      * Several of the locals declared below (e.g. tx, ty, tz, jidxall, vdwioffset0, isai0, isaj0,
+      * fjx0/fjy0/fjz0, r00, c6_00, c12_00, itab_tmp, dummy_mask, one, two, vfconv, gbconv, ewconv)
+      * are never referenced in this kernel.
+      */
+     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+     int              jnrA,jnrB;
+     int              j_coord_offsetA,j_coord_offsetB;
+     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+     real             rcutoff_scalar;
+     real             *shiftvec,*fshift,*x,*f;
+     _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+     int              vdwioffset0;
+     _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+     int              vdwjidx0A,vdwjidx0B;
+     _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+     _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+     _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+     real             *charge;
+     _fjsp_v2r8       itab_tmp;
+     _fjsp_v2r8       dummy_mask,cutoff_mask;
+     _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+     _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+     union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+     /* View the rvec arrays as flat arrays of reals; atoms are addressed with DIM*index offsets */
+     x                = xx[0];
+     f                = ff[0];
+     /* Cache the neighbor list arrays in locals */
+     nri              = nlist->nri;
+     iinr             = nlist->iinr;
+     jindex           = nlist->jindex;
+     jjnr             = nlist->jjnr;
+     shiftidx         = nlist->shift;
+     gid              = nlist->gid;
+     shiftvec         = fr->shift_vec[0];
+     fshift           = fr->fshift[0];
+     /* Electrostatics prefactor and reaction-field constants, replicated into SIMD registers */
+     facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+     charge           = mdatoms->chargeA;
+     krf              = gmx_fjsp_set1_v2r8(fr->ic->k_rf);
+     krf2             = gmx_fjsp_set1_v2r8(fr->ic->k_rf*2.0);
+     crf              = gmx_fjsp_set1_v2r8(fr->ic->c_rf);
+     /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+     rcutoff_scalar   = fr->rcoulomb;
+     rcutoff          = gmx_fjsp_set1_v2r8(rcutoff_scalar);
+     /* Squared cutoff, so that it can be compared directly against rsq */
+     rcutoff2         = _fjsp_mul_v2r8(rcutoff,rcutoff);
+     /* Zero-initialize the j indices and offsets; this only serves to silence compiler warnings */
+     jnrA = jnrB = 0;
+     j_coord_offsetA = 0;
+     j_coord_offsetB = 0;
+     outeriter        = 0;
+     inneriter        = 0;
+     /* Start outer loop over neighborlists */
+     for(iidx=0; iidx<nri; iidx++)
+     {
+         /* Load shift vector for this list */
+         i_shift_offset   = DIM*shiftidx[iidx];
+         /* Load limits for loop over neighbors */
+         j_index_start    = jindex[iidx];
+         j_index_end      = jindex[iidx+1];
+         /* Get outer coordinate index */
+         inr              = iinr[iidx];
+         i_coord_offset   = DIM*inr;
+         /* Load i particle coords and add shift vector */
+         gmx_fjsp_load_shift_and_1rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
+         /* Reset the i force accumulators for this list entry */
+         fix0             = _fjsp_setzero_v2r8();
+         fiy0             = _fjsp_setzero_v2r8();
+         fiz0             = _fjsp_setzero_v2r8();
+         /* Load parameters for i particles: the i charge pre-multiplied by facel */
+         iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_load1_v2r8(charge+inr+0));
+         /* Reset potential sums */
+         velecsum         = _fjsp_setzero_v2r8();
+         /* Start inner kernel loop: two j atoms (A and B) per iteration; a leftover atom is handled below */
+         for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+         {
+             /* Get j neighbor index, and coordinate index */
+             jnrA             = jjnr[jidx];
+             jnrB             = jjnr[jidx+1];
+             j_coord_offsetA  = DIM*jnrA;
+             j_coord_offsetB  = DIM*jnrB;
+             /* load j atom coordinates */
+             gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                               &jx0,&jy0,&jz0);
+             /* Calculate displacement vector */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             /* Calculate squared distance and things based on it */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+             rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+             /* Load parameters for j particles */
+             jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* Skip the whole pair block unless (judging by the helper name) at least one lane is inside the cutoff */
+             if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+             {
+             /* Compute parameters for interactions between i and j atoms */
+             qq00             = _fjsp_mul_v2r8(iq0,jq0);
+             /* REACTION-FIELD ELECTROSTATICS */
+             /* Presumably velec = qq*(krf*rsq + rinv - crf) and felec = qq*(rinv*rinvsq - 2*krf),
+              * assuming madd(a,b,c)=a*b+c and msub(a,b,c)=a*b-c - TODO confirm against the intrinsics reference */
+             velec            = _fjsp_mul_v2r8(qq00,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq00,rinv00),crf));
+             felec            = _fjsp_mul_v2r8(qq00,_fjsp_msub_v2r8(rinv00,rinvsq00,krf2));
+             /* Per-lane mask from the comparison rsq < rcutoff^2; it is ANDed into velec and fscal below */
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             
+             /* Apply the opposite force to the two j atoms directly in the force array */
+             gmx_fjsp_decrement_fma_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fscal,dx00,dy00,dz00);
+             }
+             /* Inner loop uses 39 flops */
+         }
+         /* Epilogue: one j atom is left over when the number of neighbors is odd */
+         if(jidx<j_index_end)
+         {
+             jnrA             = jjnr[jidx];
+             j_coord_offsetA  = DIM*jnrA;
+             /* load j atom coordinates */
+             gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                               &jx0,&jy0,&jz0);
+             /* Calculate displacement vector */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             /* Calculate squared distance and things based on it */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+             rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+             /* Load parameters for j particles (single charge; the other lane presumably stays zero - TODO confirm) */
+             jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+             {
+             /* Compute parameters for interactions between i and j atoms */
+             qq00             = _fjsp_mul_v2r8(iq0,jq0);
+             /* REACTION-FIELD ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq00,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq00,rinv00),crf));
+             felec            = _fjsp_mul_v2r8(qq00,_fjsp_msub_v2r8(rinv00,rinvsq00,krf2));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             /* Presumably keeps only the lane holding the real j atom and zeroes the unused one - TODO confirm */
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Same lane clearing as for velec above, applied to the scalar force */
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             
+             /* Apply the opposite force to the single j atom */
+             gmx_fjsp_decrement_fma_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fscal,dx00,dy00,dz00);
+             }
+             /* Inner loop uses 39 flops */
+         }
+         /* End of innermost loop */
+         /* Write the accumulated i force into the force array and the shift-force array */
+         gmx_fjsp_update_iforce_1atom_swizzle_v2r8(fix0,fiy0,fiz0,
+                                               f+i_coord_offset,fshift+i_shift_offset);
+         /* Energy group index of this list entry; selects the energygrp_elec slot below */
+         ggid                        = gid[iidx];
+         /* Update potential energies */
+         gmx_fjsp_update_1pot_v2r8(velecsum,kernel_data->energygrp_elec+ggid);
+         /* Increment number of inner iterations (counted per j atom, not per unrolled loop pass) */
+         inneriter                  += j_index_end - j_index_start;
+         /* Outer loop uses 8 flops */
+     }
+     /* Increment number of outer iterations */
+     outeriter        += nri;
+     /* Update outer/inner flops */
+     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VF,outeriter*8 + inneriter*39);
+ }
+ /*
+  * Gromacs nonbonded kernel:   nb_kernel_ElecRFCut_VdwNone_GeomP1P1_F_sparc64_hpc_ace_double
+  * Electrostatics interaction: ReactionField
+  * VdW interaction:            None
+  * Geometry:                   Particle-Particle
+  * Calculate force/pot:        Force
+  *
+  * Force-only kernel: no potential energy is accumulated or written.
+  * For each of the nlist->nri i entries, the j neighbors
+  * jjnr[jindex[iidx]] .. jjnr[jindex[iidx+1]-1] are processed two at a time (SIMD suffixes
+  * A and B); an odd leftover j atom is handled by a separate single-atom epilogue.
+  * Contributions from pairs with rsq >= rcoulomb^2 are masked out.
+  *
+  * Parameters, as used by the code below:
+  *   nlist       - neighbor list; nri, iinr, jindex, jjnr, shift and gid are read.
+  *   xx          - coordinates, accessed as a flat real array (DIM reals per atom).
+  *   ff          - forces, accessed as a flat real array; i and j forces are updated here.
+  *   fr          - force record; shift_vec, fshift, epsfac, rcoulomb, ic->k_rf and
+  *                 ic->c_rf are read, and fshift is updated.
+  *   mdatoms     - chargeA supplies the per-atom charges.
+  *   kernel_data - not referenced in this force-only kernel.
+  *   nrnb        - flop counter, incremented once at the end via inc_nrnb().
+  *
+  * NOTE(review): the _fjsp_* intrinsics and gmx_fjsp_* helpers are defined outside this
+  * file; comments describing what they compute are inferred from their names and usage
+  * and should be confirmed against kernelutil_sparc64_hpc_ace_double.h.
+  */
+ void
+ nb_kernel_ElecRFCut_VdwNone_GeomP1P1_F_sparc64_hpc_ace_double
+                     (t_nblist * gmx_restrict                nlist,
+                      rvec * gmx_restrict                    xx,
+                      rvec * gmx_restrict                    ff,
+                      t_forcerec * gmx_restrict              fr,
+                      t_mdatoms * gmx_restrict               mdatoms,
+                      nb_kernel_data_t * gmx_restrict        kernel_data,
+                      t_nrnb * gmx_restrict                  nrnb)
+ {
+     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+      * just 0 for non-waters.
+      * Suffixes A,B refer to j loop unrolling done with double precision SIMD, i.e. the two different
+      * jnr indices whose data is put in the two positions of the SIMD register.
+      *
+      * Several of the locals declared below (e.g. ggid, velec, velecsum, tx, ty, tz, jidxall,
+      * vdwioffset0, isai0, isaj0, fjx0/fjy0/fjz0, r00, c6_00, c12_00, itab_tmp, dummy_mask, one, two,
+      * vfconv, gbconv, ewconv) are never referenced in this kernel.
+      */
+     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+     int              jnrA,jnrB;
+     int              j_coord_offsetA,j_coord_offsetB;
+     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+     real             rcutoff_scalar;
+     real             *shiftvec,*fshift,*x,*f;
+     _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+     int              vdwioffset0;
+     _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+     int              vdwjidx0A,vdwjidx0B;
+     _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+     _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+     _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+     real             *charge;
+     _fjsp_v2r8       itab_tmp;
+     _fjsp_v2r8       dummy_mask,cutoff_mask;
+     _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+     _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+     union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+     /* View the rvec arrays as flat arrays of reals; atoms are addressed with DIM*index offsets */
+     x                = xx[0];
+     f                = ff[0];
+     /* Cache the neighbor list arrays in locals (gid is stored but not read in this kernel) */
+     nri              = nlist->nri;
+     iinr             = nlist->iinr;
+     jindex           = nlist->jindex;
+     jjnr             = nlist->jjnr;
+     shiftidx         = nlist->shift;
+     gid              = nlist->gid;
+     shiftvec         = fr->shift_vec[0];
+     fshift           = fr->fshift[0];
+     /* Electrostatics prefactor and reaction-field constants, replicated into SIMD registers.
+      * Only krf2 is read by the force expression below; krf and crf are assigned but not used here. */
+     facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+     charge           = mdatoms->chargeA;
+     krf              = gmx_fjsp_set1_v2r8(fr->ic->k_rf);
+     krf2             = gmx_fjsp_set1_v2r8(fr->ic->k_rf*2.0);
+     crf              = gmx_fjsp_set1_v2r8(fr->ic->c_rf);
+     /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+     rcutoff_scalar   = fr->rcoulomb;
+     rcutoff          = gmx_fjsp_set1_v2r8(rcutoff_scalar);
+     /* Squared cutoff, so that it can be compared directly against rsq */
+     rcutoff2         = _fjsp_mul_v2r8(rcutoff,rcutoff);
+     /* Zero-initialize the j indices and offsets; this only serves to silence compiler warnings */
+     jnrA = jnrB = 0;
+     j_coord_offsetA = 0;
+     j_coord_offsetB = 0;
+     outeriter        = 0;
+     inneriter        = 0;
+     /* Start outer loop over neighborlists */
+     for(iidx=0; iidx<nri; iidx++)
+     {
+         /* Load shift vector for this list */
+         i_shift_offset   = DIM*shiftidx[iidx];
+         /* Load limits for loop over neighbors */
+         j_index_start    = jindex[iidx];
+         j_index_end      = jindex[iidx+1];
+         /* Get outer coordinate index */
+         inr              = iinr[iidx];
+         i_coord_offset   = DIM*inr;
+         /* Load i particle coords and add shift vector */
+         gmx_fjsp_load_shift_and_1rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
+         /* Reset the i force accumulators for this list entry */
+         fix0             = _fjsp_setzero_v2r8();
+         fiy0             = _fjsp_setzero_v2r8();
+         fiz0             = _fjsp_setzero_v2r8();
+         /* Load parameters for i particles: the i charge pre-multiplied by facel */
+         iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_load1_v2r8(charge+inr+0));
+         /* Start inner kernel loop: two j atoms (A and B) per iteration; a leftover atom is handled below */
+         for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+         {
+             /* Get j neighbor index, and coordinate index */
+             jnrA             = jjnr[jidx];
+             jnrB             = jjnr[jidx+1];
+             j_coord_offsetA  = DIM*jnrA;
+             j_coord_offsetB  = DIM*jnrB;
+             /* load j atom coordinates */
+             gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                               &jx0,&jy0,&jz0);
+             /* Calculate displacement vector */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             /* Calculate squared distance and things based on it */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+             rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+             /* Load parameters for j particles */
+             jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* Skip the whole pair block unless (judging by the helper name) at least one lane is inside the cutoff */
+             if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+             {
+             /* Compute parameters for interactions between i and j atoms */
+             qq00             = _fjsp_mul_v2r8(iq0,jq0);
+             /* REACTION-FIELD ELECTROSTATICS */
+             /* Presumably felec = qq*(rinv*rinvsq - 2*krf), assuming msub(a,b,c)=a*b-c - TODO confirm */
+             felec            = _fjsp_mul_v2r8(qq00,_fjsp_msub_v2r8(rinv00,rinvsq00,krf2));
+             /* Per-lane mask from the comparison rsq < rcutoff^2; it is ANDed into fscal below */
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             
+             /* Apply the opposite force to the two j atoms directly in the force array */
+             gmx_fjsp_decrement_fma_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fscal,dx00,dy00,dz00);
+             }
+             /* Inner loop uses 33 flops */
+         }
+         /* Epilogue: one j atom is left over when the number of neighbors is odd */
+         if(jidx<j_index_end)
+         {
+             jnrA             = jjnr[jidx];
+             j_coord_offsetA  = DIM*jnrA;
+             /* load j atom coordinates */
+             gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                               &jx0,&jy0,&jz0);
+             /* Calculate displacement vector */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             /* Calculate squared distance and things based on it */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+             rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+             /* Load parameters for j particles (single charge; the other lane presumably stays zero - TODO confirm) */
+             jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+             {
+             /* Compute parameters for interactions between i and j atoms */
+             qq00             = _fjsp_mul_v2r8(iq0,jq0);
+             /* REACTION-FIELD ELECTROSTATICS */
+             felec            = _fjsp_mul_v2r8(qq00,_fjsp_msub_v2r8(rinv00,rinvsq00,krf2));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Presumably keeps only the lane holding the real j atom and zeroes the unused one - TODO confirm */
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             
+             /* Apply the opposite force to the single j atom */
+             gmx_fjsp_decrement_fma_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fscal,dx00,dy00,dz00);
+             }
+             /* Inner loop uses 33 flops */
+         }
+         /* End of innermost loop */
+         /* Write the accumulated i force into the force array and the shift-force array */
+         gmx_fjsp_update_iforce_1atom_swizzle_v2r8(fix0,fiy0,fiz0,
+                                               f+i_coord_offset,fshift+i_shift_offset);
+         /* Increment number of inner iterations (counted per j atom, not per unrolled loop pass) */
+         inneriter                  += j_index_end - j_index_start;
+         /* Outer loop uses 7 flops */
+     }
+     /* Increment number of outer iterations */
+     outeriter        += nri;
+     /* Update outer/inner flops */
+     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_F,outeriter*7 + inneriter*33);
+ }
index 0000000000000000000000000000000000000000,8556bfe29ef586050423b050c2e2c641cb98093d..8556bfe29ef586050423b050c2e2c641cb98093d
mode 000000,100644..100644
--- /dev/null
@@@ -1,0 -1,916 +1,916 @@@
+ /*
+  * This file is part of the GROMACS molecular simulation package.
+  *
+  * Copyright (c) 2012, by the GROMACS development team, led by
+  * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+  * others, as listed in the AUTHORS file in the top-level source
+  * directory and at http://www.gromacs.org.
+  *
+  * GROMACS is free software; you can redistribute it and/or
+  * modify it under the terms of the GNU Lesser General Public License
+  * as published by the Free Software Foundation; either version 2.1
+  * of the License, or (at your option) any later version.
+  *
+  * GROMACS is distributed in the hope that it will be useful,
+  * but WITHOUT ANY WARRANTY; without even the implied warranty of
+  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  * Lesser General Public License for more details.
+  *
+  * You should have received a copy of the GNU Lesser General Public
+  * License along with GROMACS; if not, see
+  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+  *
+  * If you want to redistribute modifications to GROMACS, please
+  * consider that scientific software is very special. Version
+  * control is crucial - bugs must be traceable. We will be happy to
+  * consider code for inclusion in the official distribution, but
+  * derived work must not be called official GROMACS. Details are found
+  * in the README & COPYING files - if they are missing, get the
+  * official version at http://www.gromacs.org.
+  *
+  * To help us fund GROMACS development, we humbly ask that you cite
+  * the research papers on the package. Check out http://www.gromacs.org.
+  */
+ /*
+  * Note: this file was generated by the GROMACS sparc64_hpc_ace_double kernel generator.
+  */
+ #ifdef HAVE_CONFIG_H
+ #include <config.h>
+ #endif
+ #include <math.h>
+ #include "../nb_kernel.h"
+ #include "types/simple.h"
+ #include "vec.h"
+ #include "nrnb.h"
+ #include "kernelutil_sparc64_hpc_ace_double.h"
+ /*
+  * Gromacs nonbonded kernel:   nb_kernel_ElecRFCut_VdwNone_GeomW3P1_VF_sparc64_hpc_ace_double
+  * Electrostatics interaction: ReactionField
+  * VdW interaction:            None
+  * Geometry:                   Water3-Particle
+  * Calculate force/pot:        PotentialAndForce
+  */
+ void
+ nb_kernel_ElecRFCut_VdwNone_GeomW3P1_VF_sparc64_hpc_ace_double
+                     (t_nblist * gmx_restrict                nlist,
+                      rvec * gmx_restrict                    xx,
+                      rvec * gmx_restrict                    ff,
+                      t_forcerec * gmx_restrict              fr,
+                      t_mdatoms * gmx_restrict               mdatoms,
+                      nb_kernel_data_t * gmx_restrict        kernel_data,
+                      t_nrnb * gmx_restrict                  nrnb)
+ {
+     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+      * just 0 for non-waters.
+      * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+      * jnr indices corresponding to data put in the four positions in the SIMD register.
+      */
+     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+     int              jnrA,jnrB;
+     int              j_coord_offsetA,j_coord_offsetB;
+     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+     real             rcutoff_scalar;
+     real             *shiftvec,*fshift,*x,*f;
+     _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+     int              vdwioffset0;
+     _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+     int              vdwioffset1;
+     _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+     int              vdwioffset2;
+     _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+     int              vdwjidx0A,vdwjidx0B;
+     _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+     _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+     _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+     _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+     _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+     real             *charge;
+     _fjsp_v2r8       itab_tmp;
+     _fjsp_v2r8       dummy_mask,cutoff_mask;
+     _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+     _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+     union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+     x                = xx[0];
+     f                = ff[0];
+     nri              = nlist->nri;
+     iinr             = nlist->iinr;
+     jindex           = nlist->jindex;
+     jjnr             = nlist->jjnr;
+     shiftidx         = nlist->shift;
+     gid              = nlist->gid;
+     shiftvec         = fr->shift_vec[0];
+     fshift           = fr->fshift[0];
+     facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+     charge           = mdatoms->chargeA;
+     krf              = gmx_fjsp_set1_v2r8(fr->ic->k_rf);
+     krf2             = gmx_fjsp_set1_v2r8(fr->ic->k_rf*2.0);
+     crf              = gmx_fjsp_set1_v2r8(fr->ic->c_rf);
+     /* Setup water-specific parameters */
+     inr              = nlist->iinr[0];
+     iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+0]));
+     iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+     iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+     /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+     rcutoff_scalar   = fr->rcoulomb;
+     rcutoff          = gmx_fjsp_set1_v2r8(rcutoff_scalar);
+     rcutoff2         = _fjsp_mul_v2r8(rcutoff,rcutoff);
+     /* Avoid stupid compiler warnings */
+     jnrA = jnrB = 0;
+     j_coord_offsetA = 0;
+     j_coord_offsetB = 0;
+     outeriter        = 0;
+     inneriter        = 0;
+     /* Start outer loop over neighborlists */
+     for(iidx=0; iidx<nri; iidx++)
+     {
+         /* Load shift vector for this list */
+         i_shift_offset   = DIM*shiftidx[iidx];
+         /* Load limits for loop over neighbors */
+         j_index_start    = jindex[iidx];
+         j_index_end      = jindex[iidx+1];
+         /* Get outer coordinate index */
+         inr              = iinr[iidx];
+         i_coord_offset   = DIM*inr;
+         /* Load i particle coords and add shift vector */
+         gmx_fjsp_load_shift_and_3rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                  &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+         fix0             = _fjsp_setzero_v2r8();
+         fiy0             = _fjsp_setzero_v2r8();
+         fiz0             = _fjsp_setzero_v2r8();
+         fix1             = _fjsp_setzero_v2r8();
+         fiy1             = _fjsp_setzero_v2r8();
+         fiz1             = _fjsp_setzero_v2r8();
+         fix2             = _fjsp_setzero_v2r8();
+         fiy2             = _fjsp_setzero_v2r8();
+         fiz2             = _fjsp_setzero_v2r8();
+         /* Reset potential sums */
+         velecsum         = _fjsp_setzero_v2r8();
+         /* Start inner kernel loop */
+         for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+         {
+             /* Get j neighbor index, and coordinate index */
+             jnrA             = jjnr[jidx];
+             jnrB             = jjnr[jidx+1];
+             j_coord_offsetA  = DIM*jnrA;
+             j_coord_offsetB  = DIM*jnrB;
+             /* load j atom coordinates */
+             gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                               &jx0,&jy0,&jz0);
+             /* Calculate displacement vector */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             dx10             = _fjsp_sub_v2r8(ix1,jx0);
+             dy10             = _fjsp_sub_v2r8(iy1,jy0);
+             dz10             = _fjsp_sub_v2r8(iz1,jz0);
+             dx20             = _fjsp_sub_v2r8(ix2,jx0);
+             dy20             = _fjsp_sub_v2r8(iy2,jy0);
+             dz20             = _fjsp_sub_v2r8(iz2,jz0);
+             /* Calculate squared distance and things based on it */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+             rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+             rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+             rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+             rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+             rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+             rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+             rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+             /* Load parameters for j particles */
+             jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+             fjx0             = _fjsp_setzero_v2r8();
+             fjy0             = _fjsp_setzero_v2r8();
+             fjz0             = _fjsp_setzero_v2r8();
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+             {
+             /* Compute parameters for interactions between i and j atoms */
+             qq00             = _fjsp_mul_v2r8(iq0,jq0);
+             /* REACTION-FIELD ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq00,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq00,rinv00),crf));
+             felec            = _fjsp_mul_v2r8(qq00,_fjsp_msub_v2r8(rinv00,rinvsq00,krf2));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             
+             fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq10,rcutoff2))
+             {
+             /* Compute parameters for interactions between i and j atoms */
+             qq10             = _fjsp_mul_v2r8(iq1,jq0);
+             /* REACTION-FIELD ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq10,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq10,rinv10),crf));
+             felec            = _fjsp_mul_v2r8(qq10,_fjsp_msub_v2r8(rinv10,rinvsq10,krf2));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq10,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+             
+             fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq20,rcutoff2))
+             {
+             /* Compute parameters for interactions between i and j atoms */
+             qq20             = _fjsp_mul_v2r8(iq2,jq0);
+             /* REACTION-FIELD ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq20,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq20,rinv20),crf));
+             felec            = _fjsp_mul_v2r8(qq20,_fjsp_msub_v2r8(rinv20,rinvsq20,krf2));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq20,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+             
+             fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+             }
+             gmx_fjsp_decrement_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0);
+             /* Inner loop uses 120 flops */
+         }
+         if(jidx<j_index_end)
+         {
+             jnrA             = jjnr[jidx];
+             j_coord_offsetA  = DIM*jnrA;
+             /* load j atom coordinates */
+             gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                               &jx0,&jy0,&jz0);
+             /* Calculate displacement vector */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             dx10             = _fjsp_sub_v2r8(ix1,jx0);
+             dy10             = _fjsp_sub_v2r8(iy1,jy0);
+             dz10             = _fjsp_sub_v2r8(iz1,jz0);
+             dx20             = _fjsp_sub_v2r8(ix2,jx0);
+             dy20             = _fjsp_sub_v2r8(iy2,jy0);
+             dz20             = _fjsp_sub_v2r8(iz2,jz0);
+             /* Calculate squared distance and things based on it */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+             rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+             rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+             rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+             rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+             rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+             rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+             rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+             /* Load parameters for j particles */
+             jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
+             fjx0             = _fjsp_setzero_v2r8();
+             fjy0             = _fjsp_setzero_v2r8();
+             fjz0             = _fjsp_setzero_v2r8();
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+             {
+             /* Compute parameters for interactions between i and j atoms */
+             qq00             = _fjsp_mul_v2r8(iq0,jq0);
+             /* REACTION-FIELD ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq00,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq00,rinv00),crf));
+             felec            = _fjsp_mul_v2r8(qq00,_fjsp_msub_v2r8(rinv00,rinvsq00,krf2));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             
+             fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq10,rcutoff2))
+             {
+             /* Compute parameters for interactions between i and j atoms */
+             qq10             = _fjsp_mul_v2r8(iq1,jq0);
+             /* REACTION-FIELD ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq10,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq10,rinv10),crf));
+             felec            = _fjsp_mul_v2r8(qq10,_fjsp_msub_v2r8(rinv10,rinvsq10,krf2));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq10,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+             
+             fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq20,rcutoff2))
+             {
+             /* Compute parameters for interactions between i and j atoms */
+             qq20             = _fjsp_mul_v2r8(iq2,jq0);
+             /* REACTION-FIELD ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq20,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq20,rinv20),crf));
+             felec            = _fjsp_mul_v2r8(qq20,_fjsp_msub_v2r8(rinv20,rinvsq20,krf2));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq20,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+             
+             fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+             }
+             gmx_fjsp_decrement_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0);
+             /* Inner loop uses 120 flops */
+         }
+         /* End of innermost loop */
+         gmx_fjsp_update_iforce_3atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                               f+i_coord_offset,fshift+i_shift_offset);
+         ggid                        = gid[iidx];
+         /* Update potential energies */
+         gmx_fjsp_update_1pot_v2r8(velecsum,kernel_data->energygrp_elec+ggid);
+         /* Increment number of inner iterations */
+         inneriter                  += j_index_end - j_index_start;
+         /* Outer loop uses 19 flops */
+     }
+     /* Increment number of outer iterations */
+     outeriter        += nri;
+     /* Update outer/inner flops */
+     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3_VF,outeriter*19 + inneriter*120);
+ }
+ /*
+  * Gromacs nonbonded kernel:   nb_kernel_ElecRFCut_VdwNone_GeomW3P1_F_sparc64_hpc_ace_double
+  * Electrostatics interaction: ReactionField, masked with an explicit cutoff taken from fr->rcoulomb
+  * VdW interaction:            None
+  * Geometry:                   Water3-Particle (three i atoms per list entry against single j atoms)
+  * Calculate force/pot:        Force only; forces go through f and fshift, no energy terms are accumulated
+  */
+ void
+ nb_kernel_ElecRFCut_VdwNone_GeomW3P1_F_sparc64_hpc_ace_double
+                     (t_nblist * gmx_restrict                nlist,
+                      rvec * gmx_restrict                    xx,
+                      rvec * gmx_restrict                    ff,
+                      t_forcerec * gmx_restrict              fr,
+                      t_mdatoms * gmx_restrict               mdatoms,
+                      nb_kernel_data_t * gmx_restrict        kernel_data,
+                      t_nrnb * gmx_restrict                  nrnb)
+ {
+     /* Suffixes 0,1,2 refer to the three i (water) particles handled in the outer loop; the j side
+      * only uses suffix 0 because each j entry is a single particle.
+      * Suffixes A,B refer to the 2-way j loop unrolling done with double precision SIMD, i.e. the two different
+      * jnr indices whose data is put in the two positions of a _fjsp_v2r8 SIMD register.
+      * NOTE(review): the generator declares more locals than this kernel references (e.g. tx, velecsum, one, two). */
+     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+     int              jnrA,jnrB;
+     int              j_coord_offsetA,j_coord_offsetB;
+     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+     real             rcutoff_scalar;
+     real             *shiftvec,*fshift,*x,*f;
+     _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+     int              vdwioffset0;
+     _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+     int              vdwioffset1;
+     _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+     int              vdwioffset2;
+     _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+     int              vdwjidx0A,vdwjidx0B;
+     _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+     _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+     _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+     _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+     _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+     real             *charge;
+     _fjsp_v2r8       itab_tmp;
+     _fjsp_v2r8       dummy_mask,cutoff_mask;
+     _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+     _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+     union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+     x                = xx[0];
+     f                = ff[0];
+     nri              = nlist->nri;
+     iinr             = nlist->iinr;
+     jindex           = nlist->jindex;
+     jjnr             = nlist->jjnr;
+     shiftidx         = nlist->shift;
+     gid              = nlist->gid;
+     shiftvec         = fr->shift_vec[0];
+     fshift           = fr->fshift[0];
+     facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+     charge           = mdatoms->chargeA;
+     krf              = gmx_fjsp_set1_v2r8(fr->ic->k_rf);
+     krf2             = gmx_fjsp_set1_v2r8(fr->ic->k_rf*2.0);
+     crf              = gmx_fjsp_set1_v2r8(fr->ic->c_rf);
+     /* Setup water-specific parameters: i charges come from the first list entry only and are pre-scaled by facel. NOTE(review): iinr[0] is read even if nri is 0 - confirm callers never pass an empty list */
+     inr              = nlist->iinr[0];
+     iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+0]));
+     iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+     iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+     /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+     rcutoff_scalar   = fr->rcoulomb;
+     rcutoff          = gmx_fjsp_set1_v2r8(rcutoff_scalar);
+     rcutoff2         = _fjsp_mul_v2r8(rcutoff,rcutoff);
+     /* Pre-set the j indices and offsets so compilers do not warn about possibly uninitialized variables */
+     jnrA = jnrB = 0;
+     j_coord_offsetA = 0;
+     j_coord_offsetB = 0;
+     outeriter        = 0;
+     inneriter        = 0;
+     /* Start outer loop over neighborlists: one iteration per i water */
+     for(iidx=0; iidx<nri; iidx++)
+     {
+         /* Offset of this list's shift vector; the same offset is used into shiftvec and fshift */
+         i_shift_offset   = DIM*shiftidx[iidx];
+         /* Load limits for loop over neighbors */
+         j_index_start    = jindex[iidx];
+         j_index_end      = jindex[iidx+1];
+         /* Get outer coordinate index */
+         inr              = iinr[iidx];
+         i_coord_offset   = DIM*inr;
+         /* Load i particle coords and add shift vector */
+         gmx_fjsp_load_shift_and_3rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                  &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+         fix0             = _fjsp_setzero_v2r8();
+         fiy0             = _fjsp_setzero_v2r8();
+         fiz0             = _fjsp_setzero_v2r8();
+         fix1             = _fjsp_setzero_v2r8();
+         fiy1             = _fjsp_setzero_v2r8();
+         fiz1             = _fjsp_setzero_v2r8();
+         fix2             = _fjsp_setzero_v2r8();
+         fiy2             = _fjsp_setzero_v2r8();
+         fiz2             = _fjsp_setzero_v2r8();
+         /* Start inner kernel loop: two j atoms (A,B) per iteration; an odd leftover j is handled after the loop */
+         for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+         {
+             /* Get j neighbor index, and coordinate index */
+             jnrA             = jjnr[jidx];
+             jnrB             = jjnr[jidx+1];
+             j_coord_offsetA  = DIM*jnrA;
+             j_coord_offsetB  = DIM*jnrB;
+             /* load j atom coordinates */
+             gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                               &jx0,&jy0,&jz0);
+             /* Calculate displacement vectors (i minus j) for each of the three i atoms */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             dx10             = _fjsp_sub_v2r8(ix1,jx0);
+             dy10             = _fjsp_sub_v2r8(iy1,jy0);
+             dz10             = _fjsp_sub_v2r8(iz1,jz0);
+             dx20             = _fjsp_sub_v2r8(ix2,jx0);
+             dy20             = _fjsp_sub_v2r8(iy2,jy0);
+             dz20             = _fjsp_sub_v2r8(iz2,jz0);
+             /* Calculate squared distance and things based on it */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+             rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+             rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+             rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+             rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+             rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+             rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+             rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+             /* Load parameters for j particles; the j force accumulators restart from zero for every j pair */
+             jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+             fjx0             = _fjsp_setzero_v2r8();
+             fjy0             = _fjsp_setzero_v2r8();
+             fjz0             = _fjsp_setzero_v2r8();
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+             {
+             /* Compute parameters for interactions between i and j atoms */
+             qq00             = _fjsp_mul_v2r8(iq0,jq0);
+             /* REACTION-FIELD ELECTROSTATICS, force only; presumably felec = qq*(rinv*rinvsq - krf2) - confirm msub semantics */
+             felec            = _fjsp_mul_v2r8(qq00,_fjsp_msub_v2r8(rinv00,rinvsq00,krf2));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force: the same dx*fscal term is accumulated for the i atom and for j (j is applied by the decrement call below) */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             
+             fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq10,rcutoff2))
+             {
+             /* Compute parameters for interactions between i and j atoms */
+             qq10             = _fjsp_mul_v2r8(iq1,jq0);
+             /* REACTION-FIELD ELECTROSTATICS, force only (i atom 1) */
+             felec            = _fjsp_mul_v2r8(qq10,_fjsp_msub_v2r8(rinv10,rinvsq10,krf2));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq10,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+             
+             fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq20,rcutoff2))
+             {
+             /* Compute parameters for interactions between i and j atoms */
+             qq20             = _fjsp_mul_v2r8(iq2,jq0);
+             /* REACTION-FIELD ELECTROSTATICS, force only (i atom 2) */
+             felec            = _fjsp_mul_v2r8(qq20,_fjsp_msub_v2r8(rinv20,rinvsq20,krf2));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq20,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+             
+             fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+             }
+             gmx_fjsp_decrement_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0);
+             /* Inner loop uses 102 flops */
+         }
+         if(jidx<j_index_end)
+         {
+             jnrA             = jjnr[jidx];
+             j_coord_offsetA  = DIM*jnrA;
+             /* Leftover single j atom (odd list length): load its coordinates with the 1-pointer variant */
+             gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                               &jx0,&jy0,&jz0);
+             /* Calculate displacement vectors (i minus j) for each of the three i atoms */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             dx10             = _fjsp_sub_v2r8(ix1,jx0);
+             dy10             = _fjsp_sub_v2r8(iy1,jy0);
+             dz10             = _fjsp_sub_v2r8(iz1,jz0);
+             dx20             = _fjsp_sub_v2r8(ix2,jx0);
+             dy20             = _fjsp_sub_v2r8(iy2,jy0);
+             dz20             = _fjsp_sub_v2r8(iz2,jz0);
+             /* Calculate squared distance and things based on it */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+             rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+             rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+             rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+             rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+             rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+             rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+             rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+             /* Load the single j charge via loadl on a zero vector (presumably leaves the unused lane at 0 - confirm) */
+             jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
+             fjx0             = _fjsp_setzero_v2r8();
+             fjy0             = _fjsp_setzero_v2r8();
+             fjz0             = _fjsp_setzero_v2r8();
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+             {
+             /* Compute parameters for interactions between i and j atoms */
+             qq00             = _fjsp_mul_v2r8(iq0,jq0);
+             /* REACTION-FIELD ELECTROSTATICS, force only (i atom 0) */
+             felec            = _fjsp_mul_v2r8(qq00,_fjsp_msub_v2r8(rinv00,rinvsq00,krf2));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force (fscal was just combined with a zero vector, presumably to clear the unused lane - confirm) */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             
+             fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq10,rcutoff2))
+             {
+             /* Compute parameters for interactions between i and j atoms */
+             qq10             = _fjsp_mul_v2r8(iq1,jq0);
+             /* REACTION-FIELD ELECTROSTATICS, force only (i atom 1) */
+             felec            = _fjsp_mul_v2r8(qq10,_fjsp_msub_v2r8(rinv10,rinvsq10,krf2));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq10,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+             
+             fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq20,rcutoff2))
+             {
+             /* Compute parameters for interactions between i and j atoms */
+             qq20             = _fjsp_mul_v2r8(iq2,jq0);
+             /* REACTION-FIELD ELECTROSTATICS, force only (i atom 2) */
+             felec            = _fjsp_mul_v2r8(qq20,_fjsp_msub_v2r8(rinv20,rinvsq20,krf2));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq20,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+             
+             fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+             }
+             gmx_fjsp_decrement_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0);
+             /* Inner loop uses 102 flops */
+         }
+         /* End of innermost loop: pass the accumulated i forces to the helper together with the f and fshift slots of this i water */
+         gmx_fjsp_update_iforce_3atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                               f+i_coord_offset,fshift+i_shift_offset);
+         /* Increment number of inner iterations (counted as j entries in this list, not as SIMD iterations) */
+         inneriter                  += j_index_end - j_index_start;
+         /* Outer loop uses 18 flops */
+     }
+     /* Increment number of outer iterations */
+     outeriter        += nri;
+     /* Update outer/inner flops: 18 per outer and 102 per inner iteration, booked under eNR_NBKERNEL_ELEC_W3_F */
+     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3_F,outeriter*18 + inneriter*102);
+ }
index 0000000000000000000000000000000000000000,281bccc025f1a94e867ba4d821ccb2e371305a4c..281bccc025f1a94e867ba4d821ccb2e371305a4c
mode 000000,100644..100644
--- /dev/null
@@@ -1,0 -1,1820 +1,1820 @@@
+ /*
+  * This file is part of the GROMACS molecular simulation package.
+  *
+  * Copyright (c) 2012, by the GROMACS development team, led by
+  * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+  * others, as listed in the AUTHORS file in the top-level source
+  * directory and at http://www.gromacs.org.
+  *
+  * GROMACS is free software; you can redistribute it and/or
+  * modify it under the terms of the GNU Lesser General Public License
+  * as published by the Free Software Foundation; either version 2.1
+  * of the License, or (at your option) any later version.
+  *
+  * GROMACS is distributed in the hope that it will be useful,
+  * but WITHOUT ANY WARRANTY; without even the implied warranty of
+  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  * Lesser General Public License for more details.
+  *
+  * You should have received a copy of the GNU Lesser General Public
+  * License along with GROMACS; if not, see
+  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+  *
+  * If you want to redistribute modifications to GROMACS, please
+  * consider that scientific software is very special. Version
+  * control is crucial - bugs must be traceable. We will be happy to
+  * consider code for inclusion in the official distribution, but
+  * derived work must not be called official GROMACS. Details are found
+  * in the README & COPYING files - if they are missing, get the
+  * official version at http://www.gromacs.org.
+  *
+  * To help us fund GROMACS development, we humbly ask that you cite
+  * the research papers on the package. Check out http://www.gromacs.org.
+  */
+ /*
+  * Note: this file was generated by the GROMACS sparc64_hpc_ace_double kernel generator.
+  */
+ #ifdef HAVE_CONFIG_H
+ #include <config.h>
+ #endif
+ #include <math.h>
+ #include "../nb_kernel.h"
+ #include "types/simple.h"
+ #include "vec.h"
+ #include "nrnb.h"
+ #include "kernelutil_sparc64_hpc_ace_double.h"
+ /*
+  * Gromacs nonbonded kernel:   nb_kernel_ElecRFCut_VdwNone_GeomW3W3_VF_sparc64_hpc_ace_double
+  * Electrostatics interaction: ReactionField
+  * VdW interaction:            None
+  * Geometry:                   Water3-Water3
+  * Calculate force/pot:        PotentialAndForce
+  */
+ void
+ nb_kernel_ElecRFCut_VdwNone_GeomW3W3_VF_sparc64_hpc_ace_double
+                     (t_nblist * gmx_restrict                nlist,
+                      rvec * gmx_restrict                    xx,
+                      rvec * gmx_restrict                    ff,
+                      t_forcerec * gmx_restrict              fr,
+                      t_mdatoms * gmx_restrict               mdatoms,
+                      nb_kernel_data_t * gmx_restrict        kernel_data,
+                      t_nrnb * gmx_restrict                  nrnb)
+ {
+     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+      * just 0 for non-waters.
+      * Suffixes A,B refer to j loop unrolling done with double precision SIMD, i.e. to the two different
+      * jnr indices whose data is put in the two positions of the SIMD register.
+      */
+     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+     int              jnrA,jnrB;
+     int              j_coord_offsetA,j_coord_offsetB;
+     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+     real             rcutoff_scalar;
+     real             *shiftvec,*fshift,*x,*f;
+     _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+     int              vdwioffset0;
+     _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+     int              vdwioffset1;
+     _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+     int              vdwioffset2;
+     _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+     int              vdwjidx0A,vdwjidx0B;
+     _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+     int              vdwjidx1A,vdwjidx1B;
+     _fjsp_v2r8       jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+     int              vdwjidx2A,vdwjidx2B;
+     _fjsp_v2r8       jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+     _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+     _fjsp_v2r8       dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
+     _fjsp_v2r8       dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
+     _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+     _fjsp_v2r8       dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+     _fjsp_v2r8       dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+     _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+     _fjsp_v2r8       dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+     _fjsp_v2r8       dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+     _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+     real             *charge;
+     _fjsp_v2r8       itab_tmp;
+     _fjsp_v2r8       dummy_mask,cutoff_mask;
+     _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+     _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+     union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+     x                = xx[0];
+     f                = ff[0];
+     nri              = nlist->nri;
+     iinr             = nlist->iinr;
+     jindex           = nlist->jindex;
+     jjnr             = nlist->jjnr;
+     shiftidx         = nlist->shift;
+     gid              = nlist->gid;
+     shiftvec         = fr->shift_vec[0];
+     fshift           = fr->fshift[0];
+     facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+     charge           = mdatoms->chargeA;
+     krf              = gmx_fjsp_set1_v2r8(fr->ic->k_rf);
+     krf2             = gmx_fjsp_set1_v2r8(fr->ic->k_rf*2.0);
+     crf              = gmx_fjsp_set1_v2r8(fr->ic->c_rf);
+     /* Setup water-specific parameters */
+     inr              = nlist->iinr[0];
+     iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+0]));
+     iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+     iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+     jq0              = gmx_fjsp_set1_v2r8(charge[inr+0]);
+     jq1              = gmx_fjsp_set1_v2r8(charge[inr+1]);
+     jq2              = gmx_fjsp_set1_v2r8(charge[inr+2]);
+     qq00             = _fjsp_mul_v2r8(iq0,jq0);
+     qq01             = _fjsp_mul_v2r8(iq0,jq1);
+     qq02             = _fjsp_mul_v2r8(iq0,jq2);
+     qq10             = _fjsp_mul_v2r8(iq1,jq0);
+     qq11             = _fjsp_mul_v2r8(iq1,jq1);
+     qq12             = _fjsp_mul_v2r8(iq1,jq2);
+     qq20             = _fjsp_mul_v2r8(iq2,jq0);
+     qq21             = _fjsp_mul_v2r8(iq2,jq1);
+     qq22             = _fjsp_mul_v2r8(iq2,jq2);
+     /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+     rcutoff_scalar   = fr->rcoulomb;
+     rcutoff          = gmx_fjsp_set1_v2r8(rcutoff_scalar);
+     rcutoff2         = _fjsp_mul_v2r8(rcutoff,rcutoff);
+     /* Avoid stupid compiler warnings */
+     jnrA = jnrB = 0;
+     j_coord_offsetA = 0;
+     j_coord_offsetB = 0;
+     outeriter        = 0;
+     inneriter        = 0;
+     /* Start outer loop over neighborlists */
+     for(iidx=0; iidx<nri; iidx++)
+     {
+         /* Load shift vector for this list */
+         i_shift_offset   = DIM*shiftidx[iidx];
+         /* Load limits for loop over neighbors */
+         j_index_start    = jindex[iidx];
+         j_index_end      = jindex[iidx+1];
+         /* Get outer coordinate index */
+         inr              = iinr[iidx];
+         i_coord_offset   = DIM*inr;
+         /* Load i particle coords and add shift vector */
+         gmx_fjsp_load_shift_and_3rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                  &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+         fix0             = _fjsp_setzero_v2r8();
+         fiy0             = _fjsp_setzero_v2r8();
+         fiz0             = _fjsp_setzero_v2r8();
+         fix1             = _fjsp_setzero_v2r8();
+         fiy1             = _fjsp_setzero_v2r8();
+         fiz1             = _fjsp_setzero_v2r8();
+         fix2             = _fjsp_setzero_v2r8();
+         fiy2             = _fjsp_setzero_v2r8();
+         fiz2             = _fjsp_setzero_v2r8();
+         /* Reset potential sums */
+         velecsum         = _fjsp_setzero_v2r8();
+         /* Start inner kernel loop */
+         for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+         {
+             /* Get j neighbor index, and coordinate index */
+             jnrA             = jjnr[jidx];
+             jnrB             = jjnr[jidx+1];
+             j_coord_offsetA  = DIM*jnrA;
+             j_coord_offsetB  = DIM*jnrB;
+             /* load j atom coordinates */
+             gmx_fjsp_load_3rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+             /* Calculate displacement vector */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             dx01             = _fjsp_sub_v2r8(ix0,jx1);
+             dy01             = _fjsp_sub_v2r8(iy0,jy1);
+             dz01             = _fjsp_sub_v2r8(iz0,jz1);
+             dx02             = _fjsp_sub_v2r8(ix0,jx2);
+             dy02             = _fjsp_sub_v2r8(iy0,jy2);
+             dz02             = _fjsp_sub_v2r8(iz0,jz2);
+             dx10             = _fjsp_sub_v2r8(ix1,jx0);
+             dy10             = _fjsp_sub_v2r8(iy1,jy0);
+             dz10             = _fjsp_sub_v2r8(iz1,jz0);
+             dx11             = _fjsp_sub_v2r8(ix1,jx1);
+             dy11             = _fjsp_sub_v2r8(iy1,jy1);
+             dz11             = _fjsp_sub_v2r8(iz1,jz1);
+             dx12             = _fjsp_sub_v2r8(ix1,jx2);
+             dy12             = _fjsp_sub_v2r8(iy1,jy2);
+             dz12             = _fjsp_sub_v2r8(iz1,jz2);
+             dx20             = _fjsp_sub_v2r8(ix2,jx0);
+             dy20             = _fjsp_sub_v2r8(iy2,jy0);
+             dz20             = _fjsp_sub_v2r8(iz2,jz0);
+             dx21             = _fjsp_sub_v2r8(ix2,jx1);
+             dy21             = _fjsp_sub_v2r8(iy2,jy1);
+             dz21             = _fjsp_sub_v2r8(iz2,jz1);
+             dx22             = _fjsp_sub_v2r8(ix2,jx2);
+             dy22             = _fjsp_sub_v2r8(iy2,jy2);
+             dz22             = _fjsp_sub_v2r8(iz2,jz2);
+             /* Calculate squared distance and things based on it */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rsq01            = gmx_fjsp_calc_rsq_v2r8(dx01,dy01,dz01);
+             rsq02            = gmx_fjsp_calc_rsq_v2r8(dx02,dy02,dz02);
+             rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+             rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+             rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+             rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+             rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+             rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+             rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+             rinv01           = gmx_fjsp_invsqrt_v2r8(rsq01);
+             rinv02           = gmx_fjsp_invsqrt_v2r8(rsq02);
+             rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+             rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+             rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+             rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+             rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+             rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+             rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+             rinvsq01         = _fjsp_mul_v2r8(rinv01,rinv01);
+             rinvsq02         = _fjsp_mul_v2r8(rinv02,rinv02);
+             rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+             rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+             rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+             rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+             rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+             rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+             fjx0             = _fjsp_setzero_v2r8();
+             fjy0             = _fjsp_setzero_v2r8();
+             fjz0             = _fjsp_setzero_v2r8();
+             fjx1             = _fjsp_setzero_v2r8();
+             fjy1             = _fjsp_setzero_v2r8();
+             fjz1             = _fjsp_setzero_v2r8();
+             fjx2             = _fjsp_setzero_v2r8();
+             fjy2             = _fjsp_setzero_v2r8();
+             fjz2             = _fjsp_setzero_v2r8();
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+             {
+             /* REACTION-FIELD ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq00,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq00,rinv00),crf));
+             felec            = _fjsp_mul_v2r8(qq00,_fjsp_msub_v2r8(rinv00,rinvsq00,krf2));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             
+             fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq01,rcutoff2))
+             {
+             /* REACTION-FIELD ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq01,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq01,rinv01),crf));
+             felec            = _fjsp_mul_v2r8(qq01,_fjsp_msub_v2r8(rinv01,rinvsq01,krf2));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq01,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx01,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy01,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz01,fscal,fiz0);
+             
+             fjx1             = _fjsp_madd_v2r8(dx01,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy01,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz01,fscal,fjz1);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq02,rcutoff2))
+             {
+             /* REACTION-FIELD ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq02,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq02,rinv02),crf));
+             felec            = _fjsp_mul_v2r8(qq02,_fjsp_msub_v2r8(rinv02,rinvsq02,krf2));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq02,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx02,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy02,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz02,fscal,fiz0);
+             
+             fjx2             = _fjsp_madd_v2r8(dx02,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy02,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz02,fscal,fjz2);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq10,rcutoff2))
+             {
+             /* REACTION-FIELD ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq10,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq10,rinv10),crf));
+             felec            = _fjsp_mul_v2r8(qq10,_fjsp_msub_v2r8(rinv10,rinvsq10,krf2));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq10,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+             
+             fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq11,rcutoff2))
+             {
+             /* REACTION-FIELD ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq11,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq11,rinv11),crf));
+             felec            = _fjsp_mul_v2r8(qq11,_fjsp_msub_v2r8(rinv11,rinvsq11,krf2));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq11,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+             
+             fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq12,rcutoff2))
+             {
+             /* REACTION-FIELD ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq12,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq12,rinv12),crf));
+             felec            = _fjsp_mul_v2r8(qq12,_fjsp_msub_v2r8(rinv12,rinvsq12,krf2));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq12,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+             
+             fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq20,rcutoff2))
+             {
+             /* REACTION-FIELD ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq20,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq20,rinv20),crf));
+             felec            = _fjsp_mul_v2r8(qq20,_fjsp_msub_v2r8(rinv20,rinvsq20,krf2));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq20,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+             
+             fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq21,rcutoff2))
+             {
+             /* REACTION-FIELD ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq21,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq21,rinv21),crf));
+             felec            = _fjsp_mul_v2r8(qq21,_fjsp_msub_v2r8(rinv21,rinvsq21,krf2));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq21,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+             
+             fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq22,rcutoff2))
+             {
+             /* REACTION-FIELD ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq22,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq22,rinv22),crf));
+             felec            = _fjsp_mul_v2r8(qq22,_fjsp_msub_v2r8(rinv22,rinvsq22,krf2));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq22,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+             
+             fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+             }
+             gmx_fjsp_decrement_3rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+             /* Inner loop uses 351 flops */
+         }
+         if(jidx<j_index_end)
+         {
+             jnrA             = jjnr[jidx];
+             j_coord_offsetA  = DIM*jnrA;
+             /* load j atom coordinates */
+             gmx_fjsp_load_3rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+             /* Calculate displacement vector */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             dx01             = _fjsp_sub_v2r8(ix0,jx1);
+             dy01             = _fjsp_sub_v2r8(iy0,jy1);
+             dz01             = _fjsp_sub_v2r8(iz0,jz1);
+             dx02             = _fjsp_sub_v2r8(ix0,jx2);
+             dy02             = _fjsp_sub_v2r8(iy0,jy2);
+             dz02             = _fjsp_sub_v2r8(iz0,jz2);
+             dx10             = _fjsp_sub_v2r8(ix1,jx0);
+             dy10             = _fjsp_sub_v2r8(iy1,jy0);
+             dz10             = _fjsp_sub_v2r8(iz1,jz0);
+             dx11             = _fjsp_sub_v2r8(ix1,jx1);
+             dy11             = _fjsp_sub_v2r8(iy1,jy1);
+             dz11             = _fjsp_sub_v2r8(iz1,jz1);
+             dx12             = _fjsp_sub_v2r8(ix1,jx2);
+             dy12             = _fjsp_sub_v2r8(iy1,jy2);
+             dz12             = _fjsp_sub_v2r8(iz1,jz2);
+             dx20             = _fjsp_sub_v2r8(ix2,jx0);
+             dy20             = _fjsp_sub_v2r8(iy2,jy0);
+             dz20             = _fjsp_sub_v2r8(iz2,jz0);
+             dx21             = _fjsp_sub_v2r8(ix2,jx1);
+             dy21             = _fjsp_sub_v2r8(iy2,jy1);
+             dz21             = _fjsp_sub_v2r8(iz2,jz1);
+             dx22             = _fjsp_sub_v2r8(ix2,jx2);
+             dy22             = _fjsp_sub_v2r8(iy2,jy2);
+             dz22             = _fjsp_sub_v2r8(iz2,jz2);
+             /* Calculate squared distance and things based on it */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rsq01            = gmx_fjsp_calc_rsq_v2r8(dx01,dy01,dz01);
+             rsq02            = gmx_fjsp_calc_rsq_v2r8(dx02,dy02,dz02);
+             rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+             rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+             rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+             rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+             rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+             rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+             rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+             rinv01           = gmx_fjsp_invsqrt_v2r8(rsq01);
+             rinv02           = gmx_fjsp_invsqrt_v2r8(rsq02);
+             rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+             rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+             rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+             rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+             rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+             rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+             rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+             rinvsq01         = _fjsp_mul_v2r8(rinv01,rinv01);
+             rinvsq02         = _fjsp_mul_v2r8(rinv02,rinv02);
+             rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+             rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+             rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+             rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+             rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+             rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+             fjx0             = _fjsp_setzero_v2r8();
+             fjy0             = _fjsp_setzero_v2r8();
+             fjz0             = _fjsp_setzero_v2r8();
+             fjx1             = _fjsp_setzero_v2r8();
+             fjy1             = _fjsp_setzero_v2r8();
+             fjz1             = _fjsp_setzero_v2r8();
+             fjx2             = _fjsp_setzero_v2r8();
+             fjy2             = _fjsp_setzero_v2r8();
+             fjz2             = _fjsp_setzero_v2r8();
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+             {
+             /* REACTION-FIELD ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq00,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq00,rinv00),crf));
+             felec            = _fjsp_mul_v2r8(qq00,_fjsp_msub_v2r8(rinv00,rinvsq00,krf2));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             
+             fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq01,rcutoff2))
+             {
+             /* REACTION-FIELD ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq01,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq01,rinv01),crf));
+             felec            = _fjsp_mul_v2r8(qq01,_fjsp_msub_v2r8(rinv01,rinvsq01,krf2));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq01,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx01,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy01,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz01,fscal,fiz0);
+             
+             fjx1             = _fjsp_madd_v2r8(dx01,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy01,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz01,fscal,fjz1);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq02,rcutoff2))
+             {
+             /* REACTION-FIELD ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq02,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq02,rinv02),crf));
+             felec            = _fjsp_mul_v2r8(qq02,_fjsp_msub_v2r8(rinv02,rinvsq02,krf2));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq02,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx02,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy02,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz02,fscal,fiz0);
+             
+             fjx2             = _fjsp_madd_v2r8(dx02,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy02,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz02,fscal,fjz2);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq10,rcutoff2))
+             {
+             /* REACTION-FIELD ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq10,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq10,rinv10),crf));
+             felec            = _fjsp_mul_v2r8(qq10,_fjsp_msub_v2r8(rinv10,rinvsq10,krf2));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq10,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+             
+             fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq11,rcutoff2))
+             {
+             /* REACTION-FIELD ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq11,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq11,rinv11),crf));
+             felec            = _fjsp_mul_v2r8(qq11,_fjsp_msub_v2r8(rinv11,rinvsq11,krf2));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq11,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+             
+             fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq12,rcutoff2))
+             {
+             /* REACTION-FIELD ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq12,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq12,rinv12),crf));
+             felec            = _fjsp_mul_v2r8(qq12,_fjsp_msub_v2r8(rinv12,rinvsq12,krf2));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq12,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+             
+             fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq20,rcutoff2))
+             {
+             /* REACTION-FIELD ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq20,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq20,rinv20),crf));
+             felec            = _fjsp_mul_v2r8(qq20,_fjsp_msub_v2r8(rinv20,rinvsq20,krf2));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq20,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+             
+             fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq21,rcutoff2))
+             {
+             /* REACTION-FIELD ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq21,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq21,rinv21),crf));
+             felec            = _fjsp_mul_v2r8(qq21,_fjsp_msub_v2r8(rinv21,rinvsq21,krf2));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq21,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+             
+             fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq22,rcutoff2))
+             {
+             /* REACTION-FIELD ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq22,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq22,rinv22),crf));
+             felec            = _fjsp_mul_v2r8(qq22,_fjsp_msub_v2r8(rinv22,rinvsq22,krf2));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq22,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+             
+             fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+             }
+             gmx_fjsp_decrement_3rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+             /* Inner loop uses 351 flops */
+         }
+         /* End of innermost loop */
+         gmx_fjsp_update_iforce_3atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                               f+i_coord_offset,fshift+i_shift_offset);
+         ggid                        = gid[iidx];
+         /* Update potential energies */
+         gmx_fjsp_update_1pot_v2r8(velecsum,kernel_data->energygrp_elec+ggid);
+         /* Increment number of inner iterations */
+         inneriter                  += j_index_end - j_index_start;
+         /* Outer loop uses 19 flops */
+     }
+     /* Increment number of outer iterations */
+     outeriter        += nri;
+     /* Update outer/inner flops */
+     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3W3_VF,outeriter*19 + inneriter*351);
+ }
+ /*
+  * Gromacs nonbonded kernel:   nb_kernel_ElecRFCut_VdwNone_GeomW3W3_F_sparc64_hpc_ace_double
+  * Electrostatics interaction: ReactionField
+  * VdW interaction:            None
+  * Geometry:                   Water3-Water3
+  * Calculate force/pot:        Force
+  */
+ void
+ nb_kernel_ElecRFCut_VdwNone_GeomW3W3_F_sparc64_hpc_ace_double
+                     (t_nblist * gmx_restrict                nlist,
+                      rvec * gmx_restrict                    xx,
+                      rvec * gmx_restrict                    ff,
+                      t_forcerec * gmx_restrict              fr,
+                      t_mdatoms * gmx_restrict               mdatoms,
+                      nb_kernel_data_t * gmx_restrict        kernel_data,
+                      t_nrnb * gmx_restrict                  nrnb)
+ {
+     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+      * just 0 for non-waters.
+      * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+      * jnr indices corresponding to data put in the two positions in the SIMD register.
+      */
+     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+     int              jnrA,jnrB;
+     int              j_coord_offsetA,j_coord_offsetB;
+     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+     real             rcutoff_scalar;
+     real             *shiftvec,*fshift,*x,*f;
+     _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+     int              vdwioffset0;
+     _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+     int              vdwioffset1;
+     _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+     int              vdwioffset2;
+     _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+     int              vdwjidx0A,vdwjidx0B;
+     _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+     int              vdwjidx1A,vdwjidx1B;
+     _fjsp_v2r8       jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+     int              vdwjidx2A,vdwjidx2B;
+     _fjsp_v2r8       jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+     _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+     _fjsp_v2r8       dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
+     _fjsp_v2r8       dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
+     _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+     _fjsp_v2r8       dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+     _fjsp_v2r8       dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+     _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+     _fjsp_v2r8       dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+     _fjsp_v2r8       dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+     _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+     real             *charge;
+     _fjsp_v2r8       itab_tmp;
+     _fjsp_v2r8       dummy_mask,cutoff_mask;
+     _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+     _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+     union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+     x                = xx[0];
+     f                = ff[0];
+     nri              = nlist->nri;
+     iinr             = nlist->iinr;
+     jindex           = nlist->jindex;
+     jjnr             = nlist->jjnr;
+     shiftidx         = nlist->shift;
+     gid              = nlist->gid;
+     shiftvec         = fr->shift_vec[0];
+     fshift           = fr->fshift[0];
+     facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+     charge           = mdatoms->chargeA;
+     krf              = gmx_fjsp_set1_v2r8(fr->ic->k_rf);
+     krf2             = gmx_fjsp_set1_v2r8(fr->ic->k_rf*2.0);
+     crf              = gmx_fjsp_set1_v2r8(fr->ic->c_rf);
+     /* Setup water-specific parameters */
+     inr              = nlist->iinr[0];
+     iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+0]));
+     iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+     iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+     jq0              = gmx_fjsp_set1_v2r8(charge[inr+0]);
+     jq1              = gmx_fjsp_set1_v2r8(charge[inr+1]);
+     jq2              = gmx_fjsp_set1_v2r8(charge[inr+2]);
+     qq00             = _fjsp_mul_v2r8(iq0,jq0);
+     qq01             = _fjsp_mul_v2r8(iq0,jq1);
+     qq02             = _fjsp_mul_v2r8(iq0,jq2);
+     qq10             = _fjsp_mul_v2r8(iq1,jq0);
+     qq11             = _fjsp_mul_v2r8(iq1,jq1);
+     qq12             = _fjsp_mul_v2r8(iq1,jq2);
+     qq20             = _fjsp_mul_v2r8(iq2,jq0);
+     qq21             = _fjsp_mul_v2r8(iq2,jq1);
+     qq22             = _fjsp_mul_v2r8(iq2,jq2);
+     /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+     rcutoff_scalar   = fr->rcoulomb;
+     rcutoff          = gmx_fjsp_set1_v2r8(rcutoff_scalar);
+     rcutoff2         = _fjsp_mul_v2r8(rcutoff,rcutoff);
+     /* Avoid stupid compiler warnings */
+     jnrA = jnrB = 0;
+     j_coord_offsetA = 0;
+     j_coord_offsetB = 0;
+     outeriter        = 0;
+     inneriter        = 0;
+     /* Start outer loop over neighborlists */
+     for(iidx=0; iidx<nri; iidx++)
+     {
+         /* Load shift vector for this list */
+         i_shift_offset   = DIM*shiftidx[iidx];
+         /* Load limits for loop over neighbors */
+         j_index_start    = jindex[iidx];
+         j_index_end      = jindex[iidx+1];
+         /* Get outer coordinate index */
+         inr              = iinr[iidx];
+         i_coord_offset   = DIM*inr;
+         /* Load i particle coords and add shift vector */
+         gmx_fjsp_load_shift_and_3rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                  &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+         fix0             = _fjsp_setzero_v2r8();
+         fiy0             = _fjsp_setzero_v2r8();
+         fiz0             = _fjsp_setzero_v2r8();
+         fix1             = _fjsp_setzero_v2r8();
+         fiy1             = _fjsp_setzero_v2r8();
+         fiz1             = _fjsp_setzero_v2r8();
+         fix2             = _fjsp_setzero_v2r8();
+         fiy2             = _fjsp_setzero_v2r8();
+         fiz2             = _fjsp_setzero_v2r8();
+         /* Start inner kernel loop */
+         for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+         {
+             /* Get j neighbor index, and coordinate index */
+             jnrA             = jjnr[jidx];
+             jnrB             = jjnr[jidx+1];
+             j_coord_offsetA  = DIM*jnrA;
+             j_coord_offsetB  = DIM*jnrB;
+             /* load j atom coordinates */
+             gmx_fjsp_load_3rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+             /* Calculate displacement vector */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             dx01             = _fjsp_sub_v2r8(ix0,jx1);
+             dy01             = _fjsp_sub_v2r8(iy0,jy1);
+             dz01             = _fjsp_sub_v2r8(iz0,jz1);
+             dx02             = _fjsp_sub_v2r8(ix0,jx2);
+             dy02             = _fjsp_sub_v2r8(iy0,jy2);
+             dz02             = _fjsp_sub_v2r8(iz0,jz2);
+             dx10             = _fjsp_sub_v2r8(ix1,jx0);
+             dy10             = _fjsp_sub_v2r8(iy1,jy0);
+             dz10             = _fjsp_sub_v2r8(iz1,jz0);
+             dx11             = _fjsp_sub_v2r8(ix1,jx1);
+             dy11             = _fjsp_sub_v2r8(iy1,jy1);
+             dz11             = _fjsp_sub_v2r8(iz1,jz1);
+             dx12             = _fjsp_sub_v2r8(ix1,jx2);
+             dy12             = _fjsp_sub_v2r8(iy1,jy2);
+             dz12             = _fjsp_sub_v2r8(iz1,jz2);
+             dx20             = _fjsp_sub_v2r8(ix2,jx0);
+             dy20             = _fjsp_sub_v2r8(iy2,jy0);
+             dz20             = _fjsp_sub_v2r8(iz2,jz0);
+             dx21             = _fjsp_sub_v2r8(ix2,jx1);
+             dy21             = _fjsp_sub_v2r8(iy2,jy1);
+             dz21             = _fjsp_sub_v2r8(iz2,jz1);
+             dx22             = _fjsp_sub_v2r8(ix2,jx2);
+             dy22             = _fjsp_sub_v2r8(iy2,jy2);
+             dz22             = _fjsp_sub_v2r8(iz2,jz2);
+             /* Calculate squared distance and things based on it */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rsq01            = gmx_fjsp_calc_rsq_v2r8(dx01,dy01,dz01);
+             rsq02            = gmx_fjsp_calc_rsq_v2r8(dx02,dy02,dz02);
+             rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+             rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+             rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+             rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+             rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+             rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+             rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+             rinv01           = gmx_fjsp_invsqrt_v2r8(rsq01);
+             rinv02           = gmx_fjsp_invsqrt_v2r8(rsq02);
+             rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+             rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+             rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+             rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+             rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+             rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+             rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+             rinvsq01         = _fjsp_mul_v2r8(rinv01,rinv01);
+             rinvsq02         = _fjsp_mul_v2r8(rinv02,rinv02);
+             rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+             rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+             rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+             rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+             rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+             rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+             fjx0             = _fjsp_setzero_v2r8();
+             fjy0             = _fjsp_setzero_v2r8();
+             fjz0             = _fjsp_setzero_v2r8();
+             fjx1             = _fjsp_setzero_v2r8();
+             fjy1             = _fjsp_setzero_v2r8();
+             fjz1             = _fjsp_setzero_v2r8();
+             fjx2             = _fjsp_setzero_v2r8();
+             fjy2             = _fjsp_setzero_v2r8();
+             fjz2             = _fjsp_setzero_v2r8();
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+             {
+             /* REACTION-FIELD ELECTROSTATICS */
+             felec            = _fjsp_mul_v2r8(qq00,_fjsp_msub_v2r8(rinv00,rinvsq00,krf2));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             
+             fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq01,rcutoff2))
+             {
+             /* REACTION-FIELD ELECTROSTATICS */
+             felec            = _fjsp_mul_v2r8(qq01,_fjsp_msub_v2r8(rinv01,rinvsq01,krf2));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq01,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx01,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy01,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz01,fscal,fiz0);
+             
+             fjx1             = _fjsp_madd_v2r8(dx01,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy01,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz01,fscal,fjz1);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq02,rcutoff2))
+             {
+             /* REACTION-FIELD ELECTROSTATICS */
+             felec            = _fjsp_mul_v2r8(qq02,_fjsp_msub_v2r8(rinv02,rinvsq02,krf2));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq02,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx02,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy02,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz02,fscal,fiz0);
+             
+             fjx2             = _fjsp_madd_v2r8(dx02,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy02,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz02,fscal,fjz2);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq10,rcutoff2))
+             {
+             /* REACTION-FIELD ELECTROSTATICS */
+             felec            = _fjsp_mul_v2r8(qq10,_fjsp_msub_v2r8(rinv10,rinvsq10,krf2));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq10,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+             
+             fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq11,rcutoff2))
+             {
+             /* REACTION-FIELD ELECTROSTATICS */
+             felec            = _fjsp_mul_v2r8(qq11,_fjsp_msub_v2r8(rinv11,rinvsq11,krf2));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq11,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+             
+             fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq12,rcutoff2))
+             {
+             /* REACTION-FIELD ELECTROSTATICS */
+             felec            = _fjsp_mul_v2r8(qq12,_fjsp_msub_v2r8(rinv12,rinvsq12,krf2));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq12,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+             
+             fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq20,rcutoff2))
+             {
+             /* REACTION-FIELD ELECTROSTATICS */
+             felec            = _fjsp_mul_v2r8(qq20,_fjsp_msub_v2r8(rinv20,rinvsq20,krf2));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq20,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+             
+             fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq21,rcutoff2))
+             {
+             /* REACTION-FIELD ELECTROSTATICS */
+             felec            = _fjsp_mul_v2r8(qq21,_fjsp_msub_v2r8(rinv21,rinvsq21,krf2));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq21,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+             
+             fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq22,rcutoff2))
+             {
+             /* REACTION-FIELD ELECTROSTATICS */
+             felec            = _fjsp_mul_v2r8(qq22,_fjsp_msub_v2r8(rinv22,rinvsq22,krf2));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq22,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+             
+             fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+             }
+             gmx_fjsp_decrement_3rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+             /* Inner loop uses 297 flops */
+         }
+         if(jidx<j_index_end)
+         {
+             jnrA             = jjnr[jidx];
+             j_coord_offsetA  = DIM*jnrA;
+             /* load j atom coordinates */
+             gmx_fjsp_load_3rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+             /* Calculate displacement vector */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             dx01             = _fjsp_sub_v2r8(ix0,jx1);
+             dy01             = _fjsp_sub_v2r8(iy0,jy1);
+             dz01             = _fjsp_sub_v2r8(iz0,jz1);
+             dx02             = _fjsp_sub_v2r8(ix0,jx2);
+             dy02             = _fjsp_sub_v2r8(iy0,jy2);
+             dz02             = _fjsp_sub_v2r8(iz0,jz2);
+             dx10             = _fjsp_sub_v2r8(ix1,jx0);
+             dy10             = _fjsp_sub_v2r8(iy1,jy0);
+             dz10             = _fjsp_sub_v2r8(iz1,jz0);
+             dx11             = _fjsp_sub_v2r8(ix1,jx1);
+             dy11             = _fjsp_sub_v2r8(iy1,jy1);
+             dz11             = _fjsp_sub_v2r8(iz1,jz1);
+             dx12             = _fjsp_sub_v2r8(ix1,jx2);
+             dy12             = _fjsp_sub_v2r8(iy1,jy2);
+             dz12             = _fjsp_sub_v2r8(iz1,jz2);
+             dx20             = _fjsp_sub_v2r8(ix2,jx0);
+             dy20             = _fjsp_sub_v2r8(iy2,jy0);
+             dz20             = _fjsp_sub_v2r8(iz2,jz0);
+             dx21             = _fjsp_sub_v2r8(ix2,jx1);
+             dy21             = _fjsp_sub_v2r8(iy2,jy1);
+             dz21             = _fjsp_sub_v2r8(iz2,jz1);
+             dx22             = _fjsp_sub_v2r8(ix2,jx2);
+             dy22             = _fjsp_sub_v2r8(iy2,jy2);
+             dz22             = _fjsp_sub_v2r8(iz2,jz2);
+             /* Calculate squared distance and things based on it */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rsq01            = gmx_fjsp_calc_rsq_v2r8(dx01,dy01,dz01);
+             rsq02            = gmx_fjsp_calc_rsq_v2r8(dx02,dy02,dz02);
+             rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+             rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+             rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+             rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+             rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+             rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+             rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+             rinv01           = gmx_fjsp_invsqrt_v2r8(rsq01);
+             rinv02           = gmx_fjsp_invsqrt_v2r8(rsq02);
+             rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+             rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+             rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+             rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+             rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+             rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+             rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+             rinvsq01         = _fjsp_mul_v2r8(rinv01,rinv01);
+             rinvsq02         = _fjsp_mul_v2r8(rinv02,rinv02);
+             rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+             rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+             rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+             rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+             rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+             rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+             fjx0             = _fjsp_setzero_v2r8();
+             fjy0             = _fjsp_setzero_v2r8();
+             fjz0             = _fjsp_setzero_v2r8();
+             fjx1             = _fjsp_setzero_v2r8();
+             fjy1             = _fjsp_setzero_v2r8();
+             fjz1             = _fjsp_setzero_v2r8();
+             fjx2             = _fjsp_setzero_v2r8();
+             fjy2             = _fjsp_setzero_v2r8();
+             fjz2             = _fjsp_setzero_v2r8();
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+             {
+             /* REACTION-FIELD ELECTROSTATICS */
+             felec            = _fjsp_mul_v2r8(qq00,_fjsp_msub_v2r8(rinv00,rinvsq00,krf2));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             
+             fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq01,rcutoff2))
+             {
+             /* REACTION-FIELD ELECTROSTATICS */
+             felec            = _fjsp_mul_v2r8(qq01,_fjsp_msub_v2r8(rinv01,rinvsq01,krf2));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq01,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx01,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy01,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz01,fscal,fiz0);
+             
+             fjx1             = _fjsp_madd_v2r8(dx01,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy01,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz01,fscal,fjz1);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq02,rcutoff2))
+             {
+             /* REACTION-FIELD ELECTROSTATICS */
+             felec            = _fjsp_mul_v2r8(qq02,_fjsp_msub_v2r8(rinv02,rinvsq02,krf2));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq02,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx02,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy02,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz02,fscal,fiz0);
+             
+             fjx2             = _fjsp_madd_v2r8(dx02,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy02,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz02,fscal,fjz2);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq10,rcutoff2))
+             {
+             /* REACTION-FIELD ELECTROSTATICS */
+             felec            = _fjsp_mul_v2r8(qq10,_fjsp_msub_v2r8(rinv10,rinvsq10,krf2));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq10,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+             
+             fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq11,rcutoff2))
+             {
+             /* REACTION-FIELD ELECTROSTATICS */
+             felec            = _fjsp_mul_v2r8(qq11,_fjsp_msub_v2r8(rinv11,rinvsq11,krf2));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq11,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+             
+             fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq12,rcutoff2))
+             {
+             /* REACTION-FIELD ELECTROSTATICS */
+             felec            = _fjsp_mul_v2r8(qq12,_fjsp_msub_v2r8(rinv12,rinvsq12,krf2));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq12,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+             
+             fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq20,rcutoff2))
+             {
+             /* REACTION-FIELD ELECTROSTATICS */
+             felec            = _fjsp_mul_v2r8(qq20,_fjsp_msub_v2r8(rinv20,rinvsq20,krf2));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq20,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+             
+             fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq21,rcutoff2))
+             {
+             /* REACTION-FIELD ELECTROSTATICS */
+             felec            = _fjsp_mul_v2r8(qq21,_fjsp_msub_v2r8(rinv21,rinvsq21,krf2));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq21,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+             
+             fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq22,rcutoff2))
+             {
+             /* REACTION-FIELD ELECTROSTATICS */
+             felec            = _fjsp_mul_v2r8(qq22,_fjsp_msub_v2r8(rinv22,rinvsq22,krf2));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq22,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+             
+             fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+             }
+             gmx_fjsp_decrement_3rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+             /* Inner loop uses 297 flops */
+         }
+         /* End of innermost loop */
+         gmx_fjsp_update_iforce_3atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                               f+i_coord_offset,fshift+i_shift_offset);
+         /* Increment number of inner iterations */
+         inneriter                  += j_index_end - j_index_start;
+         /* Outer loop uses 18 flops */
+     }
+     /* Increment number of outer iterations */
+     outeriter        += nri;
+     /* Update outer/inner flops */
+     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3W3_F,outeriter*18 + inneriter*297);
+ }
index 0000000000000000000000000000000000000000,4896f8f5054d8c99ad2167f12cd723beba7faaf4..4896f8f5054d8c99ad2167f12cd723beba7faaf4
mode 000000,100644..100644
--- /dev/null
@@@ -1,0 -1,916 +1,916 @@@
+ /*
+  * This file is part of the GROMACS molecular simulation package.
+  *
+  * Copyright (c) 2012, by the GROMACS development team, led by
+  * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+  * others, as listed in the AUTHORS file in the top-level source
+  * directory and at http://www.gromacs.org.
+  *
+  * GROMACS is free software; you can redistribute it and/or
+  * modify it under the terms of the GNU Lesser General Public License
+  * as published by the Free Software Foundation; either version 2.1
+  * of the License, or (at your option) any later version.
+  *
+  * GROMACS is distributed in the hope that it will be useful,
+  * but WITHOUT ANY WARRANTY; without even the implied warranty of
+  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  * Lesser General Public License for more details.
+  *
+  * You should have received a copy of the GNU Lesser General Public
+  * License along with GROMACS; if not, see
+  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+  *
+  * If you want to redistribute modifications to GROMACS, please
+  * consider that scientific software is very special. Version
+  * control is crucial - bugs must be traceable. We will be happy to
+  * consider code for inclusion in the official distribution, but
+  * derived work must not be called official GROMACS. Details are found
+  * in the README & COPYING files - if they are missing, get the
+  * official version at http://www.gromacs.org.
+  *
+  * To help us fund GROMACS development, we humbly ask that you cite
+  * the research papers on the package. Check out http://www.gromacs.org.
+  */
+ /*
+  * Note: this file was generated by the GROMACS sparc64_hpc_ace_double kernel generator.
+  */
+ #ifdef HAVE_CONFIG_H
+ #include <config.h>
+ #endif
+ #include <math.h>
+ #include "../nb_kernel.h"
+ #include "types/simple.h"
+ #include "vec.h"
+ #include "nrnb.h"
+ #include "kernelutil_sparc64_hpc_ace_double.h"
+ /*
+  * Gromacs nonbonded kernel:   nb_kernel_ElecRFCut_VdwNone_GeomW4P1_VF_sparc64_hpc_ace_double
+  * Electrostatics interaction: ReactionField
+  * VdW interaction:            None
+  * Geometry:                   Water4-Particle
+  * Calculate force/pot:        PotentialAndForce
+  */
+ void
+ nb_kernel_ElecRFCut_VdwNone_GeomW4P1_VF_sparc64_hpc_ace_double
+                     (t_nblist * gmx_restrict                nlist,
+                      rvec * gmx_restrict                    xx,
+                      rvec * gmx_restrict                    ff,
+                      t_forcerec * gmx_restrict              fr,
+                      t_mdatoms * gmx_restrict               mdatoms,
+                      nb_kernel_data_t * gmx_restrict        kernel_data,
+                      t_nrnb * gmx_restrict                  nrnb)
+ {
+     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+      * just 0 for non-waters.
+      * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+      * jnr indices corresponding to data put in the two positions in the SIMD register.
+      */
+     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+     int              jnrA,jnrB;
+     int              j_coord_offsetA,j_coord_offsetB;
+     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+     real             rcutoff_scalar;
+     real             *shiftvec,*fshift,*x,*f;
+     _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+     int              vdwioffset1;
+     _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+     int              vdwioffset2;
+     _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+     int              vdwioffset3;
+     _fjsp_v2r8       ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+     int              vdwjidx0A,vdwjidx0B;
+     _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+     _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+     _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+     _fjsp_v2r8       dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30;
+     _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+     real             *charge;
+     _fjsp_v2r8       itab_tmp;
+     _fjsp_v2r8       dummy_mask,cutoff_mask;
+     _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+     _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+     union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+     x                = xx[0];
+     f                = ff[0];
+     nri              = nlist->nri;
+     iinr             = nlist->iinr;
+     jindex           = nlist->jindex;
+     jjnr             = nlist->jjnr;
+     shiftidx         = nlist->shift;
+     gid              = nlist->gid;
+     shiftvec         = fr->shift_vec[0];
+     fshift           = fr->fshift[0];
+     facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+     charge           = mdatoms->chargeA;
+     krf              = gmx_fjsp_set1_v2r8(fr->ic->k_rf);
+     krf2             = gmx_fjsp_set1_v2r8(fr->ic->k_rf*2.0);
+     crf              = gmx_fjsp_set1_v2r8(fr->ic->c_rf);
+     /* Setup water-specific parameters */
+     inr              = nlist->iinr[0];
+     iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+     iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+     iq3              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+3]));
+     /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+     rcutoff_scalar   = fr->rcoulomb;
+     rcutoff          = gmx_fjsp_set1_v2r8(rcutoff_scalar);
+     rcutoff2         = _fjsp_mul_v2r8(rcutoff,rcutoff);
+     /* Avoid stupid compiler warnings */
+     jnrA = jnrB = 0;
+     j_coord_offsetA = 0;
+     j_coord_offsetB = 0;
+     outeriter        = 0;
+     inneriter        = 0;
+     /* Start outer loop over neighborlists */
+     for(iidx=0; iidx<nri; iidx++)
+     {
+         /* Load shift vector for this list */
+         i_shift_offset   = DIM*shiftidx[iidx];
+         /* Load limits for loop over neighbors */
+         j_index_start    = jindex[iidx];
+         j_index_end      = jindex[iidx+1];
+         /* Get outer coordinate index */
+         inr              = iinr[iidx];
+         i_coord_offset   = DIM*inr;
+         /* Load i particle coords and add shift vector */
+         gmx_fjsp_load_shift_and_3rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset+DIM,
+                                                  &ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+         fix1             = _fjsp_setzero_v2r8();
+         fiy1             = _fjsp_setzero_v2r8();
+         fiz1             = _fjsp_setzero_v2r8();
+         fix2             = _fjsp_setzero_v2r8();
+         fiy2             = _fjsp_setzero_v2r8();
+         fiz2             = _fjsp_setzero_v2r8();
+         fix3             = _fjsp_setzero_v2r8();
+         fiy3             = _fjsp_setzero_v2r8();
+         fiz3             = _fjsp_setzero_v2r8();
+         /* Reset potential sums */
+         velecsum         = _fjsp_setzero_v2r8();
+         /* Start inner kernel loop */
+         for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+         {
+             /* Get j neighbor index, and coordinate index */
+             jnrA             = jjnr[jidx];
+             jnrB             = jjnr[jidx+1];
+             j_coord_offsetA  = DIM*jnrA;
+             j_coord_offsetB  = DIM*jnrB;
+             /* load j atom coordinates */
+             gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                               &jx0,&jy0,&jz0);
+             /* Calculate displacement vector */
+             dx10             = _fjsp_sub_v2r8(ix1,jx0);
+             dy10             = _fjsp_sub_v2r8(iy1,jy0);
+             dz10             = _fjsp_sub_v2r8(iz1,jz0);
+             dx20             = _fjsp_sub_v2r8(ix2,jx0);
+             dy20             = _fjsp_sub_v2r8(iy2,jy0);
+             dz20             = _fjsp_sub_v2r8(iz2,jz0);
+             dx30             = _fjsp_sub_v2r8(ix3,jx0);
+             dy30             = _fjsp_sub_v2r8(iy3,jy0);
+             dz30             = _fjsp_sub_v2r8(iz3,jz0);
+             /* Calculate squared distance and things based on it */
+             rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+             rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+             rsq30            = gmx_fjsp_calc_rsq_v2r8(dx30,dy30,dz30);
+             rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+             rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+             rinv30           = gmx_fjsp_invsqrt_v2r8(rsq30);
+             rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+             rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+             rinvsq30         = _fjsp_mul_v2r8(rinv30,rinv30);
+             /* Load parameters for j particles */
+             jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+             fjx0             = _fjsp_setzero_v2r8();
+             fjy0             = _fjsp_setzero_v2r8();
+             fjz0             = _fjsp_setzero_v2r8();
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq10,rcutoff2))
+             {
+             /* Compute parameters for interactions between i and j atoms */
+             qq10             = _fjsp_mul_v2r8(iq1,jq0);
+             /* REACTION-FIELD ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq10,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq10,rinv10),crf));
+             felec            = _fjsp_mul_v2r8(qq10,_fjsp_msub_v2r8(rinv10,rinvsq10,krf2));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq10,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+             
+             fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq20,rcutoff2))
+             {
+             /* Compute parameters for interactions between i and j atoms */
+             qq20             = _fjsp_mul_v2r8(iq2,jq0);
+             /* REACTION-FIELD ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq20,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq20,rinv20),crf));
+             felec            = _fjsp_mul_v2r8(qq20,_fjsp_msub_v2r8(rinv20,rinvsq20,krf2));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq20,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+             
+             fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq30,rcutoff2))
+             {
+             /* Compute parameters for interactions between i and j atoms */
+             qq30             = _fjsp_mul_v2r8(iq3,jq0);
+             /* REACTION-FIELD ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq30,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq30,rinv30),crf));
+             felec            = _fjsp_mul_v2r8(qq30,_fjsp_msub_v2r8(rinv30,rinvsq30,krf2));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq30,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix3             = _fjsp_madd_v2r8(dx30,fscal,fix3);
+             fiy3             = _fjsp_madd_v2r8(dy30,fscal,fiy3);
+             fiz3             = _fjsp_madd_v2r8(dz30,fscal,fiz3);
+             
+             fjx0             = _fjsp_madd_v2r8(dx30,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy30,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz30,fscal,fjz0);
+             }
+             gmx_fjsp_decrement_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0);
+             /* Inner loop uses 120 flops */
+         }
+         if(jidx<j_index_end)
+         {
+             jnrA             = jjnr[jidx];
+             j_coord_offsetA  = DIM*jnrA;
+             /* load j atom coordinates */
+             gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                               &jx0,&jy0,&jz0);
+             /* Calculate displacement vector */
+             dx10             = _fjsp_sub_v2r8(ix1,jx0);
+             dy10             = _fjsp_sub_v2r8(iy1,jy0);
+             dz10             = _fjsp_sub_v2r8(iz1,jz0);
+             dx20             = _fjsp_sub_v2r8(ix2,jx0);
+             dy20             = _fjsp_sub_v2r8(iy2,jy0);
+             dz20             = _fjsp_sub_v2r8(iz2,jz0);
+             dx30             = _fjsp_sub_v2r8(ix3,jx0);
+             dy30             = _fjsp_sub_v2r8(iy3,jy0);
+             dz30             = _fjsp_sub_v2r8(iz3,jz0);
+             /* Calculate squared distance and things based on it */
+             rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+             rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+             rsq30            = gmx_fjsp_calc_rsq_v2r8(dx30,dy30,dz30);
+             rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+             rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+             rinv30           = gmx_fjsp_invsqrt_v2r8(rsq30);
+             rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+             rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+             rinvsq30         = _fjsp_mul_v2r8(rinv30,rinv30);
+             /* Load parameters for j particles */
+             jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
+             fjx0             = _fjsp_setzero_v2r8();
+             fjy0             = _fjsp_setzero_v2r8();
+             fjz0             = _fjsp_setzero_v2r8();
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq10,rcutoff2))
+             {
+             /* Compute parameters for interactions between i and j atoms */
+             qq10             = _fjsp_mul_v2r8(iq1,jq0);
+             /* REACTION-FIELD ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq10,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq10,rinv10),crf));
+             felec            = _fjsp_mul_v2r8(qq10,_fjsp_msub_v2r8(rinv10,rinvsq10,krf2));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq10,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+             
+             fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq20,rcutoff2))
+             {
+             /* Compute parameters for interactions between i and j atoms */
+             qq20             = _fjsp_mul_v2r8(iq2,jq0);
+             /* REACTION-FIELD ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq20,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq20,rinv20),crf));
+             felec            = _fjsp_mul_v2r8(qq20,_fjsp_msub_v2r8(rinv20,rinvsq20,krf2));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq20,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+             
+             fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq30,rcutoff2))
+             {
+             /* Compute parameters for interactions between i and j atoms */
+             qq30             = _fjsp_mul_v2r8(iq3,jq0);
+             /* REACTION-FIELD ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq30,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq30,rinv30),crf));
+             felec            = _fjsp_mul_v2r8(qq30,_fjsp_msub_v2r8(rinv30,rinvsq30,krf2));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq30,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix3             = _fjsp_madd_v2r8(dx30,fscal,fix3);
+             fiy3             = _fjsp_madd_v2r8(dy30,fscal,fiy3);
+             fiz3             = _fjsp_madd_v2r8(dz30,fscal,fiz3);
+             
+             fjx0             = _fjsp_madd_v2r8(dx30,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy30,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz30,fscal,fjz0);
+             }
+             gmx_fjsp_decrement_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0);
+             /* Inner loop uses 120 flops */
+         }
+         /* End of innermost loop */
+         gmx_fjsp_update_iforce_3atom_swizzle_v2r8(fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                               f+i_coord_offset+DIM,fshift+i_shift_offset);
+         ggid                        = gid[iidx];
+         /* Update potential energies */
+         gmx_fjsp_update_1pot_v2r8(velecsum,kernel_data->energygrp_elec+ggid);
+         /* Increment number of inner iterations */
+         inneriter                  += j_index_end - j_index_start;
+         /* Outer loop uses 19 flops */
+     }
+     /* Increment number of outer iterations */
+     outeriter        += nri;
+     /* Update outer/inner flops */
+     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4_VF,outeriter*19 + inneriter*120);
+ }
+ /*
+  * Gromacs nonbonded kernel:   nb_kernel_ElecRFCut_VdwNone_GeomW4P1_F_sparc64_hpc_ace_double
+  * Electrostatics interaction: ReactionField
+  * VdW interaction:            None
+  * Geometry:                   Water4-Particle
+  * Calculate force/pot:        Force
+  *
+  * Force-only kernel. For each i entry of the neighborlist, three i sites
+  * (coordinates and charges at inr+1..inr+3; the site at inr itself is never
+  * loaded here) interact with every j particle of that entry. The scalar force
+  * of each pair is AND-ed with the mask from _fjsp_cmplt_v2r8(rsq,rcutoff2),
+  * where rcutoff2 is fr->rcoulomb squared. No potential is accumulated.
+  *
+  * nlist       - neighborlist; nri, iinr, jindex, jjnr, shift and gid are read.
+  * xx          - coordinates, addressed as a flat real array with DIM reals per atom.
+  * ff          - forces, same addressing as xx; updated through the gmx_fjsp_* helpers.
+  * fr          - supplies shift_vec, fshift, epsfac, rcoulomb and ic->k_rf / ic->c_rf.
+  * mdatoms     - supplies chargeA.
+  * kernel_data - not referenced by this kernel.
+  * nrnb        - flop counter, handed to inc_nrnb() at the end.
+  */
+ void
+ nb_kernel_ElecRFCut_VdwNone_GeomW4P1_F_sparc64_hpc_ace_double
+                     (t_nblist * gmx_restrict                nlist,
+                      rvec * gmx_restrict                    xx,
+                      rvec * gmx_restrict                    ff,
+                      t_forcerec * gmx_restrict              fr,
+                      t_mdatoms * gmx_restrict               mdatoms,
+                      nb_kernel_data_t * gmx_restrict        kernel_data,
+                      t_nrnb * gmx_restrict                  nrnb)
+ {
+     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+      * just 0 for non-waters.
+      * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+      * jnr indices corresponding to data put in the two positions in the SIMD register.
+      */
+     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+     int              jnrA,jnrB;
+     int              j_coord_offsetA,j_coord_offsetB;
+     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+     real             rcutoff_scalar;
+     real             *shiftvec,*fshift,*x,*f;
+     _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+     int              vdwioffset1;
+     _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+     int              vdwioffset2;
+     _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+     int              vdwioffset3;
+     _fjsp_v2r8       ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+     int              vdwjidx0A,vdwjidx0B;
+     _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+     _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+     _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+     _fjsp_v2r8       dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30;
+     _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+     real             *charge;
+     _fjsp_v2r8       itab_tmp;
+     _fjsp_v2r8       dummy_mask,cutoff_mask;
+     _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+     _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+     union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+     /* View the rvec arrays as flat arrays of reals */
+     x                = xx[0];
+     f                = ff[0];
+     nri              = nlist->nri;
+     iinr             = nlist->iinr;
+     jindex           = nlist->jindex;
+     jjnr             = nlist->jjnr;
+     shiftidx         = nlist->shift;
+     gid              = nlist->gid;
+     shiftvec         = fr->shift_vec[0];
+     fshift           = fr->fshift[0];
+     facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+     charge           = mdatoms->chargeA;
+     /* krf and crf are set here but never read below; the force expression only uses krf2 */
+     krf              = gmx_fjsp_set1_v2r8(fr->ic->k_rf);
+     krf2             = gmx_fjsp_set1_v2r8(fr->ic->k_rf*2.0);
+     crf              = gmx_fjsp_set1_v2r8(fr->ic->c_rf);
+     /* Setup water-specific parameters.
+      * The i charges are taken once from the first list entry (iinr[0]) and reused for
+      * every i entry in the outer loop; iinr[0] is read unconditionally, also when nri is 0.
+      * NOTE(review): presumably all i waters in one list carry identical charges - confirm.
+      */
+     inr              = nlist->iinr[0];
+     iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+     iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+     iq3              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+3]));
+     /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+     rcutoff_scalar   = fr->rcoulomb;
+     rcutoff          = gmx_fjsp_set1_v2r8(rcutoff_scalar);
+     rcutoff2         = _fjsp_mul_v2r8(rcutoff,rcutoff);
+     /* Give these a defined value up front to avoid compiler warnings */
+     jnrA = jnrB = 0;
+     j_coord_offsetA = 0;
+     j_coord_offsetB = 0;
+     outeriter        = 0;
+     inneriter        = 0;
+     /* Start outer loop over neighborlists */
+     for(iidx=0; iidx<nri; iidx++)
+     {
+         /* Load shift vector for this list */
+         i_shift_offset   = DIM*shiftidx[iidx];
+         /* Load limits for loop over neighbors */
+         j_index_start    = jindex[iidx];
+         j_index_end      = jindex[iidx+1];
+         /* Get outer coordinate index */
+         inr              = iinr[iidx];
+         i_coord_offset   = DIM*inr;
+         /* Load i particle coords and add shift vector.
+          * The +DIM offset starts the load at atom inr+1, i.e. the first atom of the i water is skipped.
+          */
+         gmx_fjsp_load_shift_and_3rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset+DIM,
+                                                  &ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+         /* Reset the i force accumulators for this list entry */
+         fix1             = _fjsp_setzero_v2r8();
+         fiy1             = _fjsp_setzero_v2r8();
+         fiz1             = _fjsp_setzero_v2r8();
+         fix2             = _fjsp_setzero_v2r8();
+         fiy2             = _fjsp_setzero_v2r8();
+         fiz2             = _fjsp_setzero_v2r8();
+         fix3             = _fjsp_setzero_v2r8();
+         fiy3             = _fjsp_setzero_v2r8();
+         fiz3             = _fjsp_setzero_v2r8();
+         /* Start inner kernel loop: two j particles (A and B) per iteration.
+          * A leftover single j entry is handled after this loop.
+          */
+         for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+         {
+             /* Get j neighbor index, and coordinate index */
+             jnrA             = jjnr[jidx];
+             jnrB             = jjnr[jidx+1];
+             j_coord_offsetA  = DIM*jnrA;
+             j_coord_offsetB  = DIM*jnrB;
+             /* load j atom coordinates */
+             gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                               &jx0,&jy0,&jz0);
+             /* Calculate displacement vector */
+             dx10             = _fjsp_sub_v2r8(ix1,jx0);
+             dy10             = _fjsp_sub_v2r8(iy1,jy0);
+             dz10             = _fjsp_sub_v2r8(iz1,jz0);
+             dx20             = _fjsp_sub_v2r8(ix2,jx0);
+             dy20             = _fjsp_sub_v2r8(iy2,jy0);
+             dz20             = _fjsp_sub_v2r8(iz2,jz0);
+             dx30             = _fjsp_sub_v2r8(ix3,jx0);
+             dy30             = _fjsp_sub_v2r8(iy3,jy0);
+             dz30             = _fjsp_sub_v2r8(iz3,jz0);
+             /* Calculate squared distance and things based on it */
+             rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+             rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+             rsq30            = gmx_fjsp_calc_rsq_v2r8(dx30,dy30,dz30);
+             rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+             rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+             rinv30           = gmx_fjsp_invsqrt_v2r8(rsq30);
+             rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+             rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+             rinvsq30         = _fjsp_mul_v2r8(rinv30,rinv30);
+             /* Load parameters for j particles */
+             jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+             /* The j force is accumulated over the three i sites before being applied to f */
+             fjx0             = _fjsp_setzero_v2r8();
+             fjy0             = _fjsp_setzero_v2r8();
+             fjz0             = _fjsp_setzero_v2r8();
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* Skip the pair entirely unless the helper reports an element inside the cutoff */
+             if (gmx_fjsp_any_lt_v2r8(rsq10,rcutoff2))
+             {
+             /* Compute parameters for interactions between i and j atoms */
+             qq10             = _fjsp_mul_v2r8(iq1,jq0);
+             /* REACTION-FIELD ELECTROSTATICS
+              * Presumably felec = qq*(rinv*rinvsq - krf2), assuming _fjsp_msub_v2r8(a,b,c)
+              * evaluates a*b-c - TODO confirm against the intrinsic's definition.
+              */
+             felec            = _fjsp_mul_v2r8(qq10,_fjsp_msub_v2r8(rinv10,rinvsq10,krf2));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq10,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+             
+             fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq20,rcutoff2))
+             {
+             /* Compute parameters for interactions between i and j atoms */
+             qq20             = _fjsp_mul_v2r8(iq2,jq0);
+             /* REACTION-FIELD ELECTROSTATICS */
+             felec            = _fjsp_mul_v2r8(qq20,_fjsp_msub_v2r8(rinv20,rinvsq20,krf2));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq20,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+             
+             fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq30,rcutoff2))
+             {
+             /* Compute parameters for interactions between i and j atoms */
+             qq30             = _fjsp_mul_v2r8(iq3,jq0);
+             /* REACTION-FIELD ELECTROSTATICS */
+             felec            = _fjsp_mul_v2r8(qq30,_fjsp_msub_v2r8(rinv30,rinvsq30,krf2));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq30,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix3             = _fjsp_madd_v2r8(dx30,fscal,fix3);
+             fiy3             = _fjsp_madd_v2r8(dy30,fscal,fiy3);
+             fiz3             = _fjsp_madd_v2r8(dz30,fscal,fiz3);
+             
+             fjx0             = _fjsp_madd_v2r8(dx30,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy30,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz30,fscal,fjz0);
+             }
+             /* Apply the accumulated j force to both j particles in f */
+             gmx_fjsp_decrement_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0);
+             /* Inner loop uses 102 flops */
+         }
+         /* Odd number of j entries: process the last one on its own */
+         if(jidx<j_index_end)
+         {
+             jnrA             = jjnr[jidx];
+             j_coord_offsetA  = DIM*jnrA;
+             /* load j atom coordinates */
+             gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                               &jx0,&jy0,&jz0);
+             /* Calculate displacement vector */
+             dx10             = _fjsp_sub_v2r8(ix1,jx0);
+             dy10             = _fjsp_sub_v2r8(iy1,jy0);
+             dz10             = _fjsp_sub_v2r8(iz1,jz0);
+             dx20             = _fjsp_sub_v2r8(ix2,jx0);
+             dy20             = _fjsp_sub_v2r8(iy2,jy0);
+             dz20             = _fjsp_sub_v2r8(iz2,jz0);
+             dx30             = _fjsp_sub_v2r8(ix3,jx0);
+             dy30             = _fjsp_sub_v2r8(iy3,jy0);
+             dz30             = _fjsp_sub_v2r8(iz3,jz0);
+             /* Calculate squared distance and things based on it */
+             rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+             rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+             rsq30            = gmx_fjsp_calc_rsq_v2r8(dx30,dy30,dz30);
+             rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+             rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+             rinv30           = gmx_fjsp_invsqrt_v2r8(rsq30);
+             rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+             rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+             rinvsq30         = _fjsp_mul_v2r8(rinv30,rinv30);
+             /* Load parameters for j particles */
+             jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
+             fjx0             = _fjsp_setzero_v2r8();
+             fjy0             = _fjsp_setzero_v2r8();
+             fjz0             = _fjsp_setzero_v2r8();
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq10,rcutoff2))
+             {
+             /* Compute parameters for interactions between i and j atoms */
+             qq10             = _fjsp_mul_v2r8(iq1,jq0);
+             /* REACTION-FIELD ELECTROSTATICS */
+             felec            = _fjsp_mul_v2r8(qq10,_fjsp_msub_v2r8(rinv10,rinvsq10,krf2));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq10,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Extra step compared to the paired loop: presumably keeps the low element and
+              * zeroes the high one so the unused lane contributes no force - TODO confirm.
+              */
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+             
+             fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq20,rcutoff2))
+             {
+             /* Compute parameters for interactions between i and j atoms */
+             qq20             = _fjsp_mul_v2r8(iq2,jq0);
+             /* REACTION-FIELD ELECTROSTATICS */
+             felec            = _fjsp_mul_v2r8(qq20,_fjsp_msub_v2r8(rinv20,rinvsq20,krf2));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq20,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+             
+             fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq30,rcutoff2))
+             {
+             /* Compute parameters for interactions between i and j atoms */
+             qq30             = _fjsp_mul_v2r8(iq3,jq0);
+             /* REACTION-FIELD ELECTROSTATICS */
+             felec            = _fjsp_mul_v2r8(qq30,_fjsp_msub_v2r8(rinv30,rinvsq30,krf2));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq30,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix3             = _fjsp_madd_v2r8(dx30,fscal,fix3);
+             fiy3             = _fjsp_madd_v2r8(dy30,fscal,fiy3);
+             fiz3             = _fjsp_madd_v2r8(dz30,fscal,fiz3);
+             
+             fjx0             = _fjsp_madd_v2r8(dx30,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy30,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz30,fscal,fjz0);
+             }
+             gmx_fjsp_decrement_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0);
+             /* Inner loop uses 102 flops */
+         }
+         /* End of innermost loop */
+         /* Hand the three i-site force accumulators to the helper together with the force and
+          * shift-force destinations; the +DIM offset again addresses atom inr+1.
+          */
+         gmx_fjsp_update_iforce_3atom_swizzle_v2r8(fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                               f+i_coord_offset+DIM,fshift+i_shift_offset);
+         /* Increment number of inner iterations */
+         inneriter                  += j_index_end - j_index_start;
+         /* Outer loop uses 18 flops */
+     }
+     /* Increment number of outer iterations */
+     outeriter        += nri;
+     /* Update outer/inner flops */
+     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4_F,outeriter*18 + inneriter*102);
+ }
index 0000000000000000000000000000000000000000,7d929497c45d0c91685278765e655b7ebff472a4..7d929497c45d0c91685278765e655b7ebff472a4
mode 000000,100644..100644
--- /dev/null
@@@ -1,0 -1,1820 +1,1820 @@@
+ /*
+  * This file is part of the GROMACS molecular simulation package.
+  *
+  * Copyright (c) 2012, by the GROMACS development team, led by
+  * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+  * others, as listed in the AUTHORS file in the top-level source
+  * directory and at http://www.gromacs.org.
+  *
+  * GROMACS is free software; you can redistribute it and/or
+  * modify it under the terms of the GNU Lesser General Public License
+  * as published by the Free Software Foundation; either version 2.1
+  * of the License, or (at your option) any later version.
+  *
+  * GROMACS is distributed in the hope that it will be useful,
+  * but WITHOUT ANY WARRANTY; without even the implied warranty of
+  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  * Lesser General Public License for more details.
+  *
+  * You should have received a copy of the GNU Lesser General Public
+  * License along with GROMACS; if not, see
+  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+  *
+  * If you want to redistribute modifications to GROMACS, please
+  * consider that scientific software is very special. Version
+  * control is crucial - bugs must be traceable. We will be happy to
+  * consider code for inclusion in the official distribution, but
+  * derived work must not be called official GROMACS. Details are found
+  * in the README & COPYING files - if they are missing, get the
+  * official version at http://www.gromacs.org.
+  *
+  * To help us fund GROMACS development, we humbly ask that you cite
+  * the research papers on the package. Check out http://www.gromacs.org.
+  */
+ /*
+  * Note: this file was generated by the GROMACS sparc64_hpc_ace_double kernel generator.
+  */
+ #ifdef HAVE_CONFIG_H
+ #include <config.h>
+ #endif
+ #include <math.h>
+ #include "../nb_kernel.h"
+ #include "types/simple.h"
+ #include "vec.h"
+ #include "nrnb.h"
+ #include "kernelutil_sparc64_hpc_ace_double.h"
+ /*
+  * Gromacs nonbonded kernel:   nb_kernel_ElecRFCut_VdwNone_GeomW4W4_VF_sparc64_hpc_ace_double
+  * Electrostatics interaction: ReactionField
+  * VdW interaction:            None
+  * Geometry:                   Water4-Water4
+  * Calculate force/pot:        PotentialAndForce
+  */
+ void
+ nb_kernel_ElecRFCut_VdwNone_GeomW4W4_VF_sparc64_hpc_ace_double
+                     (t_nblist * gmx_restrict                nlist,
+                      rvec * gmx_restrict                    xx,
+                      rvec * gmx_restrict                    ff,
+                      t_forcerec * gmx_restrict              fr,
+                      t_mdatoms * gmx_restrict               mdatoms,
+                      nb_kernel_data_t * gmx_restrict        kernel_data,
+                      t_nrnb * gmx_restrict                  nrnb)
+ {
+     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+      * just 0 for non-waters.
+      * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+      * jnr indices corresponding to data put in the two positions in the SIMD register.
+      */
+     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+     int              jnrA,jnrB;
+     int              j_coord_offsetA,j_coord_offsetB;
+     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+     real             rcutoff_scalar;
+     real             *shiftvec,*fshift,*x,*f;
+     _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+     int              vdwioffset1;
+     _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+     int              vdwioffset2;
+     _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+     int              vdwioffset3;
+     _fjsp_v2r8       ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+     int              vdwjidx1A,vdwjidx1B;
+     _fjsp_v2r8       jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+     int              vdwjidx2A,vdwjidx2B;
+     _fjsp_v2r8       jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+     int              vdwjidx3A,vdwjidx3B;
+     _fjsp_v2r8       jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
+     _fjsp_v2r8       dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+     _fjsp_v2r8       dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+     _fjsp_v2r8       dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
+     _fjsp_v2r8       dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+     _fjsp_v2r8       dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+     _fjsp_v2r8       dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
+     _fjsp_v2r8       dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
+     _fjsp_v2r8       dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
+     _fjsp_v2r8       dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
+     _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+     real             *charge;
+     _fjsp_v2r8       itab_tmp;
+     _fjsp_v2r8       dummy_mask,cutoff_mask;
+     _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+     _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+     union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+     x                = xx[0];
+     f                = ff[0];
+     nri              = nlist->nri;
+     iinr             = nlist->iinr;
+     jindex           = nlist->jindex;
+     jjnr             = nlist->jjnr;
+     shiftidx         = nlist->shift;
+     gid              = nlist->gid;
+     shiftvec         = fr->shift_vec[0];
+     fshift           = fr->fshift[0];
+     facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+     charge           = mdatoms->chargeA;
+     krf              = gmx_fjsp_set1_v2r8(fr->ic->k_rf);
+     krf2             = gmx_fjsp_set1_v2r8(fr->ic->k_rf*2.0);
+     crf              = gmx_fjsp_set1_v2r8(fr->ic->c_rf);
+     /* Setup water-specific parameters */
+     inr              = nlist->iinr[0];
+     iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+     iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+     iq3              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+3]));
+     jq1              = gmx_fjsp_set1_v2r8(charge[inr+1]);
+     jq2              = gmx_fjsp_set1_v2r8(charge[inr+2]);
+     jq3              = gmx_fjsp_set1_v2r8(charge[inr+3]);
+     qq11             = _fjsp_mul_v2r8(iq1,jq1);
+     qq12             = _fjsp_mul_v2r8(iq1,jq2);
+     qq13             = _fjsp_mul_v2r8(iq1,jq3);
+     qq21             = _fjsp_mul_v2r8(iq2,jq1);
+     qq22             = _fjsp_mul_v2r8(iq2,jq2);
+     qq23             = _fjsp_mul_v2r8(iq2,jq3);
+     qq31             = _fjsp_mul_v2r8(iq3,jq1);
+     qq32             = _fjsp_mul_v2r8(iq3,jq2);
+     qq33             = _fjsp_mul_v2r8(iq3,jq3);
+     /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+     rcutoff_scalar   = fr->rcoulomb;
+     rcutoff          = gmx_fjsp_set1_v2r8(rcutoff_scalar);
+     rcutoff2         = _fjsp_mul_v2r8(rcutoff,rcutoff);
+     /* Avoid stupid compiler warnings */
+     jnrA = jnrB = 0;
+     j_coord_offsetA = 0;
+     j_coord_offsetB = 0;
+     outeriter        = 0;
+     inneriter        = 0;
+     /* Start outer loop over neighborlists */
+     for(iidx=0; iidx<nri; iidx++)
+     {
+         /* Load shift vector for this list */
+         i_shift_offset   = DIM*shiftidx[iidx];
+         /* Load limits for loop over neighbors */
+         j_index_start    = jindex[iidx];
+         j_index_end      = jindex[iidx+1];
+         /* Get outer coordinate index */
+         inr              = iinr[iidx];
+         i_coord_offset   = DIM*inr;
+         /* Load i particle coords and add shift vector */
+         gmx_fjsp_load_shift_and_3rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset+DIM,
+                                                  &ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+         fix1             = _fjsp_setzero_v2r8();
+         fiy1             = _fjsp_setzero_v2r8();
+         fiz1             = _fjsp_setzero_v2r8();
+         fix2             = _fjsp_setzero_v2r8();
+         fiy2             = _fjsp_setzero_v2r8();
+         fiz2             = _fjsp_setzero_v2r8();
+         fix3             = _fjsp_setzero_v2r8();
+         fiy3             = _fjsp_setzero_v2r8();
+         fiz3             = _fjsp_setzero_v2r8();
+         /* Reset potential sums */
+         velecsum         = _fjsp_setzero_v2r8();
+         /* Start inner kernel loop */
+         for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+         {
+             /* Get j neighbor index, and coordinate index */
+             jnrA             = jjnr[jidx];
+             jnrB             = jjnr[jidx+1];
+             j_coord_offsetA  = DIM*jnrA;
+             j_coord_offsetB  = DIM*jnrB;
+             /* load j atom coordinates */
+             gmx_fjsp_load_3rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA+DIM,x+j_coord_offsetB+DIM,
+                                               &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
+             /* Calculate displacement vector */
+             dx11             = _fjsp_sub_v2r8(ix1,jx1);
+             dy11             = _fjsp_sub_v2r8(iy1,jy1);
+             dz11             = _fjsp_sub_v2r8(iz1,jz1);
+             dx12             = _fjsp_sub_v2r8(ix1,jx2);
+             dy12             = _fjsp_sub_v2r8(iy1,jy2);
+             dz12             = _fjsp_sub_v2r8(iz1,jz2);
+             dx13             = _fjsp_sub_v2r8(ix1,jx3);
+             dy13             = _fjsp_sub_v2r8(iy1,jy3);
+             dz13             = _fjsp_sub_v2r8(iz1,jz3);
+             dx21             = _fjsp_sub_v2r8(ix2,jx1);
+             dy21             = _fjsp_sub_v2r8(iy2,jy1);
+             dz21             = _fjsp_sub_v2r8(iz2,jz1);
+             dx22             = _fjsp_sub_v2r8(ix2,jx2);
+             dy22             = _fjsp_sub_v2r8(iy2,jy2);
+             dz22             = _fjsp_sub_v2r8(iz2,jz2);
+             dx23             = _fjsp_sub_v2r8(ix2,jx3);
+             dy23             = _fjsp_sub_v2r8(iy2,jy3);
+             dz23             = _fjsp_sub_v2r8(iz2,jz3);
+             dx31             = _fjsp_sub_v2r8(ix3,jx1);
+             dy31             = _fjsp_sub_v2r8(iy3,jy1);
+             dz31             = _fjsp_sub_v2r8(iz3,jz1);
+             dx32             = _fjsp_sub_v2r8(ix3,jx2);
+             dy32             = _fjsp_sub_v2r8(iy3,jy2);
+             dz32             = _fjsp_sub_v2r8(iz3,jz2);
+             dx33             = _fjsp_sub_v2r8(ix3,jx3);
+             dy33             = _fjsp_sub_v2r8(iy3,jy3);
+             dz33             = _fjsp_sub_v2r8(iz3,jz3);
+             /* Calculate squared distance and things based on it */
+             rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+             rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+             rsq13            = gmx_fjsp_calc_rsq_v2r8(dx13,dy13,dz13);
+             rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+             rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+             rsq23            = gmx_fjsp_calc_rsq_v2r8(dx23,dy23,dz23);
+             rsq31            = gmx_fjsp_calc_rsq_v2r8(dx31,dy31,dz31);
+             rsq32            = gmx_fjsp_calc_rsq_v2r8(dx32,dy32,dz32);
+             rsq33            = gmx_fjsp_calc_rsq_v2r8(dx33,dy33,dz33);
+             rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+             rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+             rinv13           = gmx_fjsp_invsqrt_v2r8(rsq13);
+             rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+             rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+             rinv23           = gmx_fjsp_invsqrt_v2r8(rsq23);
+             rinv31           = gmx_fjsp_invsqrt_v2r8(rsq31);
+             rinv32           = gmx_fjsp_invsqrt_v2r8(rsq32);
+             rinv33           = gmx_fjsp_invsqrt_v2r8(rsq33);
+             rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+             rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+             rinvsq13         = _fjsp_mul_v2r8(rinv13,rinv13);
+             rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+             rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+             rinvsq23         = _fjsp_mul_v2r8(rinv23,rinv23);
+             rinvsq31         = _fjsp_mul_v2r8(rinv31,rinv31);
+             rinvsq32         = _fjsp_mul_v2r8(rinv32,rinv32);
+             rinvsq33         = _fjsp_mul_v2r8(rinv33,rinv33);
+             fjx1             = _fjsp_setzero_v2r8();
+             fjy1             = _fjsp_setzero_v2r8();
+             fjz1             = _fjsp_setzero_v2r8();
+             fjx2             = _fjsp_setzero_v2r8();
+             fjy2             = _fjsp_setzero_v2r8();
+             fjz2             = _fjsp_setzero_v2r8();
+             fjx3             = _fjsp_setzero_v2r8();
+             fjy3             = _fjsp_setzero_v2r8();
+             fjz3             = _fjsp_setzero_v2r8();
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq11,rcutoff2))
+             {
+             /* REACTION-FIELD ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq11,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq11,rinv11),crf));
+             felec            = _fjsp_mul_v2r8(qq11,_fjsp_msub_v2r8(rinv11,rinvsq11,krf2));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq11,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+             
+             fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq12,rcutoff2))
+             {
+             /* REACTION-FIELD ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq12,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq12,rinv12),crf));
+             felec            = _fjsp_mul_v2r8(qq12,_fjsp_msub_v2r8(rinv12,rinvsq12,krf2));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq12,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+             
+             fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq13,rcutoff2))
+             {
+             /* REACTION-FIELD ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq13,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq13,rinv13),crf));
+             felec            = _fjsp_mul_v2r8(qq13,_fjsp_msub_v2r8(rinv13,rinvsq13,krf2));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq13,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx13,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy13,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz13,fscal,fiz1);
+             
+             fjx3             = _fjsp_madd_v2r8(dx13,fscal,fjx3);
+             fjy3             = _fjsp_madd_v2r8(dy13,fscal,fjy3);
+             fjz3             = _fjsp_madd_v2r8(dz13,fscal,fjz3);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq21,rcutoff2))
+             {
+             /* REACTION-FIELD ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq21,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq21,rinv21),crf));
+             felec            = _fjsp_mul_v2r8(qq21,_fjsp_msub_v2r8(rinv21,rinvsq21,krf2));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq21,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+             
+             fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq22,rcutoff2))
+             {
+             /* REACTION-FIELD ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq22,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq22,rinv22),crf));
+             felec            = _fjsp_mul_v2r8(qq22,_fjsp_msub_v2r8(rinv22,rinvsq22,krf2));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq22,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+             
+             fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq23,rcutoff2))
+             {
+             /* REACTION-FIELD ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq23,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq23,rinv23),crf));
+             felec            = _fjsp_mul_v2r8(qq23,_fjsp_msub_v2r8(rinv23,rinvsq23,krf2));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq23,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx23,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy23,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz23,fscal,fiz2);
+             
+             fjx3             = _fjsp_madd_v2r8(dx23,fscal,fjx3);
+             fjy3             = _fjsp_madd_v2r8(dy23,fscal,fjy3);
+             fjz3             = _fjsp_madd_v2r8(dz23,fscal,fjz3);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq31,rcutoff2))
+             {
+             /* REACTION-FIELD ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq31,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq31,rinv31),crf));
+             felec            = _fjsp_mul_v2r8(qq31,_fjsp_msub_v2r8(rinv31,rinvsq31,krf2));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq31,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix3             = _fjsp_madd_v2r8(dx31,fscal,fix3);
+             fiy3             = _fjsp_madd_v2r8(dy31,fscal,fiy3);
+             fiz3             = _fjsp_madd_v2r8(dz31,fscal,fiz3);
+             
+             fjx1             = _fjsp_madd_v2r8(dx31,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy31,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz31,fscal,fjz1);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq32,rcutoff2))
+             {
+             /* REACTION-FIELD ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq32,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq32,rinv32),crf));
+             felec            = _fjsp_mul_v2r8(qq32,_fjsp_msub_v2r8(rinv32,rinvsq32,krf2));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq32,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix3             = _fjsp_madd_v2r8(dx32,fscal,fix3);
+             fiy3             = _fjsp_madd_v2r8(dy32,fscal,fiy3);
+             fiz3             = _fjsp_madd_v2r8(dz32,fscal,fiz3);
+             
+             fjx2             = _fjsp_madd_v2r8(dx32,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy32,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz32,fscal,fjz2);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq33,rcutoff2))
+             {
+             /* REACTION-FIELD ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq33,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq33,rinv33),crf));
+             felec            = _fjsp_mul_v2r8(qq33,_fjsp_msub_v2r8(rinv33,rinvsq33,krf2));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq33,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix3             = _fjsp_madd_v2r8(dx33,fscal,fix3);
+             fiy3             = _fjsp_madd_v2r8(dy33,fscal,fiy3);
+             fiz3             = _fjsp_madd_v2r8(dz33,fscal,fiz3);
+             
+             fjx3             = _fjsp_madd_v2r8(dx33,fscal,fjx3);
+             fjy3             = _fjsp_madd_v2r8(dy33,fscal,fjy3);
+             fjz3             = _fjsp_madd_v2r8(dz33,fscal,fjz3);
+             }
+             gmx_fjsp_decrement_3rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA+DIM,f+j_coord_offsetB+DIM,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+             /* Inner loop uses 351 flops */
+         }
+         if(jidx<j_index_end)
+         {
+             jnrA             = jjnr[jidx];
+             j_coord_offsetA  = DIM*jnrA;
+             /* load j atom coordinates */
+             gmx_fjsp_load_3rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA+DIM,
+                                               &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
+             /* Calculate displacement vector */
+             dx11             = _fjsp_sub_v2r8(ix1,jx1);
+             dy11             = _fjsp_sub_v2r8(iy1,jy1);
+             dz11             = _fjsp_sub_v2r8(iz1,jz1);
+             dx12             = _fjsp_sub_v2r8(ix1,jx2);
+             dy12             = _fjsp_sub_v2r8(iy1,jy2);
+             dz12             = _fjsp_sub_v2r8(iz1,jz2);
+             dx13             = _fjsp_sub_v2r8(ix1,jx3);
+             dy13             = _fjsp_sub_v2r8(iy1,jy3);
+             dz13             = _fjsp_sub_v2r8(iz1,jz3);
+             dx21             = _fjsp_sub_v2r8(ix2,jx1);
+             dy21             = _fjsp_sub_v2r8(iy2,jy1);
+             dz21             = _fjsp_sub_v2r8(iz2,jz1);
+             dx22             = _fjsp_sub_v2r8(ix2,jx2);
+             dy22             = _fjsp_sub_v2r8(iy2,jy2);
+             dz22             = _fjsp_sub_v2r8(iz2,jz2);
+             dx23             = _fjsp_sub_v2r8(ix2,jx3);
+             dy23             = _fjsp_sub_v2r8(iy2,jy3);
+             dz23             = _fjsp_sub_v2r8(iz2,jz3);
+             dx31             = _fjsp_sub_v2r8(ix3,jx1);
+             dy31             = _fjsp_sub_v2r8(iy3,jy1);
+             dz31             = _fjsp_sub_v2r8(iz3,jz1);
+             dx32             = _fjsp_sub_v2r8(ix3,jx2);
+             dy32             = _fjsp_sub_v2r8(iy3,jy2);
+             dz32             = _fjsp_sub_v2r8(iz3,jz2);
+             dx33             = _fjsp_sub_v2r8(ix3,jx3);
+             dy33             = _fjsp_sub_v2r8(iy3,jy3);
+             dz33             = _fjsp_sub_v2r8(iz3,jz3);
+             /* Calculate squared distance and things based on it */
+             rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+             rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+             rsq13            = gmx_fjsp_calc_rsq_v2r8(dx13,dy13,dz13);
+             rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+             rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+             rsq23            = gmx_fjsp_calc_rsq_v2r8(dx23,dy23,dz23);
+             rsq31            = gmx_fjsp_calc_rsq_v2r8(dx31,dy31,dz31);
+             rsq32            = gmx_fjsp_calc_rsq_v2r8(dx32,dy32,dz32);
+             rsq33            = gmx_fjsp_calc_rsq_v2r8(dx33,dy33,dz33);
+             rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+             rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+             rinv13           = gmx_fjsp_invsqrt_v2r8(rsq13);
+             rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+             rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+             rinv23           = gmx_fjsp_invsqrt_v2r8(rsq23);
+             rinv31           = gmx_fjsp_invsqrt_v2r8(rsq31);
+             rinv32           = gmx_fjsp_invsqrt_v2r8(rsq32);
+             rinv33           = gmx_fjsp_invsqrt_v2r8(rsq33);
+             rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+             rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+             rinvsq13         = _fjsp_mul_v2r8(rinv13,rinv13);
+             rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+             rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+             rinvsq23         = _fjsp_mul_v2r8(rinv23,rinv23);
+             rinvsq31         = _fjsp_mul_v2r8(rinv31,rinv31);
+             rinvsq32         = _fjsp_mul_v2r8(rinv32,rinv32);
+             rinvsq33         = _fjsp_mul_v2r8(rinv33,rinv33);
+             fjx1             = _fjsp_setzero_v2r8();
+             fjy1             = _fjsp_setzero_v2r8();
+             fjz1             = _fjsp_setzero_v2r8();
+             fjx2             = _fjsp_setzero_v2r8();
+             fjy2             = _fjsp_setzero_v2r8();
+             fjz2             = _fjsp_setzero_v2r8();
+             fjx3             = _fjsp_setzero_v2r8();
+             fjy3             = _fjsp_setzero_v2r8();
+             fjz3             = _fjsp_setzero_v2r8();
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq11,rcutoff2))
+             {
+             /* REACTION-FIELD ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq11,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq11,rinv11),crf));
+             felec            = _fjsp_mul_v2r8(qq11,_fjsp_msub_v2r8(rinv11,rinvsq11,krf2));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq11,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+             
+             fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq12,rcutoff2))
+             {
+             /* REACTION-FIELD ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq12,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq12,rinv12),crf));
+             felec            = _fjsp_mul_v2r8(qq12,_fjsp_msub_v2r8(rinv12,rinvsq12,krf2));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq12,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+             
+             fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq13,rcutoff2))
+             {
+             /* REACTION-FIELD ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq13,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq13,rinv13),crf));
+             felec            = _fjsp_mul_v2r8(qq13,_fjsp_msub_v2r8(rinv13,rinvsq13,krf2));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq13,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx13,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy13,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz13,fscal,fiz1);
+             
+             fjx3             = _fjsp_madd_v2r8(dx13,fscal,fjx3);
+             fjy3             = _fjsp_madd_v2r8(dy13,fscal,fjy3);
+             fjz3             = _fjsp_madd_v2r8(dz13,fscal,fjz3);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq21,rcutoff2))
+             {
+             /* REACTION-FIELD ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq21,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq21,rinv21),crf));
+             felec            = _fjsp_mul_v2r8(qq21,_fjsp_msub_v2r8(rinv21,rinvsq21,krf2));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq21,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+             
+             fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq22,rcutoff2))
+             {
+             /* REACTION-FIELD ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq22,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq22,rinv22),crf));
+             felec            = _fjsp_mul_v2r8(qq22,_fjsp_msub_v2r8(rinv22,rinvsq22,krf2));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq22,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+             
+             fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq23,rcutoff2))
+             {
+             /* REACTION-FIELD ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq23,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq23,rinv23),crf));
+             felec            = _fjsp_mul_v2r8(qq23,_fjsp_msub_v2r8(rinv23,rinvsq23,krf2));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq23,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx23,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy23,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz23,fscal,fiz2);
+             
+             fjx3             = _fjsp_madd_v2r8(dx23,fscal,fjx3);
+             fjy3             = _fjsp_madd_v2r8(dy23,fscal,fjy3);
+             fjz3             = _fjsp_madd_v2r8(dz23,fscal,fjz3);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq31,rcutoff2))
+             {
+             /* REACTION-FIELD ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq31,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq31,rinv31),crf));
+             felec            = _fjsp_mul_v2r8(qq31,_fjsp_msub_v2r8(rinv31,rinvsq31,krf2));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq31,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix3             = _fjsp_madd_v2r8(dx31,fscal,fix3);
+             fiy3             = _fjsp_madd_v2r8(dy31,fscal,fiy3);
+             fiz3             = _fjsp_madd_v2r8(dz31,fscal,fiz3);
+             
+             fjx1             = _fjsp_madd_v2r8(dx31,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy31,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz31,fscal,fjz1);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq32,rcutoff2))
+             {
+             /* REACTION-FIELD ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq32,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq32,rinv32),crf));
+             felec            = _fjsp_mul_v2r8(qq32,_fjsp_msub_v2r8(rinv32,rinvsq32,krf2));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq32,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix3             = _fjsp_madd_v2r8(dx32,fscal,fix3);
+             fiy3             = _fjsp_madd_v2r8(dy32,fscal,fiy3);
+             fiz3             = _fjsp_madd_v2r8(dz32,fscal,fiz3);
+             
+             fjx2             = _fjsp_madd_v2r8(dx32,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy32,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz32,fscal,fjz2);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq33,rcutoff2))
+             {
+             /* REACTION-FIELD ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq33,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq33,rinv33),crf));
+             felec            = _fjsp_mul_v2r8(qq33,_fjsp_msub_v2r8(rinv33,rinvsq33,krf2));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq33,rcutoff2);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix3             = _fjsp_madd_v2r8(dx33,fscal,fix3);
+             fiy3             = _fjsp_madd_v2r8(dy33,fscal,fiy3);
+             fiz3             = _fjsp_madd_v2r8(dz33,fscal,fiz3);
+             
+             fjx3             = _fjsp_madd_v2r8(dx33,fscal,fjx3);
+             fjy3             = _fjsp_madd_v2r8(dy33,fscal,fjy3);
+             fjz3             = _fjsp_madd_v2r8(dz33,fscal,fjz3);
+             }
+             gmx_fjsp_decrement_3rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA+DIM,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+             /* Inner loop uses 351 flops */
+         }
+         /* End of innermost loop */
+         gmx_fjsp_update_iforce_3atom_swizzle_v2r8(fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                               f+i_coord_offset+DIM,fshift+i_shift_offset);
+         ggid                        = gid[iidx];
+         /* Update potential energies */
+         gmx_fjsp_update_1pot_v2r8(velecsum,kernel_data->energygrp_elec+ggid);
+         /* Increment number of inner iterations */
+         inneriter                  += j_index_end - j_index_start;
+         /* Outer loop uses 19 flops */
+     }
+     /* Increment number of outer iterations */
+     outeriter        += nri;
+     /* Update outer/inner flops */
+     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4W4_VF,outeriter*19 + inneriter*351);
+ }
+ /*
+  * Gromacs nonbonded kernel:   nb_kernel_ElecRFCut_VdwNone_GeomW4W4_F_sparc64_hpc_ace_double
+  * Electrostatics interaction: ReactionField
+  * VdW interaction:            None
+  * Geometry:                   Water4-Water4
+  * Calculate force/pot:        Force
+  */
+ void
+ nb_kernel_ElecRFCut_VdwNone_GeomW4W4_F_sparc64_hpc_ace_double
+                     (t_nblist * gmx_restrict                nlist,
+                      rvec * gmx_restrict                    xx,
+                      rvec * gmx_restrict                    ff,
+                      t_forcerec * gmx_restrict              fr,
+                      t_mdatoms * gmx_restrict               mdatoms,
+                      nb_kernel_data_t * gmx_restrict        kernel_data,
+                      t_nrnb * gmx_restrict                  nrnb)
+ {
+     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+      * just 0 for non-waters.
+      * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+      * jnr indices corresponding to data put in the two positions in the SIMD register.
+      */
+     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+     int              jnrA,jnrB;
+     int              j_coord_offsetA,j_coord_offsetB;
+     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+     real             rcutoff_scalar;
+     real             *shiftvec,*fshift,*x,*f;
+     _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+     int              vdwioffset1;
+     _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+     int              vdwioffset2;
+     _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+     int              vdwioffset3;
+     _fjsp_v2r8       ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+     int              vdwjidx1A,vdwjidx1B;
+     _fjsp_v2r8       jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+     int              vdwjidx2A,vdwjidx2B;
+     _fjsp_v2r8       jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+     int              vdwjidx3A,vdwjidx3B;
+     _fjsp_v2r8       jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
+     _fjsp_v2r8       dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+     _fjsp_v2r8       dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+     _fjsp_v2r8       dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
+     _fjsp_v2r8       dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+     _fjsp_v2r8       dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+     _fjsp_v2r8       dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
+     _fjsp_v2r8       dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
+     _fjsp_v2r8       dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
+     _fjsp_v2r8       dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
+     _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+     real             *charge;
+     _fjsp_v2r8       itab_tmp;
+     _fjsp_v2r8       dummy_mask,cutoff_mask;
+     _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+     _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+     union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+     x                = xx[0];
+     f                = ff[0];
+     nri              = nlist->nri;
+     iinr             = nlist->iinr;
+     jindex           = nlist->jindex;
+     jjnr             = nlist->jjnr;
+     shiftidx         = nlist->shift;
+     gid              = nlist->gid;
+     shiftvec         = fr->shift_vec[0];
+     fshift           = fr->fshift[0];
+     facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+     charge           = mdatoms->chargeA;
+     krf              = gmx_fjsp_set1_v2r8(fr->ic->k_rf);
+     krf2             = gmx_fjsp_set1_v2r8(fr->ic->k_rf*2.0);
+     crf              = gmx_fjsp_set1_v2r8(fr->ic->c_rf);
+     /* Setup water-specific parameters */
+     inr              = nlist->iinr[0];
+     iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+     iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+     iq3              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+3]));
+     jq1              = gmx_fjsp_set1_v2r8(charge[inr+1]);
+     jq2              = gmx_fjsp_set1_v2r8(charge[inr+2]);
+     jq3              = gmx_fjsp_set1_v2r8(charge[inr+3]);
+     qq11             = _fjsp_mul_v2r8(iq1,jq1);
+     qq12             = _fjsp_mul_v2r8(iq1,jq2);
+     qq13             = _fjsp_mul_v2r8(iq1,jq3);
+     qq21             = _fjsp_mul_v2r8(iq2,jq1);
+     qq22             = _fjsp_mul_v2r8(iq2,jq2);
+     qq23             = _fjsp_mul_v2r8(iq2,jq3);
+     qq31             = _fjsp_mul_v2r8(iq3,jq1);
+     qq32             = _fjsp_mul_v2r8(iq3,jq2);
+     qq33             = _fjsp_mul_v2r8(iq3,jq3);
+     /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+     rcutoff_scalar   = fr->rcoulomb;
+     rcutoff          = gmx_fjsp_set1_v2r8(rcutoff_scalar);
+     rcutoff2         = _fjsp_mul_v2r8(rcutoff,rcutoff);
+     /* Avoid stupid compiler warnings */
+     jnrA = jnrB = 0;
+     j_coord_offsetA = 0;
+     j_coord_offsetB = 0;
+     outeriter        = 0;
+     inneriter        = 0;
+     /* Start outer loop over neighborlists */
+     for(iidx=0; iidx<nri; iidx++)
+     {
+         /* Load shift vector for this list */
+         i_shift_offset   = DIM*shiftidx[iidx];
+         /* Load limits for loop over neighbors */
+         j_index_start    = jindex[iidx];
+         j_index_end      = jindex[iidx+1];
+         /* Get outer coordinate index */
+         inr              = iinr[iidx];
+         i_coord_offset   = DIM*inr;
+         /* Load i particle coords and add shift vector */
+         gmx_fjsp_load_shift_and_3rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset+DIM,
+                                                  &ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+         fix1             = _fjsp_setzero_v2r8();
+         fiy1             = _fjsp_setzero_v2r8();
+         fiz1             = _fjsp_setzero_v2r8();
+         fix2             = _fjsp_setzero_v2r8();
+         fiy2             = _fjsp_setzero_v2r8();
+         fiz2             = _fjsp_setzero_v2r8();
+         fix3             = _fjsp_setzero_v2r8();
+         fiy3             = _fjsp_setzero_v2r8();
+         fiz3             = _fjsp_setzero_v2r8();
+         /* Start inner kernel loop */
+         for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+         {
+             /* Get j neighbor index, and coordinate index */
+             jnrA             = jjnr[jidx];
+             jnrB             = jjnr[jidx+1];
+             j_coord_offsetA  = DIM*jnrA;
+             j_coord_offsetB  = DIM*jnrB;
+             /* load j atom coordinates */
+             gmx_fjsp_load_3rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA+DIM,x+j_coord_offsetB+DIM,
+                                               &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
+             /* Calculate displacement vector */
+             dx11             = _fjsp_sub_v2r8(ix1,jx1);
+             dy11             = _fjsp_sub_v2r8(iy1,jy1);
+             dz11             = _fjsp_sub_v2r8(iz1,jz1);
+             dx12             = _fjsp_sub_v2r8(ix1,jx2);
+             dy12             = _fjsp_sub_v2r8(iy1,jy2);
+             dz12             = _fjsp_sub_v2r8(iz1,jz2);
+             dx13             = _fjsp_sub_v2r8(ix1,jx3);
+             dy13             = _fjsp_sub_v2r8(iy1,jy3);
+             dz13             = _fjsp_sub_v2r8(iz1,jz3);
+             dx21             = _fjsp_sub_v2r8(ix2,jx1);
+             dy21             = _fjsp_sub_v2r8(iy2,jy1);
+             dz21             = _fjsp_sub_v2r8(iz2,jz1);
+             dx22             = _fjsp_sub_v2r8(ix2,jx2);
+             dy22             = _fjsp_sub_v2r8(iy2,jy2);
+             dz22             = _fjsp_sub_v2r8(iz2,jz2);
+             dx23             = _fjsp_sub_v2r8(ix2,jx3);
+             dy23             = _fjsp_sub_v2r8(iy2,jy3);
+             dz23             = _fjsp_sub_v2r8(iz2,jz3);
+             dx31             = _fjsp_sub_v2r8(ix3,jx1);
+             dy31             = _fjsp_sub_v2r8(iy3,jy1);
+             dz31             = _fjsp_sub_v2r8(iz3,jz1);
+             dx32             = _fjsp_sub_v2r8(ix3,jx2);
+             dy32             = _fjsp_sub_v2r8(iy3,jy2);
+             dz32             = _fjsp_sub_v2r8(iz3,jz2);
+             dx33             = _fjsp_sub_v2r8(ix3,jx3);
+             dy33             = _fjsp_sub_v2r8(iy3,jy3);
+             dz33             = _fjsp_sub_v2r8(iz3,jz3);
+             /* Calculate squared distance and things based on it */
+             rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+             rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+             rsq13            = gmx_fjsp_calc_rsq_v2r8(dx13,dy13,dz13);
+             rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+             rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+             rsq23            = gmx_fjsp_calc_rsq_v2r8(dx23,dy23,dz23);
+             rsq31            = gmx_fjsp_calc_rsq_v2r8(dx31,dy31,dz31);
+             rsq32            = gmx_fjsp_calc_rsq_v2r8(dx32,dy32,dz32);
+             rsq33            = gmx_fjsp_calc_rsq_v2r8(dx33,dy33,dz33);
+             rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+             rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+             rinv13           = gmx_fjsp_invsqrt_v2r8(rsq13);
+             rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+             rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+             rinv23           = gmx_fjsp_invsqrt_v2r8(rsq23);
+             rinv31           = gmx_fjsp_invsqrt_v2r8(rsq31);
+             rinv32           = gmx_fjsp_invsqrt_v2r8(rsq32);
+             rinv33           = gmx_fjsp_invsqrt_v2r8(rsq33);
+             rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+             rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+             rinvsq13         = _fjsp_mul_v2r8(rinv13,rinv13);
+             rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+             rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+             rinvsq23         = _fjsp_mul_v2r8(rinv23,rinv23);
+             rinvsq31         = _fjsp_mul_v2r8(rinv31,rinv31);
+             rinvsq32         = _fjsp_mul_v2r8(rinv32,rinv32);
+             rinvsq33         = _fjsp_mul_v2r8(rinv33,rinv33);
+             fjx1             = _fjsp_setzero_v2r8();
+             fjy1             = _fjsp_setzero_v2r8();
+             fjz1             = _fjsp_setzero_v2r8();
+             fjx2             = _fjsp_setzero_v2r8();
+             fjy2             = _fjsp_setzero_v2r8();
+             fjz2             = _fjsp_setzero_v2r8();
+             fjx3             = _fjsp_setzero_v2r8();
+             fjy3             = _fjsp_setzero_v2r8();
+             fjz3             = _fjsp_setzero_v2r8();
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq11,rcutoff2))
+             {
+             /* REACTION-FIELD ELECTROSTATICS */
+             felec            = _fjsp_mul_v2r8(qq11,_fjsp_msub_v2r8(rinv11,rinvsq11,krf2));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq11,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+             
+             fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq12,rcutoff2))
+             {
+             /* REACTION-FIELD ELECTROSTATICS */
+             felec            = _fjsp_mul_v2r8(qq12,_fjsp_msub_v2r8(rinv12,rinvsq12,krf2));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq12,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+             
+             fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq13,rcutoff2))
+             {
+             /* REACTION-FIELD ELECTROSTATICS */
+             felec            = _fjsp_mul_v2r8(qq13,_fjsp_msub_v2r8(rinv13,rinvsq13,krf2));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq13,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx13,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy13,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz13,fscal,fiz1);
+             
+             fjx3             = _fjsp_madd_v2r8(dx13,fscal,fjx3);
+             fjy3             = _fjsp_madd_v2r8(dy13,fscal,fjy3);
+             fjz3             = _fjsp_madd_v2r8(dz13,fscal,fjz3);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq21,rcutoff2))
+             {
+             /* REACTION-FIELD ELECTROSTATICS */
+             felec            = _fjsp_mul_v2r8(qq21,_fjsp_msub_v2r8(rinv21,rinvsq21,krf2));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq21,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+             
+             fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq22,rcutoff2))
+             {
+             /* REACTION-FIELD ELECTROSTATICS */
+             felec            = _fjsp_mul_v2r8(qq22,_fjsp_msub_v2r8(rinv22,rinvsq22,krf2));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq22,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+             
+             fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq23,rcutoff2))
+             {
+             /* REACTION-FIELD ELECTROSTATICS */
+             felec            = _fjsp_mul_v2r8(qq23,_fjsp_msub_v2r8(rinv23,rinvsq23,krf2));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq23,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx23,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy23,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz23,fscal,fiz2);
+             
+             fjx3             = _fjsp_madd_v2r8(dx23,fscal,fjx3);
+             fjy3             = _fjsp_madd_v2r8(dy23,fscal,fjy3);
+             fjz3             = _fjsp_madd_v2r8(dz23,fscal,fjz3);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq31,rcutoff2))
+             {
+             /* REACTION-FIELD ELECTROSTATICS */
+             felec            = _fjsp_mul_v2r8(qq31,_fjsp_msub_v2r8(rinv31,rinvsq31,krf2));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq31,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix3             = _fjsp_madd_v2r8(dx31,fscal,fix3);
+             fiy3             = _fjsp_madd_v2r8(dy31,fscal,fiy3);
+             fiz3             = _fjsp_madd_v2r8(dz31,fscal,fiz3);
+             
+             fjx1             = _fjsp_madd_v2r8(dx31,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy31,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz31,fscal,fjz1);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq32,rcutoff2))
+             {
+             /* REACTION-FIELD ELECTROSTATICS */
+             felec            = _fjsp_mul_v2r8(qq32,_fjsp_msub_v2r8(rinv32,rinvsq32,krf2));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq32,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix3             = _fjsp_madd_v2r8(dx32,fscal,fix3);
+             fiy3             = _fjsp_madd_v2r8(dy32,fscal,fiy3);
+             fiz3             = _fjsp_madd_v2r8(dz32,fscal,fiz3);
+             
+             fjx2             = _fjsp_madd_v2r8(dx32,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy32,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz32,fscal,fjz2);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq33,rcutoff2))
+             {
+             /* REACTION-FIELD ELECTROSTATICS */
+             felec            = _fjsp_mul_v2r8(qq33,_fjsp_msub_v2r8(rinv33,rinvsq33,krf2));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq33,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /* Update vectorial force */
+             fix3             = _fjsp_madd_v2r8(dx33,fscal,fix3);
+             fiy3             = _fjsp_madd_v2r8(dy33,fscal,fiy3);
+             fiz3             = _fjsp_madd_v2r8(dz33,fscal,fiz3);
+             
+             fjx3             = _fjsp_madd_v2r8(dx33,fscal,fjx3);
+             fjy3             = _fjsp_madd_v2r8(dy33,fscal,fjy3);
+             fjz3             = _fjsp_madd_v2r8(dz33,fscal,fjz3);
+             }
+             gmx_fjsp_decrement_3rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA+DIM,f+j_coord_offsetB+DIM,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+             /* Inner loop uses 297 flops */
+         }
+         if(jidx<j_index_end)
+         {
+             jnrA             = jjnr[jidx];
+             j_coord_offsetA  = DIM*jnrA;
+             /* load j atom coordinates */
+             gmx_fjsp_load_3rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA+DIM,
+                                               &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
+             /* Calculate displacement vector */
+             dx11             = _fjsp_sub_v2r8(ix1,jx1);
+             dy11             = _fjsp_sub_v2r8(iy1,jy1);
+             dz11             = _fjsp_sub_v2r8(iz1,jz1);
+             dx12             = _fjsp_sub_v2r8(ix1,jx2);
+             dy12             = _fjsp_sub_v2r8(iy1,jy2);
+             dz12             = _fjsp_sub_v2r8(iz1,jz2);
+             dx13             = _fjsp_sub_v2r8(ix1,jx3);
+             dy13             = _fjsp_sub_v2r8(iy1,jy3);
+             dz13             = _fjsp_sub_v2r8(iz1,jz3);
+             dx21             = _fjsp_sub_v2r8(ix2,jx1);
+             dy21             = _fjsp_sub_v2r8(iy2,jy1);
+             dz21             = _fjsp_sub_v2r8(iz2,jz1);
+             dx22             = _fjsp_sub_v2r8(ix2,jx2);
+             dy22             = _fjsp_sub_v2r8(iy2,jy2);
+             dz22             = _fjsp_sub_v2r8(iz2,jz2);
+             dx23             = _fjsp_sub_v2r8(ix2,jx3);
+             dy23             = _fjsp_sub_v2r8(iy2,jy3);
+             dz23             = _fjsp_sub_v2r8(iz2,jz3);
+             dx31             = _fjsp_sub_v2r8(ix3,jx1);
+             dy31             = _fjsp_sub_v2r8(iy3,jy1);
+             dz31             = _fjsp_sub_v2r8(iz3,jz1);
+             dx32             = _fjsp_sub_v2r8(ix3,jx2);
+             dy32             = _fjsp_sub_v2r8(iy3,jy2);
+             dz32             = _fjsp_sub_v2r8(iz3,jz2);
+             dx33             = _fjsp_sub_v2r8(ix3,jx3);
+             dy33             = _fjsp_sub_v2r8(iy3,jy3);
+             dz33             = _fjsp_sub_v2r8(iz3,jz3);
+             /* Calculate squared distance and things based on it */
+             rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+             rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+             rsq13            = gmx_fjsp_calc_rsq_v2r8(dx13,dy13,dz13);
+             rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+             rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+             rsq23            = gmx_fjsp_calc_rsq_v2r8(dx23,dy23,dz23);
+             rsq31            = gmx_fjsp_calc_rsq_v2r8(dx31,dy31,dz31);
+             rsq32            = gmx_fjsp_calc_rsq_v2r8(dx32,dy32,dz32);
+             rsq33            = gmx_fjsp_calc_rsq_v2r8(dx33,dy33,dz33);
+             rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+             rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+             rinv13           = gmx_fjsp_invsqrt_v2r8(rsq13);
+             rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+             rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+             rinv23           = gmx_fjsp_invsqrt_v2r8(rsq23);
+             rinv31           = gmx_fjsp_invsqrt_v2r8(rsq31);
+             rinv32           = gmx_fjsp_invsqrt_v2r8(rsq32);
+             rinv33           = gmx_fjsp_invsqrt_v2r8(rsq33);
+             rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+             rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+             rinvsq13         = _fjsp_mul_v2r8(rinv13,rinv13);
+             rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+             rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+             rinvsq23         = _fjsp_mul_v2r8(rinv23,rinv23);
+             rinvsq31         = _fjsp_mul_v2r8(rinv31,rinv31);
+             rinvsq32         = _fjsp_mul_v2r8(rinv32,rinv32);
+             rinvsq33         = _fjsp_mul_v2r8(rinv33,rinv33);
+             fjx1             = _fjsp_setzero_v2r8();
+             fjy1             = _fjsp_setzero_v2r8();
+             fjz1             = _fjsp_setzero_v2r8();
+             fjx2             = _fjsp_setzero_v2r8();
+             fjy2             = _fjsp_setzero_v2r8();
+             fjz2             = _fjsp_setzero_v2r8();
+             fjx3             = _fjsp_setzero_v2r8();
+             fjy3             = _fjsp_setzero_v2r8();
+             fjz3             = _fjsp_setzero_v2r8();
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq11,rcutoff2))
+             {
+             /* REACTION-FIELD ELECTROSTATICS */
+             felec            = _fjsp_mul_v2r8(qq11,_fjsp_msub_v2r8(rinv11,rinvsq11,krf2));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq11,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+             
+             fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq12,rcutoff2))
+             {
+             /* REACTION-FIELD ELECTROSTATICS */
+             felec            = _fjsp_mul_v2r8(qq12,_fjsp_msub_v2r8(rinv12,rinvsq12,krf2));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq12,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+             
+             fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq13,rcutoff2))
+             {
+             /* REACTION-FIELD ELECTROSTATICS */
+             felec            = _fjsp_mul_v2r8(qq13,_fjsp_msub_v2r8(rinv13,rinvsq13,krf2));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq13,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx13,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy13,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz13,fscal,fiz1);
+             
+             fjx3             = _fjsp_madd_v2r8(dx13,fscal,fjx3);
+             fjy3             = _fjsp_madd_v2r8(dy13,fscal,fjy3);
+             fjz3             = _fjsp_madd_v2r8(dz13,fscal,fjz3);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq21,rcutoff2))
+             {
+             /* REACTION-FIELD ELECTROSTATICS */
+             felec            = _fjsp_mul_v2r8(qq21,_fjsp_msub_v2r8(rinv21,rinvsq21,krf2));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq21,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+             
+             fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq22,rcutoff2))
+             {
+             /* REACTION-FIELD ELECTROSTATICS */
+             felec            = _fjsp_mul_v2r8(qq22,_fjsp_msub_v2r8(rinv22,rinvsq22,krf2));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq22,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+             
+             fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq23,rcutoff2))
+             {
+             /* REACTION-FIELD ELECTROSTATICS */
+             felec            = _fjsp_mul_v2r8(qq23,_fjsp_msub_v2r8(rinv23,rinvsq23,krf2));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq23,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx23,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy23,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz23,fscal,fiz2);
+             
+             fjx3             = _fjsp_madd_v2r8(dx23,fscal,fjx3);
+             fjy3             = _fjsp_madd_v2r8(dy23,fscal,fjy3);
+             fjz3             = _fjsp_madd_v2r8(dz23,fscal,fjz3);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq31,rcutoff2))
+             {
+             /* REACTION-FIELD ELECTROSTATICS */
+             felec            = _fjsp_mul_v2r8(qq31,_fjsp_msub_v2r8(rinv31,rinvsq31,krf2));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq31,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix3             = _fjsp_madd_v2r8(dx31,fscal,fix3);
+             fiy3             = _fjsp_madd_v2r8(dy31,fscal,fiy3);
+             fiz3             = _fjsp_madd_v2r8(dz31,fscal,fiz3);
+             
+             fjx1             = _fjsp_madd_v2r8(dx31,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy31,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz31,fscal,fjz1);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq32,rcutoff2))
+             {
+             /* REACTION-FIELD ELECTROSTATICS */
+             felec            = _fjsp_mul_v2r8(qq32,_fjsp_msub_v2r8(rinv32,rinvsq32,krf2));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq32,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix3             = _fjsp_madd_v2r8(dx32,fscal,fix3);
+             fiy3             = _fjsp_madd_v2r8(dy32,fscal,fiy3);
+             fiz3             = _fjsp_madd_v2r8(dz32,fscal,fiz3);
+             
+             fjx2             = _fjsp_madd_v2r8(dx32,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy32,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz32,fscal,fjz2);
+             }
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             if (gmx_fjsp_any_lt_v2r8(rsq33,rcutoff2))
+             {
+             /* REACTION-FIELD ELECTROSTATICS */
+             felec            = _fjsp_mul_v2r8(qq33,_fjsp_msub_v2r8(rinv33,rinvsq33,krf2));
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq33,rcutoff2);
+             fscal            = felec;
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix3             = _fjsp_madd_v2r8(dx33,fscal,fix3);
+             fiy3             = _fjsp_madd_v2r8(dy33,fscal,fiy3);
+             fiz3             = _fjsp_madd_v2r8(dz33,fscal,fiz3);
+             
+             fjx3             = _fjsp_madd_v2r8(dx33,fscal,fjx3);
+             fjy3             = _fjsp_madd_v2r8(dy33,fscal,fjy3);
+             fjz3             = _fjsp_madd_v2r8(dz33,fscal,fjz3);
+             }
+             gmx_fjsp_decrement_3rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA+DIM,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+             /* Inner loop uses 297 flops */
+         }
+         /* End of innermost loop */
+         gmx_fjsp_update_iforce_3atom_swizzle_v2r8(fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                               f+i_coord_offset+DIM,fshift+i_shift_offset);
+         /* Increment number of inner iterations */
+         inneriter                  += j_index_end - j_index_start;
+         /* Outer loop uses 18 flops */
+     }
+     /* Increment number of outer iterations */
+     outeriter        += nri;
+     /* Update outer/inner flops */
+     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4W4_F,outeriter*18 + inneriter*297);
+ }
index 0000000000000000000000000000000000000000,32d447ffe345658be8b82e82d7082a9ed96b00b7..32d447ffe345658be8b82e82d7082a9ed96b00b7
mode 000000,100644..100644
--- /dev/null
@@@ -1,0 -1,683 +1,683 @@@
+ /*
+  * This file is part of the GROMACS molecular simulation package.
+  *
+  * Copyright (c) 2012, by the GROMACS development team, led by
+  * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+  * others, as listed in the AUTHORS file in the top-level source
+  * directory and at http://www.gromacs.org.
+  *
+  * GROMACS is free software; you can redistribute it and/or
+  * modify it under the terms of the GNU Lesser General Public License
+  * as published by the Free Software Foundation; either version 2.1
+  * of the License, or (at your option) any later version.
+  *
+  * GROMACS is distributed in the hope that it will be useful,
+  * but WITHOUT ANY WARRANTY; without even the implied warranty of
+  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  * Lesser General Public License for more details.
+  *
+  * You should have received a copy of the GNU Lesser General Public
+  * License along with GROMACS; if not, see
+  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+  *
+  * If you want to redistribute modifications to GROMACS, please
+  * consider that scientific software is very special. Version
+  * control is crucial - bugs must be traceable. We will be happy to
+  * consider code for inclusion in the official distribution, but
+  * derived work must not be called official GROMACS. Details are found
+  * in the README & COPYING files - if they are missing, get the
+  * official version at http://www.gromacs.org.
+  *
+  * To help us fund GROMACS development, we humbly ask that you cite
+  * the research papers on the package. Check out http://www.gromacs.org.
+  */
+ /*
+  * Note: this file was generated by the GROMACS sparc64_hpc_ace_double kernel generator.
+  */
+ #ifdef HAVE_CONFIG_H
+ #include <config.h>
+ #endif
+ #include <math.h>
+ #include "../nb_kernel.h"
+ #include "types/simple.h"
+ #include "vec.h"
+ #include "nrnb.h"
+ #include "kernelutil_sparc64_hpc_ace_double.h"
+ /*
+  * Gromacs nonbonded kernel:   nb_kernel_ElecRF_VdwCSTab_GeomP1P1_VF_sparc64_hpc_ace_double
+  * Electrostatics interaction: ReactionField
+  * VdW interaction:            CubicSplineTable
+  * Geometry:                   Particle-Particle
+  * Calculate force/pot:        PotentialAndForce
+  *
+  * For each of the nlist->nri entries the kernel takes one i particle and walks
+  * its j neighbours jjnr[jindex[iidx]] .. jjnr[jindex[iidx+1]-1], two per
+  * iteration (SIMD lanes A and B), followed by a single-neighbour step when the
+  * neighbour count is odd.
+  *
+  * Inputs read:  nlist (nri, iinr, jindex, jjnr, shift, gid), xx (coordinates),
+  *               fr (shift_vec, epsfac, ic->k_rf, ic->c_rf, ntype, nbfp),
+  *               mdatoms (chargeA, typeA), kernel_data->table_vdw (data, scale).
+  * Outputs:      the force array ff and fr->fshift are handed to the
+  *               gmx_fjsp_decrement_* / gmx_fjsp_update_iforce_* helpers, and
+  *               kernel_data->energygrp_elec / energygrp_vdw at index gid[iidx]
+  *               are handed to gmx_fjsp_update_1pot_v2r8 (presumably these
+  *               helpers accumulate into the arrays - confirm in
+  *               kernelutil_sparc64_hpc_ace_double.h).
+  *               nrnb receives the flop estimate through inc_nrnb().
+  */
+ void
+ nb_kernel_ElecRF_VdwCSTab_GeomP1P1_VF_sparc64_hpc_ace_double
+                     (t_nblist * gmx_restrict                nlist,
+                      rvec * gmx_restrict                    xx,
+                      rvec * gmx_restrict                    ff,
+                      t_forcerec * gmx_restrict              fr,
+                      t_mdatoms * gmx_restrict               mdatoms,
+                      nb_kernel_data_t * gmx_restrict        kernel_data,
+                      t_nrnb * gmx_restrict                  nrnb)
+ {
+     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+      * just 0 for non-waters.
+      * Suffixes A,B refer to j loop unrolling done with double precision SIMD, i.e. the two different
+      * jnr indices whose data is put in the two positions of the SIMD register.
+      *
+      * NOTE(review): several locals declared below are never referenced in this
+      * kernel (e.g. rcutoff_scalar, tx, ty, tz, rcutoff, rcutoff2, jidxall, isai0,
+      * isaj0, fjx0, fjy0, fjz0, rinvsix, rvdw, sh_vdw_invrcut6, Heps, dummy_mask,
+      * cutoff_mask, gbconv, ewconv), and one_sixth, one_twelfth, one and two are
+      * only initialised. They look like kernel-generator boilerplate.
+      */
+     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+     int              jnrA,jnrB;
+     int              j_coord_offsetA,j_coord_offsetB;
+     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+     real             rcutoff_scalar;
+     real             *shiftvec,*fshift,*x,*f;
+     _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+     int              vdwioffset0;
+     _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+     int              vdwjidx0A,vdwjidx0B;
+     _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+     _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+     _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+     real             *charge;
+     int              nvdwtype;
+     _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+     int              *vdwtype;
+     real             *vdwparam;
+     _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+     _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+     _fjsp_v2r8       rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF,twovfeps;
+     real             *vftab;
+     _fjsp_v2r8       itab_tmp;
+     _fjsp_v2r8       dummy_mask,cutoff_mask;
+     _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+     _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+     union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+     x                = xx[0];
+     f                = ff[0];
+     nri              = nlist->nri;
+     iinr             = nlist->iinr;
+     jindex           = nlist->jindex;
+     jjnr             = nlist->jjnr;
+     shiftidx         = nlist->shift;
+     gid              = nlist->gid;
+     shiftvec         = fr->shift_vec[0];
+     fshift           = fr->fshift[0];
+     facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+     charge           = mdatoms->chargeA;
+     krf              = gmx_fjsp_set1_v2r8(fr->ic->k_rf);
+     krf2             = gmx_fjsp_set1_v2r8(fr->ic->k_rf*2.0);
+     crf              = gmx_fjsp_set1_v2r8(fr->ic->c_rf);
+     nvdwtype         = fr->ntype;
+     vdwparam         = fr->nbfp;
+     vdwtype          = mdatoms->typeA;
+     vftab            = kernel_data->table_vdw->data;
+     vftabscale       = gmx_fjsp_set1_v2r8(kernel_data->table_vdw->scale);
+     /* Give the j indices and offsets defined values up front to avoid compiler
+      * warnings; the iteration counters used for the flop estimate start at zero. */
+     jnrA = jnrB = 0;
+     j_coord_offsetA = 0;
+     j_coord_offsetB = 0;
+     outeriter        = 0;
+     inneriter        = 0;
+     /* Start outer loop over neighborlists */
+     for(iidx=0; iidx<nri; iidx++)
+     {
+         /* Load shift vector for this list */
+         i_shift_offset   = DIM*shiftidx[iidx];
+         /* Load limits for loop over neighbors */
+         j_index_start    = jindex[iidx];
+         j_index_end      = jindex[iidx+1];
+         /* Get outer coordinate index */
+         inr              = iinr[iidx];
+         i_coord_offset   = DIM*inr;
+         /* Load i particle coords and add shift vector */
+         gmx_fjsp_load_shift_and_1rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
+         fix0             = _fjsp_setzero_v2r8();
+         fiy0             = _fjsp_setzero_v2r8();
+         fiz0             = _fjsp_setzero_v2r8();
+         /* Load parameters for i particles: the i charge is pre-multiplied by facel
+          * (fr->epsfac), and vdwioffset0 selects the row of the nbfp parameter
+          * table for the i atom type (two reals per type pair). */
+         iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_load1_v2r8(charge+inr+0));
+         vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+         /* Reset potential sums */
+         velecsum         = _fjsp_setzero_v2r8();
+         vvdwsum          = _fjsp_setzero_v2r8();
+         /* Start inner kernel loop: two j neighbours (lanes A and B) per iteration.
+          * The bound j_index_end-1 leaves an odd last neighbour for the block
+          * that follows the loop. */
+         for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+         {
+             /* Get j neighbor index, and coordinate index */
+             jnrA             = jjnr[jidx];
+             jnrB             = jjnr[jidx+1];
+             j_coord_offsetA  = DIM*jnrA;
+             j_coord_offsetB  = DIM*jnrB;
+             /* load j atom coordinates */
+             gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                               &jx0,&jy0,&jz0);
+             /* Calculate displacement vector */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             /* Calculate squared distance and things based on it */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+             rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+             /* Load parameters for j particles */
+             jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+             vdwjidx0A        = 2*vdwtype[jnrA+0];
+             vdwjidx0B        = 2*vdwtype[jnrB+0];
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+             /* Compute parameters for interactions between i and j atoms */
+             qq00             = _fjsp_mul_v2r8(iq0,jq0);
+             gmx_fjsp_load_2pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,
+                                          vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
+             /* Calculate table index by multiplying r with table scale and truncate to integer.
+              * vfeps is rt minus the converted-back index (presumably the fractional
+              * part). Each lane's index is multiplied by 8 to form the base offset
+              * into vftab; the loads below use offsets +0/+2 (dispersion) and
+              * +4/+6 (repulsion) from that base. */
+             rt               = _fjsp_mul_v2r8(r00,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 8;
+             vfconv.i[1]     *= 8;
+             /* REACTION-FIELD ELECTROSTATICS
+              * Assuming madd(a,b,c)=a*b+c and msub(a,b,c)=a*b-c (TODO confirm against
+              * the intrinsics reference): velec = qq*(rinv + krf*rsq - crf) and
+              * felec = qq*(rinv*rinvsq - krf2). */
+             velec            = _fjsp_mul_v2r8(qq00,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq00,rinv00),crf));
+             felec            = _fjsp_mul_v2r8(qq00,_fjsp_msub_v2r8(rinv00,rinvsq00,krf2));
+             /* CUBIC SPLINE TABLE DISPERSION
+              * Lane A's and lane B's table entries are loaded separately and then
+              * transposed so that Y,F,G,H each hold one coefficient for both lanes. */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 2 );
+             H                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 2 );
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+             VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+             vvdw6            = _fjsp_mul_v2r8(c6_00,VV);
+             FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+             fvdw6            = _fjsp_mul_v2r8(c6_00,FF);
+             /* CUBIC SPLINE TABLE REPULSION */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 4 );
+             F                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 4 );
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 6 );
+             H                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 6 );
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+             VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+             vvdw12           = _fjsp_mul_v2r8(c12_00,VV);
+             FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+             fvdw12           = _fjsp_mul_v2r8(c12_00,FF);
+             vvdw             = _fjsp_add_v2r8(vvdw12,vvdw6);
+             fvdw             = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_add_v2r8(fvdw6,fvdw12),_fjsp_mul_v2r8(vftabscale,rinv00)));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+             fscal            = _fjsp_add_v2r8(felec,fvdw);
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             
+             gmx_fjsp_decrement_fma_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fscal,dx00,dy00,dz00);
+             /* Inner loop uses 70 flops */
+         }
+         if(jidx<j_index_end)
+         {
+             jnrA             = jjnr[jidx];
+             j_coord_offsetA  = DIM*jnrA;
+             /* Odd leftover neighbour: only jnrA is read in this block, so only
+              * lane A carries a real j atom. Load j atom coordinates. */
+             gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                               &jx0,&jy0,&jz0);
+             /* Calculate displacement vector */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             /* Calculate squared distance and things based on it */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+             rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+             /* Load parameters for j particles */
+             jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
+             vdwjidx0A        = 2*vdwtype[jnrA+0];
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+             /* Compute parameters for interactions between i and j atoms */
+             qq00             = _fjsp_mul_v2r8(iq0,jq0);
+             gmx_fjsp_load_1pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
+             /* Calculate table index by multiplying r with table scale and truncate to integer.
+              * Both indices are scaled as in the main loop, but only vfconv.i[0]
+              * is used for table loads in this block. */
+             rt               = _fjsp_mul_v2r8(r00,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 8;
+             vfconv.i[1]     *= 8;
+             /* REACTION-FIELD ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq00,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq00,rinv00),crf));
+             felec            = _fjsp_mul_v2r8(qq00,_fjsp_msub_v2r8(rinv00,rinvsq00,krf2));
+             /* CUBIC SPLINE TABLE DISPERSION
+              * The second transpose input is zero instead of a lane-B table load. */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 2 );
+             H                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+             VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+             vvdw6            = _fjsp_mul_v2r8(c6_00,VV);
+             FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+             fvdw6            = _fjsp_mul_v2r8(c6_00,FF);
+             /* CUBIC SPLINE TABLE REPULSION */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 4 );
+             F                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 6 );
+             H                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+             VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+             vvdw12           = _fjsp_mul_v2r8(c12_00,VV);
+             FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+             fvdw12           = _fjsp_mul_v2r8(c12_00,FF);
+             vvdw             = _fjsp_add_v2r8(vvdw12,vvdw6);
+             fvdw             = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_add_v2r8(fvdw6,fvdw12),_fjsp_mul_v2r8(vftabscale,rinv00)));
+             /* Update potential sum for this i atom from the interaction with this j atom.
+              * The unpacklo with a zero vector presumably clears the lane that holds
+              * no real j atom, so it adds nothing to the sums or forces - confirm
+              * against the intrinsics reference. */
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             vvdw             = _fjsp_unpacklo_v2r8(vvdw,_fjsp_setzero_v2r8());
+             vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+             fscal            = _fjsp_add_v2r8(felec,fvdw);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             
+             gmx_fjsp_decrement_fma_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fscal,dx00,dy00,dz00);
+             /* Inner loop uses 70 flops */
+         }
+         /* End of innermost loop: hand the accumulated i force to the helper together
+          * with the i atom's force slot and this list's shift-force slot. */
+         gmx_fjsp_update_iforce_1atom_swizzle_v2r8(fix0,fiy0,fiz0,
+                                               f+i_coord_offset,fshift+i_shift_offset);
+         ggid                        = gid[iidx];
+         /* Update potential energies for the energy-group entry selected by gid[iidx] */
+         gmx_fjsp_update_1pot_v2r8(velecsum,kernel_data->energygrp_elec+ggid);
+         gmx_fjsp_update_1pot_v2r8(vvdwsum,kernel_data->energygrp_vdw+ggid);
+         /* Increment number of inner iterations */
+         inneriter                  += j_index_end - j_index_start;
+         /* Outer loop uses 9 flops */
+     }
+     /* Increment number of outer iterations */
+     outeriter        += nri;
+     /* Update outer/inner flops: 9 per outer iteration plus 70 per j neighbour */
+     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_VF,outeriter*9 + inneriter*70);
+ }
+ /*
+  * Gromacs nonbonded kernel:   nb_kernel_ElecRF_VdwCSTab_GeomP1P1_F_sparc64_hpc_ace_double
+  * Electrostatics interaction: ReactionField
+  * VdW interaction:            CubicSplineTable
+  * Geometry:                   Particle-Particle
+  * Calculate force/pot:        Force
+  */
+ void
+ nb_kernel_ElecRF_VdwCSTab_GeomP1P1_F_sparc64_hpc_ace_double
+                     (t_nblist * gmx_restrict                nlist,
+                      rvec * gmx_restrict                    xx,
+                      rvec * gmx_restrict                    ff,
+                      t_forcerec * gmx_restrict              fr,
+                      t_mdatoms * gmx_restrict               mdatoms,
+                      nb_kernel_data_t * gmx_restrict        kernel_data,
+                      t_nrnb * gmx_restrict                  nrnb)
+ {
+     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+      * just 0 for non-waters.
+      * Suffixes A,B refer to j loop unrolling done with double precision SIMD, i.e. the two different
+      * jnr indices whose data is put in the two positions of the SIMD register.
+      */
+     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+     int              jnrA,jnrB;
+     int              j_coord_offsetA,j_coord_offsetB;
+     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+     real             rcutoff_scalar;
+     real             *shiftvec,*fshift,*x,*f;
+     _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+     int              vdwioffset0;
+     _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+     int              vdwjidx0A,vdwjidx0B;
+     _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+     _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+     _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+     real             *charge;
+     int              nvdwtype;
+     _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+     int              *vdwtype;
+     real             *vdwparam;
+     _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+     _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+     _fjsp_v2r8       rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF,twovfeps;
+     real             *vftab;
+     _fjsp_v2r8       itab_tmp;
+     _fjsp_v2r8       dummy_mask,cutoff_mask;
+     _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+     _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+     union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+     x                = xx[0];
+     f                = ff[0];
+     nri              = nlist->nri;
+     iinr             = nlist->iinr;
+     jindex           = nlist->jindex;
+     jjnr             = nlist->jjnr;
+     shiftidx         = nlist->shift;
+     gid              = nlist->gid;
+     shiftvec         = fr->shift_vec[0];
+     fshift           = fr->fshift[0];
+     facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+     charge           = mdatoms->chargeA;
+     krf              = gmx_fjsp_set1_v2r8(fr->ic->k_rf);
+     krf2             = gmx_fjsp_set1_v2r8(fr->ic->k_rf*2.0);
+     crf              = gmx_fjsp_set1_v2r8(fr->ic->c_rf);
+     nvdwtype         = fr->ntype;
+     vdwparam         = fr->nbfp;
+     vdwtype          = mdatoms->typeA;
+     vftab            = kernel_data->table_vdw->data;
+     vftabscale       = gmx_fjsp_set1_v2r8(kernel_data->table_vdw->scale);
+     /* Avoid stupid compiler warnings */
+     jnrA = jnrB = 0;
+     j_coord_offsetA = 0;
+     j_coord_offsetB = 0;
+     outeriter        = 0;
+     inneriter        = 0;
+     /* Start outer loop over neighborlists */
+     for(iidx=0; iidx<nri; iidx++)
+     {
+         /* Load shift vector for this list */
+         i_shift_offset   = DIM*shiftidx[iidx];
+         /* Load limits for loop over neighbors */
+         j_index_start    = jindex[iidx];
+         j_index_end      = jindex[iidx+1];
+         /* Get outer coordinate index */
+         inr              = iinr[iidx];
+         i_coord_offset   = DIM*inr;
+         /* Load i particle coords and add shift vector */
+         gmx_fjsp_load_shift_and_1rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
+         fix0             = _fjsp_setzero_v2r8();
+         fiy0             = _fjsp_setzero_v2r8();
+         fiz0             = _fjsp_setzero_v2r8();
+         /* Load parameters for i particles */
+         iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_load1_v2r8(charge+inr+0));
+         vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+         /* Start inner kernel loop */
+         for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+         {
+             /* Get j neighbor index, and coordinate index */
+             jnrA             = jjnr[jidx];
+             jnrB             = jjnr[jidx+1];
+             j_coord_offsetA  = DIM*jnrA;
+             j_coord_offsetB  = DIM*jnrB;
+             /* load j atom coordinates */
+             gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                               &jx0,&jy0,&jz0);
+             /* Calculate displacement vector */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             /* Calculate squared distance and things based on it */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+             rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+             /* Load parameters for j particles */
+             jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+             vdwjidx0A        = 2*vdwtype[jnrA+0];
+             vdwjidx0B        = 2*vdwtype[jnrB+0];
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+             /* Compute parameters for interactions between i and j atoms */
+             qq00             = _fjsp_mul_v2r8(iq0,jq0);
+             gmx_fjsp_load_2pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,
+                                          vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
+             /* Calculate table index by multiplying r with table scale and truncate to integer */
+             rt               = _fjsp_mul_v2r8(r00,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 8;
+             vfconv.i[1]     *= 8;
+             /* REACTION-FIELD ELECTROSTATICS */
+             felec            = _fjsp_mul_v2r8(qq00,_fjsp_msub_v2r8(rinv00,rinvsq00,krf2));
+             /* CUBIC SPLINE TABLE DISPERSION */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 2 );
+             H                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 2 );
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+             FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+             fvdw6            = _fjsp_mul_v2r8(c6_00,FF);
+             /* CUBIC SPLINE TABLE REPULSION */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 4 );
+             F                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 4 );
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 6 );
+             H                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 6 );
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+             FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+             fvdw12           = _fjsp_mul_v2r8(c12_00,FF);
+             fvdw             = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_add_v2r8(fvdw6,fvdw12),_fjsp_mul_v2r8(vftabscale,rinv00)));
+             fscal            = _fjsp_add_v2r8(felec,fvdw);
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             
+             gmx_fjsp_decrement_fma_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fscal,dx00,dy00,dz00);
+             /* Inner loop uses 57 flops */
+         }
+         if(jidx<j_index_end)
+         {
+             jnrA             = jjnr[jidx];
+             j_coord_offsetA  = DIM*jnrA;
+             /* load j atom coordinates */
+             gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                               &jx0,&jy0,&jz0);
+             /* Calculate displacement vector */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             /* Calculate squared distance and things based on it */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+             rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+             /* Load parameters for j particles */
+             jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
+             vdwjidx0A        = 2*vdwtype[jnrA+0];
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+             /* Compute parameters for interactions between i and j atoms */
+             qq00             = _fjsp_mul_v2r8(iq0,jq0);
+             gmx_fjsp_load_1pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
+             /* Calculate table index by multiplying r with table scale and truncate to integer */
+             rt               = _fjsp_mul_v2r8(r00,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 8;
+             vfconv.i[1]     *= 8;
+             /* REACTION-FIELD ELECTROSTATICS */
+             felec            = _fjsp_mul_v2r8(qq00,_fjsp_msub_v2r8(rinv00,rinvsq00,krf2));
+             /* CUBIC SPLINE TABLE DISPERSION */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 2 );
+             H                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+             FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+             fvdw6            = _fjsp_mul_v2r8(c6_00,FF);
+             /* CUBIC SPLINE TABLE REPULSION */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 4 );
+             F                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 6 );
+             H                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+             FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+             fvdw12           = _fjsp_mul_v2r8(c12_00,FF);
+             fvdw             = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_add_v2r8(fvdw6,fvdw12),_fjsp_mul_v2r8(vftabscale,rinv00)));
+             fscal            = _fjsp_add_v2r8(felec,fvdw);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             
+             gmx_fjsp_decrement_fma_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fscal,dx00,dy00,dz00);
+             /* Inner loop uses 57 flops */
+         }
+         /* End of innermost loop */
+         gmx_fjsp_update_iforce_1atom_swizzle_v2r8(fix0,fiy0,fiz0,
+                                               f+i_coord_offset,fshift+i_shift_offset);
+         /* Increment number of inner iterations */
+         inneriter                  += j_index_end - j_index_start;
+         /* Outer loop uses 7 flops */
+     }
+     /* Increment number of outer iterations */
+     outeriter        += nri;
+     /* Update outer/inner flops */
+     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_F,outeriter*7 + inneriter*57);
+ }
index 0000000000000000000000000000000000000000,681fc26e7dbb3fc393daab8e2d52f73b0d0b1b96..681fc26e7dbb3fc393daab8e2d52f73b0d0b1b96
mode 000000,100644..100644
--- /dev/null
@@@ -1,0 -1,989 +1,989 @@@
+ /*
+  * This file is part of the GROMACS molecular simulation package.
+  *
+  * Copyright (c) 2012, by the GROMACS development team, led by
+  * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+  * others, as listed in the AUTHORS file in the top-level source
+  * directory and at http://www.gromacs.org.
+  *
+  * GROMACS is free software; you can redistribute it and/or
+  * modify it under the terms of the GNU Lesser General Public License
+  * as published by the Free Software Foundation; either version 2.1
+  * of the License, or (at your option) any later version.
+  *
+  * GROMACS is distributed in the hope that it will be useful,
+  * but WITHOUT ANY WARRANTY; without even the implied warranty of
+  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  * Lesser General Public License for more details.
+  *
+  * You should have received a copy of the GNU Lesser General Public
+  * License along with GROMACS; if not, see
+  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+  *
+  * If you want to redistribute modifications to GROMACS, please
+  * consider that scientific software is very special. Version
+  * control is crucial - bugs must be traceable. We will be happy to
+  * consider code for inclusion in the official distribution, but
+  * derived work must not be called official GROMACS. Details are found
+  * in the README & COPYING files - if they are missing, get the
+  * official version at http://www.gromacs.org.
+  *
+  * To help us fund GROMACS development, we humbly ask that you cite
+  * the research papers on the package. Check out http://www.gromacs.org.
+  */
+ /*
+  * Note: this file was generated by the GROMACS sparc64_hpc_ace_double kernel generator.
+  */
+ #ifdef HAVE_CONFIG_H
+ #include <config.h>
+ #endif
+ #include <math.h>
+ #include "../nb_kernel.h"
+ #include "types/simple.h"
+ #include "vec.h"
+ #include "nrnb.h"
+ #include "kernelutil_sparc64_hpc_ace_double.h"
+ /*
+  * Gromacs nonbonded kernel:   nb_kernel_ElecRF_VdwCSTab_GeomW3P1_VF_sparc64_hpc_ace_double
+  * Electrostatics interaction: ReactionField
+  * VdW interaction:            CubicSplineTable
+  * Geometry:                   Water3-Particle
+  * Calculate force/pot:        PotentialAndForce
+  *
+  * For every i entry of the neighbor list, three i atoms (suffixes 0,1,2)
+  * interact with each j atom of that entry. All three i atoms get a
+  * reaction-field electrostatics term; only i atom 0 additionally gets the
+  * tabulated VdW (dispersion + repulsion) term. j atoms are processed two at
+  * a time (suffixes A,B), followed by a single-atom epilogue when the number
+  * of j atoms is odd.
+  *
+  * Parameters:
+  *   nlist       - neighbor list; nri, iinr, jindex, jjnr, shift and gid are read.
+  *   xx          - atom coordinates, accessed as a flat real array via xx[0].
+  *   ff          - atom forces, accessed as a flat real array via ff[0] and
+  *                 handed to the force update/decrement helpers.
+  *   fr          - force record; shift_vec, fshift, epsfac, ic->k_rf, ic->c_rf,
+  *                 ntype and nbfp are used.
+  *   mdatoms     - chargeA and typeA are read.
+  *   kernel_data - table_vdw (data and scale) is read; energygrp_elec and
+  *                 energygrp_vdw are passed to the potential update helper.
+  *   nrnb        - flop accounting, updated once at the end through inc_nrnb().
+  */
+ void
+ nb_kernel_ElecRF_VdwCSTab_GeomW3P1_VF_sparc64_hpc_ace_double
+                     (t_nblist * gmx_restrict                nlist,
+                      rvec * gmx_restrict                    xx,
+                      rvec * gmx_restrict                    ff,
+                      t_forcerec * gmx_restrict              fr,
+                      t_mdatoms * gmx_restrict               mdatoms,
+                      nb_kernel_data_t * gmx_restrict        kernel_data,
+                      t_nrnb * gmx_restrict                  nrnb)
+ {
+     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+      * just 0 for non-waters.
+      * Suffixes A,B refer to j loop unrolling done with double precision SIMD, i.e. the two different
+      * jnr indices corresponding to data put in the two positions in the SIMD register.
+      * Several of the locals declared below are never referenced in this kernel
+      * (for example gbconv, ewconv, dummy_mask, cutoff_mask, one, two).
+      */
+     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+     int              jnrA,jnrB;
+     int              j_coord_offsetA,j_coord_offsetB;
+     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+     real             rcutoff_scalar;
+     real             *shiftvec,*fshift,*x,*f;
+     _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+     int              vdwioffset0;
+     _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+     int              vdwioffset1;
+     _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+     int              vdwioffset2;
+     _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+     int              vdwjidx0A,vdwjidx0B;
+     _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+     _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+     _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+     _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+     _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+     real             *charge;
+     int              nvdwtype;
+     _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+     int              *vdwtype;
+     real             *vdwparam;
+     _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+     _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+     _fjsp_v2r8       rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF,twovfeps;
+     real             *vftab;
+     _fjsp_v2r8       itab_tmp;
+     _fjsp_v2r8       dummy_mask,cutoff_mask;
+     _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+     _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+     union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+     x                = xx[0];
+     f                = ff[0];
+     nri              = nlist->nri;
+     iinr             = nlist->iinr;
+     jindex           = nlist->jindex;
+     jjnr             = nlist->jjnr;
+     shiftidx         = nlist->shift;
+     gid              = nlist->gid;
+     shiftvec         = fr->shift_vec[0];
+     fshift           = fr->fshift[0];
+     facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+     charge           = mdatoms->chargeA;
+     krf              = gmx_fjsp_set1_v2r8(fr->ic->k_rf);
+     krf2             = gmx_fjsp_set1_v2r8(fr->ic->k_rf*2.0);
+     crf              = gmx_fjsp_set1_v2r8(fr->ic->c_rf);
+     nvdwtype         = fr->ntype;
+     vdwparam         = fr->nbfp;
+     vdwtype          = mdatoms->typeA;
+     vftab            = kernel_data->table_vdw->data;
+     vftabscale       = gmx_fjsp_set1_v2r8(kernel_data->table_vdw->scale);
+     /* Setup water-specific parameters.
+      * The three i charges (pre-multiplied by facel) and the VdW parameter row
+      * offset of i atom 0 are taken once, from the first i entry of the list,
+      * and reused for every outer iteration; this presumably relies on all i
+      * molecules in the list sharing the same charges and atom type - confirm
+      * against the neighbor list construction.
+      * NOTE(review): nlist->iinr[0] is read here before nri is consulted, so
+      * the list is assumed to have at least one readable i entry.
+      */
+     inr              = nlist->iinr[0];
+     iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+0]));
+     iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+     iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+     vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+     /* Give these a defined value so compilers do not warn about possibly uninitialized use */
+     jnrA = jnrB = 0;
+     j_coord_offsetA = 0;
+     j_coord_offsetB = 0;
+     outeriter        = 0;
+     inneriter        = 0;
+     /* Start outer loop over neighborlists */
+     for(iidx=0; iidx<nri; iidx++)
+     {
+         /* Load shift vector for this list */
+         i_shift_offset   = DIM*shiftidx[iidx];
+         /* Load limits for loop over neighbors: j entries [j_index_start, j_index_end) of jjnr */
+         j_index_start    = jindex[iidx];
+         j_index_end      = jindex[iidx+1];
+         /* Get outer coordinate index */
+         inr              = iinr[iidx];
+         i_coord_offset   = DIM*inr;
+         /* Load i particle coords and add shift vector */
+         gmx_fjsp_load_shift_and_3rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                  &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+         fix0             = _fjsp_setzero_v2r8();
+         fiy0             = _fjsp_setzero_v2r8();
+         fiz0             = _fjsp_setzero_v2r8();
+         fix1             = _fjsp_setzero_v2r8();
+         fiy1             = _fjsp_setzero_v2r8();
+         fiz1             = _fjsp_setzero_v2r8();
+         fix2             = _fjsp_setzero_v2r8();
+         fiy2             = _fjsp_setzero_v2r8();
+         fiz2             = _fjsp_setzero_v2r8();
+         /* Reset potential sums */
+         velecsum         = _fjsp_setzero_v2r8();
+         vvdwsum          = _fjsp_setzero_v2r8();
+         /* Start inner kernel loop.
+          * Two j atoms (A and B) are handled per iteration; a final unpaired
+          * j atom, if any, is handled by the epilogue after this loop.
+          */
+         for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+         {
+             /* Get j neighbor index, and coordinate index */
+             jnrA             = jjnr[jidx];
+             jnrB             = jjnr[jidx+1];
+             j_coord_offsetA  = DIM*jnrA;
+             j_coord_offsetB  = DIM*jnrB;
+             /* load j atom coordinates */
+             gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                               &jx0,&jy0,&jz0);
+             /* Calculate displacement vector */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             dx10             = _fjsp_sub_v2r8(ix1,jx0);
+             dy10             = _fjsp_sub_v2r8(iy1,jy0);
+             dz10             = _fjsp_sub_v2r8(iz1,jz0);
+             dx20             = _fjsp_sub_v2r8(ix2,jx0);
+             dy20             = _fjsp_sub_v2r8(iy2,jy0);
+             dz20             = _fjsp_sub_v2r8(iz2,jz0);
+             /* Calculate squared distance and things based on it */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+             rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+             rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+             rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+             rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+             rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+             rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+             rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+             /* Load parameters for j particles.
+              * fjx0/fjy0/fjz0 collect the j-atom force contribution from all
+              * three i atoms before it is handed to the decrement helper below.
+              */
+             jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+             vdwjidx0A        = 2*vdwtype[jnrA+0];
+             vdwjidx0B        = 2*vdwtype[jnrB+0];
+             fjx0             = _fjsp_setzero_v2r8();
+             fjy0             = _fjsp_setzero_v2r8();
+             fjz0             = _fjsp_setzero_v2r8();
+             /**************************
+              * CALCULATE INTERACTIONS *
+              * i atom 0 - j atom 0:   *
+              * electrostatics + VdW   *
+              **************************/
+             r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+             /* Compute parameters for interactions between i and j atoms */
+             qq00             = _fjsp_mul_v2r8(iq0,jq0);
+             gmx_fjsp_load_2pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,
+                                          vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
+             /* Calculate table index by multiplying r with table scale and truncate to integer.
+              * vfeps is the remaining fractional part. The integer indices are read back
+              * through the vfconv union and multiplied by 8, the number of reals addressed
+              * per table point here (offsets 0,2 for dispersion and 4,6 for repulsion).
+              */
+             rt               = _fjsp_mul_v2r8(r00,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 8;
+             vfconv.i[1]     *= 8;
+             /* REACTION-FIELD ELECTROSTATICS
+              * Assuming madd(a,b,c)=a*b+c and msub(a,b,c)=a*b-c (TODO confirm against the
+              * intrinsic reference): velec = qq*(rinv + krf*rsq - crf) and
+              * felec = qq*(rinv*rinvsq - krf2).
+              */
+             velec            = _fjsp_mul_v2r8(qq00,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq00,rinv00),crf));
+             felec            = _fjsp_mul_v2r8(qq00,_fjsp_msub_v2r8(rinv00,rinvsq00,krf2));
+             /* CUBIC SPLINE TABLE DISPERSION
+              * Table data for j atoms A and B is loaded separately and transposed so that
+              * Y,F,G,H each hold one spline coefficient for both atoms.
+              */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 2 );
+             H                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 2 );
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+             VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+             vvdw6            = _fjsp_mul_v2r8(c6_00,VV);
+             FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+             fvdw6            = _fjsp_mul_v2r8(c6_00,FF);
+             /* CUBIC SPLINE TABLE REPULSION (same scheme, table offsets 4 and 6) */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 4 );
+             F                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 4 );
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 6 );
+             H                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 6 );
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+             VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+             vvdw12           = _fjsp_mul_v2r8(c12_00,VV);
+             FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+             fvdw12           = _fjsp_mul_v2r8(c12_00,FF);
+             vvdw             = _fjsp_add_v2r8(vvdw12,vvdw6);
+             fvdw             = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_add_v2r8(fvdw6,fvdw12),_fjsp_mul_v2r8(vftabscale,rinv00)));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+             fscal            = _fjsp_add_v2r8(felec,fvdw);
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             
+             fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              * i atom 1 - j atom 0:   *
+              * electrostatics only    *
+              **************************/
+             /* Compute parameters for interactions between i and j atoms */
+             qq10             = _fjsp_mul_v2r8(iq1,jq0);
+             /* REACTION-FIELD ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq10,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq10,rinv10),crf));
+             felec            = _fjsp_mul_v2r8(qq10,_fjsp_msub_v2r8(rinv10,rinvsq10,krf2));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+             
+             fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              * i atom 2 - j atom 0:   *
+              * electrostatics only    *
+              **************************/
+             /* Compute parameters for interactions between i and j atoms */
+             qq20             = _fjsp_mul_v2r8(iq2,jq0);
+             /* REACTION-FIELD ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq20,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq20,rinv20),crf));
+             felec            = _fjsp_mul_v2r8(qq20,_fjsp_msub_v2r8(rinv20,rinvsq20,krf2));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+             
+             fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+             gmx_fjsp_decrement_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0);
+             /* Inner loop uses 143 flops */
+         }
+         if(jidx<j_index_end) /* epilogue: one unpaired j atom is left (odd number of j entries) */
+         {
+             jnrA             = jjnr[jidx];
+             j_coord_offsetA  = DIM*jnrA;
+             /* load j atom coordinates (single pointer variant; only j atom A exists here) */
+             gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                               &jx0,&jy0,&jz0);
+             /* Calculate displacement vector */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             dx10             = _fjsp_sub_v2r8(ix1,jx0);
+             dy10             = _fjsp_sub_v2r8(iy1,jy0);
+             dz10             = _fjsp_sub_v2r8(iz1,jz0);
+             dx20             = _fjsp_sub_v2r8(ix2,jx0);
+             dy20             = _fjsp_sub_v2r8(iy2,jy0);
+             dz20             = _fjsp_sub_v2r8(iz2,jz0);
+             /* Calculate squared distance and things based on it */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+             rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+             rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+             rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+             rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+             rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+             rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+             rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+             /* Load parameters for j particles */
+             jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
+             vdwjidx0A        = 2*vdwtype[jnrA+0];
+             fjx0             = _fjsp_setzero_v2r8();
+             fjy0             = _fjsp_setzero_v2r8();
+             fjz0             = _fjsp_setzero_v2r8();
+             /**************************
+              * CALCULATE INTERACTIONS *
+              * i atom 0 - j atom 0:   *
+              * electrostatics + VdW   *
+              **************************/
+             r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+             /* Compute parameters for interactions between i and j atoms */
+             qq00             = _fjsp_mul_v2r8(iq0,jq0);
+             gmx_fjsp_load_1pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
+             /* Calculate table index by multiplying r with table scale and truncate to integer.
+              * Both indices are scaled as in the paired loop, but only vfconv.i[0] is used
+              * for the table loads below.
+              */
+             rt               = _fjsp_mul_v2r8(r00,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 8;
+             vfconv.i[1]     *= 8;
+             /* REACTION-FIELD ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq00,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq00,rinv00),crf));
+             felec            = _fjsp_mul_v2r8(qq00,_fjsp_msub_v2r8(rinv00,rinvsq00,krf2));
+             /* CUBIC SPLINE TABLE DISPERSION
+              * There is no j atom B, so the second operand of each transpose is zero
+              * instead of a second table load.
+              */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 2 );
+             H                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+             VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+             vvdw6            = _fjsp_mul_v2r8(c6_00,VV);
+             FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+             fvdw6            = _fjsp_mul_v2r8(c6_00,FF);
+             /* CUBIC SPLINE TABLE REPULSION (same scheme, table offsets 4 and 6) */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 4 );
+             F                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 6 );
+             H                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+             VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+             vvdw12           = _fjsp_mul_v2r8(c12_00,VV);
+             FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+             fvdw12           = _fjsp_mul_v2r8(c12_00,FF);
+             vvdw             = _fjsp_add_v2r8(vvdw12,vvdw6);
+             fvdw             = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_add_v2r8(fvdw6,fvdw12),_fjsp_mul_v2r8(vftabscale,rinv00)));
+             /* Update potential sum for this i atom from the interaction with this j atom.
+              * Each value is first combined with a zero vector via unpacklo; presumably this
+              * keeps the element belonging to j atom A and clears the unused one so that it
+              * does not contribute to the sums or forces (TODO confirm intrinsic semantics).
+              */
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             vvdw             = _fjsp_unpacklo_v2r8(vvdw,_fjsp_setzero_v2r8());
+             vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+             fscal            = _fjsp_add_v2r8(felec,fvdw);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             
+             fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              * i atom 1 - j atom 0:   *
+              * electrostatics only    *
+              **************************/
+             /* Compute parameters for interactions between i and j atoms */
+             qq10             = _fjsp_mul_v2r8(iq1,jq0);
+             /* REACTION-FIELD ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq10,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq10,rinv10),crf));
+             felec            = _fjsp_mul_v2r8(qq10,_fjsp_msub_v2r8(rinv10,rinvsq10,krf2));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+             
+             fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              * i atom 2 - j atom 0:   *
+              * electrostatics only    *
+              **************************/
+             /* Compute parameters for interactions between i and j atoms */
+             qq20             = _fjsp_mul_v2r8(iq2,jq0);
+             /* REACTION-FIELD ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq20,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq20,rinv20),crf));
+             felec            = _fjsp_mul_v2r8(qq20,_fjsp_msub_v2r8(rinv20,rinvsq20,krf2));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+             
+             fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+             gmx_fjsp_decrement_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0);
+             /* Inner loop uses 143 flops */
+         }
+         /* End of innermost loop.
+          * Hand the accumulated forces of the three i atoms to the helper together with
+          * the i force and shift force locations for this list entry.
+          */
+         gmx_fjsp_update_iforce_3atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                               f+i_coord_offset,fshift+i_shift_offset);
+         ggid                        = gid[iidx];
+         /* Update potential energies in the energy group slot selected by gid for this entry */
+         gmx_fjsp_update_1pot_v2r8(velecsum,kernel_data->energygrp_elec+ggid);
+         gmx_fjsp_update_1pot_v2r8(vvdwsum,kernel_data->energygrp_vdw+ggid);
+         /* Increment number of inner iterations (counted per j atom, not per SIMD pair) */
+         inneriter                  += j_index_end - j_index_start;
+         /* Outer loop uses 20 flops */
+     }
+     /* Increment number of outer iterations */
+     outeriter        += nri;
+     /* Update outer/inner flops using the per-iteration costs noted above (20 outer, 143 inner) */
+     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3_VF,outeriter*20 + inneriter*143);
+ }
+ /*
+  * Gromacs nonbonded kernel:   nb_kernel_ElecRF_VdwCSTab_GeomW3P1_F_sparc64_hpc_ace_double
+  * Electrostatics interaction: ReactionField
+  * VdW interaction:            CubicSplineTable
+  * Geometry:                   Water3-Particle
+  * Calculate force/pot:        Force
+  */
+ void
+ nb_kernel_ElecRF_VdwCSTab_GeomW3P1_F_sparc64_hpc_ace_double
+                     (t_nblist * gmx_restrict                nlist,
+                      rvec * gmx_restrict                    xx,
+                      rvec * gmx_restrict                    ff,
+                      t_forcerec * gmx_restrict              fr,
+                      t_mdatoms * gmx_restrict               mdatoms,
+                      nb_kernel_data_t * gmx_restrict        kernel_data,
+                      t_nrnb * gmx_restrict                  nrnb)
+ {
+     /* Force-only kernel: for each i entry of nlist (3 atoms, suffixes 0,1,2) the forces with its j particles (suffix 0)
+      * are handed to the gmx_fjsp_* helpers together with ff and fr->fshift. Two-digit suffixes name an i-j atom pair,
+      * e.g. dx10 is formed from ix1 and jx0. Suffixes A,B refer to j loop unrolling done with double precision SIMD,
+      * i.e. the two different jnr indices (jnrA,jnrB) handled together in one inner-loop iteration.
+      * NOTE(review): several locals declared below (e.g. velecsum, vvdwsum, ggid, rcutoff) are never used in this function. */
+     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+     int              jnrA,jnrB;
+     int              j_coord_offsetA,j_coord_offsetB;
+     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+     real             rcutoff_scalar;
+     real             *shiftvec,*fshift,*x,*f;
+     _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+     int              vdwioffset0;
+     _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+     int              vdwioffset1;
+     _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+     int              vdwioffset2;
+     _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+     int              vdwjidx0A,vdwjidx0B;
+     _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+     _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+     _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+     _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+     _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+     real             *charge;
+     int              nvdwtype;
+     _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+     int              *vdwtype;
+     real             *vdwparam;
+     _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+     _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+     _fjsp_v2r8       rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF,twovfeps;
+     real             *vftab;
+     _fjsp_v2r8       itab_tmp;
+     _fjsp_v2r8       dummy_mask,cutoff_mask;
+     _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+     _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+     union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv; /* vfconv exposes the stored table index as two integers; gbconv,ewconv are unused here */
+     x                = xx[0];
+     f                = ff[0];
+     nri              = nlist->nri;
+     iinr             = nlist->iinr;
+     jindex           = nlist->jindex;
+     jjnr             = nlist->jjnr;
+     shiftidx         = nlist->shift;
+     gid              = nlist->gid; /* read but never used in this function */
+     shiftvec         = fr->shift_vec[0];
+     fshift           = fr->fshift[0];
+     facel            = gmx_fjsp_set1_v2r8(fr->epsfac); /* folded into the i charges iq0..iq2 below */
+     charge           = mdatoms->chargeA;
+     krf              = gmx_fjsp_set1_v2r8(fr->ic->k_rf); /* set but unused here; the force uses krf2 only */
+     krf2             = gmx_fjsp_set1_v2r8(fr->ic->k_rf*2.0);
+     crf              = gmx_fjsp_set1_v2r8(fr->ic->c_rf); /* set but unused in this function */
+     nvdwtype         = fr->ntype;
+     vdwparam         = fr->nbfp;
+     vdwtype          = mdatoms->typeA;
+     vftab            = kernel_data->table_vdw->data;
+     vftabscale       = gmx_fjsp_set1_v2r8(kernel_data->table_vdw->scale);
+     /* Setup water-specific parameters: read once from the first i entry and reused for every iidx, which presumes all i entries share charges and VdW type - TODO confirm */
+     inr              = nlist->iinr[0];
+     iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+0]));
+     iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+     iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+     vdwioffset0      = 2*nvdwtype*vdwtype[inr+0]; /* only i atom 0 gets a VdW parameter offset; pairs 10 and 20 below are electrostatics only */
+     /* Pre-set the j indices to avoid compiler warnings; the two iteration counters genuinely have to start at zero */
+     jnrA = jnrB = 0;
+     j_coord_offsetA = 0;
+     j_coord_offsetB = 0;
+     outeriter        = 0;
+     inneriter        = 0;
+     /* Start outer loop over neighborlists */
+     for(iidx=0; iidx<nri; iidx++)
+     {
+         /* Load shift vector for this list */
+         i_shift_offset   = DIM*shiftidx[iidx];
+         /* Load limits for loop over neighbors */
+         j_index_start    = jindex[iidx];
+         j_index_end      = jindex[iidx+1];
+         /* Get outer coordinate index */
+         inr              = iinr[iidx];
+         i_coord_offset   = DIM*inr;
+         /* Load i particle coords and add shift vector */
+         gmx_fjsp_load_shift_and_3rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                  &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+         fix0             = _fjsp_setzero_v2r8();
+         fiy0             = _fjsp_setzero_v2r8();
+         fiz0             = _fjsp_setzero_v2r8();
+         fix1             = _fjsp_setzero_v2r8();
+         fiy1             = _fjsp_setzero_v2r8();
+         fiz1             = _fjsp_setzero_v2r8();
+         fix2             = _fjsp_setzero_v2r8();
+         fiy2             = _fjsp_setzero_v2r8();
+         fiz2             = _fjsp_setzero_v2r8();
+         /* Start inner kernel loop: two j entries per iteration; an odd last entry is left for the block after the loop */
+         for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+         {
+             /* Get j neighbor index, and coordinate index */
+             jnrA             = jjnr[jidx];
+             jnrB             = jjnr[jidx+1];
+             j_coord_offsetA  = DIM*jnrA;
+             j_coord_offsetB  = DIM*jnrB;
+             /* load j atom coordinates */
+             gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                               &jx0,&jy0,&jz0);
+             /* Calculate displacement vector */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             dx10             = _fjsp_sub_v2r8(ix1,jx0);
+             dy10             = _fjsp_sub_v2r8(iy1,jy0);
+             dz10             = _fjsp_sub_v2r8(iz1,jz0);
+             dx20             = _fjsp_sub_v2r8(ix2,jx0);
+             dy20             = _fjsp_sub_v2r8(iy2,jy0);
+             dz20             = _fjsp_sub_v2r8(iz2,jz0);
+             /* Calculate squared distance and things based on it */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+             rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+             rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+             rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+             rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+             rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+             rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+             rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+             /* Load parameters for j particles */
+             jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+             vdwjidx0A        = 2*vdwtype[jnrA+0];
+             vdwjidx0B        = 2*vdwtype[jnrB+0];
+             fjx0             = _fjsp_setzero_v2r8();
+             fjy0             = _fjsp_setzero_v2r8();
+             fjz0             = _fjsp_setzero_v2r8();
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+             /* Compute parameters for interactions between i and j atoms */
+             qq00             = _fjsp_mul_v2r8(iq0,jq0);
+             gmx_fjsp_load_2pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,
+                                          vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
+             /* Calculate table index by multiplying r with table scale and truncate to integer; vfeps is the remainder rt minus that index */
+             rt               = _fjsp_mul_v2r8(r00,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 8; /* table stride of 8 reals per point: dispersion is read at +0,+2 and repulsion at +4,+6 below */
+             vfconv.i[1]     *= 8;
+             /* REACTION-FIELD ELECTROSTATICS: felec = qq*(rinv*rinvsq - 2*k_rf), assuming msub(a,b,c) computes a*b-c */
+             felec            = _fjsp_mul_v2r8(qq00,_fjsp_msub_v2r8(rinv00,rinvsq00,krf2));
+             /* CUBIC SPLINE TABLE DISPERSION: FF works out to F + 2*G*eps + 3*H*eps^2, assuming madd(a,b,c) computes a*b+c */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 2 );
+             H                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 2 );
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+             FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+             fvdw6            = _fjsp_mul_v2r8(c6_00,FF);
+             /* CUBIC SPLINE TABLE REPULSION: same evaluation on the second half of the table point, scaled by c12 */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 4 );
+             F                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 4 );
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 6 );
+             H                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 6 );
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+             FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+             fvdw12           = _fjsp_mul_v2r8(c12_00,FF);
+             fvdw             = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_add_v2r8(fvdw6,fvdw12),_fjsp_mul_v2r8(vftabscale,rinv00)));
+             fscal            = _fjsp_add_v2r8(felec,fvdw);
+             /* Update vectorial force: the same dx*fscal term goes into the i accumulator and the per-iteration j accumulator */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             
+             fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* Compute parameters for interactions between i and j atoms */
+             qq10             = _fjsp_mul_v2r8(iq1,jq0);
+             /* REACTION-FIELD ELECTROSTATICS (the only term for the 1-0 pair) */
+             felec            = _fjsp_mul_v2r8(qq10,_fjsp_msub_v2r8(rinv10,rinvsq10,krf2));
+             fscal            = felec;
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+             
+             fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* Compute parameters for interactions between i and j atoms */
+             qq20             = _fjsp_mul_v2r8(iq2,jq0);
+             /* REACTION-FIELD ELECTROSTATICS (the only term for the 2-0 pair) */
+             felec            = _fjsp_mul_v2r8(qq20,_fjsp_msub_v2r8(rinv20,rinvsq20,krf2));
+             fscal            = felec;
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+             
+             fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+             gmx_fjsp_decrement_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0);
+             /* Inner loop uses 120 flops */
+         }
+         if(jidx<j_index_end) /* one j entry is left over (odd count): same math with only the A entry loaded from memory */
+         {
+             jnrA             = jjnr[jidx];
+             j_coord_offsetA  = DIM*jnrA;
+             /* load j atom coordinates */
+             gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                               &jx0,&jy0,&jz0);
+             /* Calculate displacement vector */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             dx10             = _fjsp_sub_v2r8(ix1,jx0);
+             dy10             = _fjsp_sub_v2r8(iy1,jy0);
+             dz10             = _fjsp_sub_v2r8(iz1,jz0);
+             dx20             = _fjsp_sub_v2r8(ix2,jx0);
+             dy20             = _fjsp_sub_v2r8(iy2,jy0);
+             dz20             = _fjsp_sub_v2r8(iz2,jz0);
+             /* Calculate squared distance and things based on it */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+             rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+             rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+             rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+             rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+             rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+             rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+             rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+             /* Load parameters for j particles */
+             jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
+             vdwjidx0A        = 2*vdwtype[jnrA+0];
+             fjx0             = _fjsp_setzero_v2r8();
+             fjy0             = _fjsp_setzero_v2r8();
+             fjz0             = _fjsp_setzero_v2r8();
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+             /* Compute parameters for interactions between i and j atoms */
+             qq00             = _fjsp_mul_v2r8(iq0,jq0);
+             gmx_fjsp_load_1pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
+             /* Calculate table index by multiplying r with table scale and truncate to integer */
+             rt               = _fjsp_mul_v2r8(r00,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 8;
+             vfconv.i[1]     *= 8; /* scaled like i[0], but only vfconv.i[0] is used for table loads in this block */
+             /* REACTION-FIELD ELECTROSTATICS */
+             felec            = _fjsp_mul_v2r8(qq00,_fjsp_msub_v2r8(rinv00,rinvsq00,krf2));
+             /* CUBIC SPLINE TABLE DISPERSION: zero registers take the place of the second j entry's table data */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 2 );
+             H                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+             FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+             fvdw6            = _fjsp_mul_v2r8(c6_00,FF);
+             /* CUBIC SPLINE TABLE REPULSION */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 4 );
+             F                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 6 );
+             H                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+             FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+             fvdw12           = _fjsp_mul_v2r8(c12_00,FF);
+             fvdw             = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_add_v2r8(fvdw6,fvdw12),_fjsp_mul_v2r8(vftabscale,rinv00)));
+             fscal            = _fjsp_add_v2r8(felec,fvdw);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8()); /* presumably zeroes the half with no j entry so it adds no force - confirm in kernelutil header */
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             
+             fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* Compute parameters for interactions between i and j atoms */
+             qq10             = _fjsp_mul_v2r8(iq1,jq0);
+             /* REACTION-FIELD ELECTROSTATICS */
+             felec            = _fjsp_mul_v2r8(qq10,_fjsp_msub_v2r8(rinv10,rinvsq10,krf2));
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+             
+             fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* Compute parameters for interactions between i and j atoms */
+             qq20             = _fjsp_mul_v2r8(iq2,jq0);
+             /* REACTION-FIELD ELECTROSTATICS */
+             felec            = _fjsp_mul_v2r8(qq20,_fjsp_msub_v2r8(rinv20,rinvsq20,krf2));
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+             
+             fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+             gmx_fjsp_decrement_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0);
+             /* Inner loop uses 120 flops */
+         }
+         /* End of innermost loop: hand the nine i-force accumulators to the helper with this entry's force and shift-force slots */
+         gmx_fjsp_update_iforce_3atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                               f+i_coord_offset,fshift+i_shift_offset);
+         /* Increment number of inner iterations (counted per j entry, not per unrolled pass) */
+         inneriter                  += j_index_end - j_index_start;
+         /* Outer loop uses 18 flops */
+     }
+     /* Increment number of outer iterations */
+     outeriter        += nri;
+     /* Update outer/inner flops, using the per-iteration counts quoted in the loop comments above */
+     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3_F,outeriter*18 + inneriter*120);
+ }
index 0000000000000000000000000000000000000000,b3144168d610058146e4cedefaadc732ba99053a..b3144168d610058146e4cedefaadc732ba99053a
mode 000000,100644..100644
--- /dev/null
@@@ -1,0 -1,1659 +1,1659 @@@
+ /*
+  * This file is part of the GROMACS molecular simulation package.
+  *
+  * Copyright (c) 2012, by the GROMACS development team, led by
+  * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+  * others, as listed in the AUTHORS file in the top-level source
+  * directory and at http://www.gromacs.org.
+  *
+  * GROMACS is free software; you can redistribute it and/or
+  * modify it under the terms of the GNU Lesser General Public License
+  * as published by the Free Software Foundation; either version 2.1
+  * of the License, or (at your option) any later version.
+  *
+  * GROMACS is distributed in the hope that it will be useful,
+  * but WITHOUT ANY WARRANTY; without even the implied warranty of
+  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  * Lesser General Public License for more details.
+  *
+  * You should have received a copy of the GNU Lesser General Public
+  * License along with GROMACS; if not, see
+  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+  *
+  * If you want to redistribute modifications to GROMACS, please
+  * consider that scientific software is very special. Version
+  * control is crucial - bugs must be traceable. We will be happy to
+  * consider code for inclusion in the official distribution, but
+  * derived work must not be called official GROMACS. Details are found
+  * in the README & COPYING files - if they are missing, get the
+  * official version at http://www.gromacs.org.
+  *
+  * To help us fund GROMACS development, we humbly ask that you cite
+  * the research papers on the package. Check out http://www.gromacs.org.
+  */
+ /*
+  * Note: this file was generated by the GROMACS sparc64_hpc_ace_double kernel generator.
+  */
+ #ifdef HAVE_CONFIG_H
+ #include <config.h>
+ #endif
+ #include <math.h>
+ #include "../nb_kernel.h"
+ #include "types/simple.h"
+ #include "vec.h"
+ #include "nrnb.h"
+ #include "kernelutil_sparc64_hpc_ace_double.h"
+ /*
+  * Gromacs nonbonded kernel:   nb_kernel_ElecRF_VdwCSTab_GeomW3W3_VF_sparc64_hpc_ace_double
+  * Electrostatics interaction: ReactionField
+  * VdW interaction:            CubicSplineTable
+  * Geometry:                   Water3-Water3
+  * Calculate force/pot:        PotentialAndForce
+  */
+ void
+ nb_kernel_ElecRF_VdwCSTab_GeomW3W3_VF_sparc64_hpc_ace_double
+                     (t_nblist * gmx_restrict                nlist,
+                      rvec * gmx_restrict                    xx,
+                      rvec * gmx_restrict                    ff,
+                      t_forcerec * gmx_restrict              fr,
+                      t_mdatoms * gmx_restrict               mdatoms,
+                      nb_kernel_data_t * gmx_restrict        kernel_data,
+                      t_nrnb * gmx_restrict                  nrnb)
+ {
+     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+      * just 0 for non-waters.
+      * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+      * jnr indices corresponding to data put in the four positions in the SIMD register.
+      */
+     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+     int              jnrA,jnrB;
+     int              j_coord_offsetA,j_coord_offsetB;
+     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+     real             rcutoff_scalar;
+     real             *shiftvec,*fshift,*x,*f;
+     _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+     int              vdwioffset0;
+     _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+     int              vdwioffset1;
+     _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+     int              vdwioffset2;
+     _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+     int              vdwjidx0A,vdwjidx0B;
+     _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+     int              vdwjidx1A,vdwjidx1B;
+     _fjsp_v2r8       jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+     int              vdwjidx2A,vdwjidx2B;
+     _fjsp_v2r8       jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+     _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+     _fjsp_v2r8       dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
+     _fjsp_v2r8       dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
+     _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+     _fjsp_v2r8       dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+     _fjsp_v2r8       dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+     _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+     _fjsp_v2r8       dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+     _fjsp_v2r8       dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+     _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+     real             *charge;
+     int              nvdwtype;
+     _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+     int              *vdwtype;
+     real             *vdwparam;
+     _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+     _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+     _fjsp_v2r8       rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF,twovfeps;
+     real             *vftab;
+     _fjsp_v2r8       itab_tmp;
+     _fjsp_v2r8       dummy_mask,cutoff_mask;
+     _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+     _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+     union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+     x                = xx[0];
+     f                = ff[0];
+     nri              = nlist->nri;
+     iinr             = nlist->iinr;
+     jindex           = nlist->jindex;
+     jjnr             = nlist->jjnr;
+     shiftidx         = nlist->shift;
+     gid              = nlist->gid;
+     shiftvec         = fr->shift_vec[0];
+     fshift           = fr->fshift[0];
+     facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+     charge           = mdatoms->chargeA;
+     krf              = gmx_fjsp_set1_v2r8(fr->ic->k_rf);
+     krf2             = gmx_fjsp_set1_v2r8(fr->ic->k_rf*2.0);
+     crf              = gmx_fjsp_set1_v2r8(fr->ic->c_rf);
+     nvdwtype         = fr->ntype;
+     vdwparam         = fr->nbfp;
+     vdwtype          = mdatoms->typeA;
+     vftab            = kernel_data->table_vdw->data;
+     vftabscale       = gmx_fjsp_set1_v2r8(kernel_data->table_vdw->scale);
+     /* Setup water-specific parameters */
+     inr              = nlist->iinr[0];
+     iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+0]));
+     iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+     iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+     vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+     jq0              = gmx_fjsp_set1_v2r8(charge[inr+0]);
+     jq1              = gmx_fjsp_set1_v2r8(charge[inr+1]);
+     jq2              = gmx_fjsp_set1_v2r8(charge[inr+2]);
+     vdwjidx0A        = 2*vdwtype[inr+0];
+     qq00             = _fjsp_mul_v2r8(iq0,jq0);
+     c6_00            = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset0+vdwjidx0A]);
+     c12_00           = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset0+vdwjidx0A+1]);
+     qq01             = _fjsp_mul_v2r8(iq0,jq1);
+     qq02             = _fjsp_mul_v2r8(iq0,jq2);
+     qq10             = _fjsp_mul_v2r8(iq1,jq0);
+     qq11             = _fjsp_mul_v2r8(iq1,jq1);
+     qq12             = _fjsp_mul_v2r8(iq1,jq2);
+     qq20             = _fjsp_mul_v2r8(iq2,jq0);
+     qq21             = _fjsp_mul_v2r8(iq2,jq1);
+     qq22             = _fjsp_mul_v2r8(iq2,jq2);
+     /* Avoid stupid compiler warnings */
+     jnrA = jnrB = 0;
+     j_coord_offsetA = 0;
+     j_coord_offsetB = 0;
+     outeriter        = 0;
+     inneriter        = 0;
+     /* Start outer loop over neighborlists */
+     for(iidx=0; iidx<nri; iidx++)
+     {
+         /* Load shift vector for this list */
+         i_shift_offset   = DIM*shiftidx[iidx];
+         /* Load limits for loop over neighbors */
+         j_index_start    = jindex[iidx];
+         j_index_end      = jindex[iidx+1];
+         /* Get outer coordinate index */
+         inr              = iinr[iidx];
+         i_coord_offset   = DIM*inr;
+         /* Load i particle coords and add shift vector */
+         gmx_fjsp_load_shift_and_3rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                  &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+         fix0             = _fjsp_setzero_v2r8();
+         fiy0             = _fjsp_setzero_v2r8();
+         fiz0             = _fjsp_setzero_v2r8();
+         fix1             = _fjsp_setzero_v2r8();
+         fiy1             = _fjsp_setzero_v2r8();
+         fiz1             = _fjsp_setzero_v2r8();
+         fix2             = _fjsp_setzero_v2r8();
+         fiy2             = _fjsp_setzero_v2r8();
+         fiz2             = _fjsp_setzero_v2r8();
+         /* Reset potential sums */
+         velecsum         = _fjsp_setzero_v2r8();
+         vvdwsum          = _fjsp_setzero_v2r8();
+         /* Start inner kernel loop */
+         for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+         {
+             /* Get j neighbor index, and coordinate index */
+             jnrA             = jjnr[jidx];
+             jnrB             = jjnr[jidx+1];
+             j_coord_offsetA  = DIM*jnrA;
+             j_coord_offsetB  = DIM*jnrB;
+             /* load j atom coordinates */
+             gmx_fjsp_load_3rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+             /* Calculate displacement vector */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             dx01             = _fjsp_sub_v2r8(ix0,jx1);
+             dy01             = _fjsp_sub_v2r8(iy0,jy1);
+             dz01             = _fjsp_sub_v2r8(iz0,jz1);
+             dx02             = _fjsp_sub_v2r8(ix0,jx2);
+             dy02             = _fjsp_sub_v2r8(iy0,jy2);
+             dz02             = _fjsp_sub_v2r8(iz0,jz2);
+             dx10             = _fjsp_sub_v2r8(ix1,jx0);
+             dy10             = _fjsp_sub_v2r8(iy1,jy0);
+             dz10             = _fjsp_sub_v2r8(iz1,jz0);
+             dx11             = _fjsp_sub_v2r8(ix1,jx1);
+             dy11             = _fjsp_sub_v2r8(iy1,jy1);
+             dz11             = _fjsp_sub_v2r8(iz1,jz1);
+             dx12             = _fjsp_sub_v2r8(ix1,jx2);
+             dy12             = _fjsp_sub_v2r8(iy1,jy2);
+             dz12             = _fjsp_sub_v2r8(iz1,jz2);
+             dx20             = _fjsp_sub_v2r8(ix2,jx0);
+             dy20             = _fjsp_sub_v2r8(iy2,jy0);
+             dz20             = _fjsp_sub_v2r8(iz2,jz0);
+             dx21             = _fjsp_sub_v2r8(ix2,jx1);
+             dy21             = _fjsp_sub_v2r8(iy2,jy1);
+             dz21             = _fjsp_sub_v2r8(iz2,jz1);
+             dx22             = _fjsp_sub_v2r8(ix2,jx2);
+             dy22             = _fjsp_sub_v2r8(iy2,jy2);
+             dz22             = _fjsp_sub_v2r8(iz2,jz2);
+             /* Calculate squared distance and things based on it */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rsq01            = gmx_fjsp_calc_rsq_v2r8(dx01,dy01,dz01);
+             rsq02            = gmx_fjsp_calc_rsq_v2r8(dx02,dy02,dz02);
+             rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+             rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+             rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+             rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+             rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+             rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+             rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+             rinv01           = gmx_fjsp_invsqrt_v2r8(rsq01);
+             rinv02           = gmx_fjsp_invsqrt_v2r8(rsq02);
+             rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+             rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+             rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+             rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+             rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+             rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+             rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+             rinvsq01         = _fjsp_mul_v2r8(rinv01,rinv01);
+             rinvsq02         = _fjsp_mul_v2r8(rinv02,rinv02);
+             rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+             rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+             rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+             rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+             rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+             rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+             fjx0             = _fjsp_setzero_v2r8();
+             fjy0             = _fjsp_setzero_v2r8();
+             fjz0             = _fjsp_setzero_v2r8();
+             fjx1             = _fjsp_setzero_v2r8();
+             fjy1             = _fjsp_setzero_v2r8();
+             fjz1             = _fjsp_setzero_v2r8();
+             fjx2             = _fjsp_setzero_v2r8();
+             fjy2             = _fjsp_setzero_v2r8();
+             fjz2             = _fjsp_setzero_v2r8();
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+             /* Calculate table index by multiplying r with table scale and truncate to integer */
+             rt               = _fjsp_mul_v2r8(r00,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 8;
+             vfconv.i[1]     *= 8;
+             /* REACTION-FIELD ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq00,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq00,rinv00),crf));
+             felec            = _fjsp_mul_v2r8(qq00,_fjsp_msub_v2r8(rinv00,rinvsq00,krf2));
+             /* CUBIC SPLINE TABLE DISPERSION */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 2 );
+             H                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 2 );
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+             VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+             vvdw6            = _fjsp_mul_v2r8(c6_00,VV);
+             FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+             fvdw6            = _fjsp_mul_v2r8(c6_00,FF);
+             /* CUBIC SPLINE TABLE REPULSION */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 4 );
+             F                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 4 );
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 6 );
+             H                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 6 );
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+             VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+             vvdw12           = _fjsp_mul_v2r8(c12_00,VV);
+             FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+             fvdw12           = _fjsp_mul_v2r8(c12_00,FF);
+             vvdw             = _fjsp_add_v2r8(vvdw12,vvdw6);
+             fvdw             = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_add_v2r8(fvdw6,fvdw12),_fjsp_mul_v2r8(vftabscale,rinv00)));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+             fscal            = _fjsp_add_v2r8(felec,fvdw);
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             
+             fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* REACTION-FIELD ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq01,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq01,rinv01),crf));
+             felec            = _fjsp_mul_v2r8(qq01,_fjsp_msub_v2r8(rinv01,rinvsq01,krf2));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx01,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy01,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz01,fscal,fiz0);
+             
+             fjx1             = _fjsp_madd_v2r8(dx01,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy01,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz01,fscal,fjz1);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* REACTION-FIELD ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq02,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq02,rinv02),crf));
+             felec            = _fjsp_mul_v2r8(qq02,_fjsp_msub_v2r8(rinv02,rinvsq02,krf2));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx02,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy02,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz02,fscal,fiz0);
+             
+             fjx2             = _fjsp_madd_v2r8(dx02,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy02,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz02,fscal,fjz2);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* REACTION-FIELD ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq10,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq10,rinv10),crf));
+             felec            = _fjsp_mul_v2r8(qq10,_fjsp_msub_v2r8(rinv10,rinvsq10,krf2));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+             
+             fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* REACTION-FIELD ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq11,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq11,rinv11),crf));
+             felec            = _fjsp_mul_v2r8(qq11,_fjsp_msub_v2r8(rinv11,rinvsq11,krf2));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+             
+             fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* REACTION-FIELD ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq12,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq12,rinv12),crf));
+             felec            = _fjsp_mul_v2r8(qq12,_fjsp_msub_v2r8(rinv12,rinvsq12,krf2));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+             
+             fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* REACTION-FIELD ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq20,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq20,rinv20),crf));
+             felec            = _fjsp_mul_v2r8(qq20,_fjsp_msub_v2r8(rinv20,rinvsq20,krf2));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+             
+             fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* REACTION-FIELD ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq21,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq21,rinv21),crf));
+             felec            = _fjsp_mul_v2r8(qq21,_fjsp_msub_v2r8(rinv21,rinvsq21,krf2));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+             
+             fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* REACTION-FIELD ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq22,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq22,rinv22),crf));
+             felec            = _fjsp_mul_v2r8(qq22,_fjsp_msub_v2r8(rinv22,rinvsq22,krf2));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+             
+             fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+             gmx_fjsp_decrement_3rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+             /* Inner loop uses 350 flops */
+         }
+         if(jidx<j_index_end)
+         {
+             jnrA             = jjnr[jidx];
+             j_coord_offsetA  = DIM*jnrA;
+             /* load j atom coordinates */
+             gmx_fjsp_load_3rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+             /* Calculate displacement vector */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             dx01             = _fjsp_sub_v2r8(ix0,jx1);
+             dy01             = _fjsp_sub_v2r8(iy0,jy1);
+             dz01             = _fjsp_sub_v2r8(iz0,jz1);
+             dx02             = _fjsp_sub_v2r8(ix0,jx2);
+             dy02             = _fjsp_sub_v2r8(iy0,jy2);
+             dz02             = _fjsp_sub_v2r8(iz0,jz2);
+             dx10             = _fjsp_sub_v2r8(ix1,jx0);
+             dy10             = _fjsp_sub_v2r8(iy1,jy0);
+             dz10             = _fjsp_sub_v2r8(iz1,jz0);
+             dx11             = _fjsp_sub_v2r8(ix1,jx1);
+             dy11             = _fjsp_sub_v2r8(iy1,jy1);
+             dz11             = _fjsp_sub_v2r8(iz1,jz1);
+             dx12             = _fjsp_sub_v2r8(ix1,jx2);
+             dy12             = _fjsp_sub_v2r8(iy1,jy2);
+             dz12             = _fjsp_sub_v2r8(iz1,jz2);
+             dx20             = _fjsp_sub_v2r8(ix2,jx0);
+             dy20             = _fjsp_sub_v2r8(iy2,jy0);
+             dz20             = _fjsp_sub_v2r8(iz2,jz0);
+             dx21             = _fjsp_sub_v2r8(ix2,jx1);
+             dy21             = _fjsp_sub_v2r8(iy2,jy1);
+             dz21             = _fjsp_sub_v2r8(iz2,jz1);
+             dx22             = _fjsp_sub_v2r8(ix2,jx2);
+             dy22             = _fjsp_sub_v2r8(iy2,jy2);
+             dz22             = _fjsp_sub_v2r8(iz2,jz2);
+             /* Calculate squared distance and things based on it */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rsq01            = gmx_fjsp_calc_rsq_v2r8(dx01,dy01,dz01);
+             rsq02            = gmx_fjsp_calc_rsq_v2r8(dx02,dy02,dz02);
+             rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+             rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+             rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+             rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+             rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+             rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+             rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+             rinv01           = gmx_fjsp_invsqrt_v2r8(rsq01);
+             rinv02           = gmx_fjsp_invsqrt_v2r8(rsq02);
+             rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+             rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+             rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+             rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+             rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+             rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+             rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+             rinvsq01         = _fjsp_mul_v2r8(rinv01,rinv01);
+             rinvsq02         = _fjsp_mul_v2r8(rinv02,rinv02);
+             rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+             rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+             rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+             rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+             rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+             rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+             fjx0             = _fjsp_setzero_v2r8();
+             fjy0             = _fjsp_setzero_v2r8();
+             fjz0             = _fjsp_setzero_v2r8();
+             fjx1             = _fjsp_setzero_v2r8();
+             fjy1             = _fjsp_setzero_v2r8();
+             fjz1             = _fjsp_setzero_v2r8();
+             fjx2             = _fjsp_setzero_v2r8();
+             fjy2             = _fjsp_setzero_v2r8();
+             fjz2             = _fjsp_setzero_v2r8();
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+             /* Calculate table index by multiplying r with table scale and truncate to integer */
+             rt               = _fjsp_mul_v2r8(r00,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 8;
+             vfconv.i[1]     *= 8;
+             /* REACTION-FIELD ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq00,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq00,rinv00),crf));
+             felec            = _fjsp_mul_v2r8(qq00,_fjsp_msub_v2r8(rinv00,rinvsq00,krf2));
+             /* CUBIC SPLINE TABLE DISPERSION */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 2 );
+             H                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+             VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+             vvdw6            = _fjsp_mul_v2r8(c6_00,VV);
+             FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+             fvdw6            = _fjsp_mul_v2r8(c6_00,FF);
+             /* CUBIC SPLINE TABLE REPULSION */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 4 );
+             F                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 6 );
+             H                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+             VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+             vvdw12           = _fjsp_mul_v2r8(c12_00,VV);
+             FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+             fvdw12           = _fjsp_mul_v2r8(c12_00,FF);
+             vvdw             = _fjsp_add_v2r8(vvdw12,vvdw6);
+             fvdw             = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_add_v2r8(fvdw6,fvdw12),_fjsp_mul_v2r8(vftabscale,rinv00)));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             vvdw             = _fjsp_unpacklo_v2r8(vvdw,_fjsp_setzero_v2r8());
+             vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+             fscal            = _fjsp_add_v2r8(felec,fvdw);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             
+             fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* REACTION-FIELD ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq01,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq01,rinv01),crf));
+             felec            = _fjsp_mul_v2r8(qq01,_fjsp_msub_v2r8(rinv01,rinvsq01,krf2));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx01,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy01,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz01,fscal,fiz0);
+             
+             fjx1             = _fjsp_madd_v2r8(dx01,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy01,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz01,fscal,fjz1);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* REACTION-FIELD ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq02,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq02,rinv02),crf));
+             felec            = _fjsp_mul_v2r8(qq02,_fjsp_msub_v2r8(rinv02,rinvsq02,krf2));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx02,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy02,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz02,fscal,fiz0);
+             
+             fjx2             = _fjsp_madd_v2r8(dx02,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy02,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz02,fscal,fjz2);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* REACTION-FIELD ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq10,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq10,rinv10),crf));
+             felec            = _fjsp_mul_v2r8(qq10,_fjsp_msub_v2r8(rinv10,rinvsq10,krf2));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+             
+             fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* REACTION-FIELD ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq11,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq11,rinv11),crf));
+             felec            = _fjsp_mul_v2r8(qq11,_fjsp_msub_v2r8(rinv11,rinvsq11,krf2));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+             
+             fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* REACTION-FIELD ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq12,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq12,rinv12),crf));
+             felec            = _fjsp_mul_v2r8(qq12,_fjsp_msub_v2r8(rinv12,rinvsq12,krf2));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+             
+             fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* REACTION-FIELD ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq20,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq20,rinv20),crf));
+             felec            = _fjsp_mul_v2r8(qq20,_fjsp_msub_v2r8(rinv20,rinvsq20,krf2));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+             
+             fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* REACTION-FIELD ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq21,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq21,rinv21),crf));
+             felec            = _fjsp_mul_v2r8(qq21,_fjsp_msub_v2r8(rinv21,rinvsq21,krf2));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+             
+             fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* REACTION-FIELD ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq22,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq22,rinv22),crf));
+             felec            = _fjsp_mul_v2r8(qq22,_fjsp_msub_v2r8(rinv22,rinvsq22,krf2));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+             
+             fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+             gmx_fjsp_decrement_3rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+             /* Inner loop uses 350 flops */
+         }
+         /* End of innermost loop */
+         gmx_fjsp_update_iforce_3atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                               f+i_coord_offset,fshift+i_shift_offset);
+         ggid                        = gid[iidx];
+         /* Update potential energies */
+         gmx_fjsp_update_1pot_v2r8(velecsum,kernel_data->energygrp_elec+ggid);
+         gmx_fjsp_update_1pot_v2r8(vvdwsum,kernel_data->energygrp_vdw+ggid);
+         /* Increment number of inner iterations */
+         inneriter                  += j_index_end - j_index_start;
+         /* Outer loop uses 20 flops */
+     }
+     /* Increment number of outer iterations */
+     outeriter        += nri;
+     /* Update outer/inner flops */
+     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_VF,outeriter*20 + inneriter*350);
+ }
+ /*
+  * Gromacs nonbonded kernel:   nb_kernel_ElecRF_VdwCSTab_GeomW3W3_F_sparc64_hpc_ace_double
+  * Electrostatics interaction: ReactionField
+  * VdW interaction:            CubicSplineTable
+  * Geometry:                   Water3-Water3
+  * Calculate force/pot:        Force
+  */
+ void
+ nb_kernel_ElecRF_VdwCSTab_GeomW3W3_F_sparc64_hpc_ace_double
+                     (t_nblist * gmx_restrict                nlist,
+                      rvec * gmx_restrict                    xx,
+                      rvec * gmx_restrict                    ff,
+                      t_forcerec * gmx_restrict              fr,
+                      t_mdatoms * gmx_restrict               mdatoms,
+                      nb_kernel_data_t * gmx_restrict        kernel_data,
+                      t_nrnb * gmx_restrict                  nrnb)
+ {
+     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+      * just 0 for non-waters.
+      * Suffixes A,B refer to j loop unrolling done with double precision SIMD, i.e. the two different
+      * jnr indices corresponding to data put in the two positions in the SIMD register.
+      */
+     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+     int              jnrA,jnrB;
+     int              j_coord_offsetA,j_coord_offsetB;
+     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+     real             rcutoff_scalar;
+     real             *shiftvec,*fshift,*x,*f;
+     _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+     int              vdwioffset0;
+     _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+     int              vdwioffset1;
+     _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+     int              vdwioffset2;
+     _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+     int              vdwjidx0A,vdwjidx0B;
+     _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+     int              vdwjidx1A,vdwjidx1B;
+     _fjsp_v2r8       jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+     int              vdwjidx2A,vdwjidx2B;
+     _fjsp_v2r8       jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+     _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+     _fjsp_v2r8       dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
+     _fjsp_v2r8       dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
+     _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+     _fjsp_v2r8       dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+     _fjsp_v2r8       dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+     _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+     _fjsp_v2r8       dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+     _fjsp_v2r8       dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+     _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+     real             *charge;
+     int              nvdwtype;
+     _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+     int              *vdwtype;
+     real             *vdwparam;
+     _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+     _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+     _fjsp_v2r8       rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF,twovfeps;
+     real             *vftab;
+     _fjsp_v2r8       itab_tmp;
+     _fjsp_v2r8       dummy_mask,cutoff_mask;
+     _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+     _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+     union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+     x                = xx[0];
+     f                = ff[0];
+     nri              = nlist->nri;
+     iinr             = nlist->iinr;
+     jindex           = nlist->jindex;
+     jjnr             = nlist->jjnr;
+     shiftidx         = nlist->shift;
+     gid              = nlist->gid;
+     shiftvec         = fr->shift_vec[0];
+     fshift           = fr->fshift[0];
+     facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+     charge           = mdatoms->chargeA;
+     krf              = gmx_fjsp_set1_v2r8(fr->ic->k_rf);
+     krf2             = gmx_fjsp_set1_v2r8(fr->ic->k_rf*2.0);
+     crf              = gmx_fjsp_set1_v2r8(fr->ic->c_rf);
+     nvdwtype         = fr->ntype;
+     vdwparam         = fr->nbfp;
+     vdwtype          = mdatoms->typeA;
+     vftab            = kernel_data->table_vdw->data;
+     vftabscale       = gmx_fjsp_set1_v2r8(kernel_data->table_vdw->scale);
+     /* Setup water-specific parameters */
+     inr              = nlist->iinr[0];
+     iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+0]));
+     iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+     iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+     vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+     jq0              = gmx_fjsp_set1_v2r8(charge[inr+0]);
+     jq1              = gmx_fjsp_set1_v2r8(charge[inr+1]);
+     jq2              = gmx_fjsp_set1_v2r8(charge[inr+2]);
+     vdwjidx0A        = 2*vdwtype[inr+0];
+     qq00             = _fjsp_mul_v2r8(iq0,jq0);
+     c6_00            = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset0+vdwjidx0A]);
+     c12_00           = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset0+vdwjidx0A+1]);
+     qq01             = _fjsp_mul_v2r8(iq0,jq1);
+     qq02             = _fjsp_mul_v2r8(iq0,jq2);
+     qq10             = _fjsp_mul_v2r8(iq1,jq0);
+     qq11             = _fjsp_mul_v2r8(iq1,jq1);
+     qq12             = _fjsp_mul_v2r8(iq1,jq2);
+     qq20             = _fjsp_mul_v2r8(iq2,jq0);
+     qq21             = _fjsp_mul_v2r8(iq2,jq1);
+     qq22             = _fjsp_mul_v2r8(iq2,jq2);
+     /* Avoid stupid compiler warnings */
+     jnrA = jnrB = 0;
+     j_coord_offsetA = 0;
+     j_coord_offsetB = 0;
+     outeriter        = 0;
+     inneriter        = 0;
+     /* Start outer loop over neighborlists */
+     for(iidx=0; iidx<nri; iidx++)
+     {
+         /* Load shift vector for this list */
+         i_shift_offset   = DIM*shiftidx[iidx];
+         /* Load limits for loop over neighbors */
+         j_index_start    = jindex[iidx];
+         j_index_end      = jindex[iidx+1];
+         /* Get outer coordinate index */
+         inr              = iinr[iidx];
+         i_coord_offset   = DIM*inr;
+         /* Load i particle coords and add shift vector */
+         gmx_fjsp_load_shift_and_3rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                  &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+         fix0             = _fjsp_setzero_v2r8();
+         fiy0             = _fjsp_setzero_v2r8();
+         fiz0             = _fjsp_setzero_v2r8();
+         fix1             = _fjsp_setzero_v2r8();
+         fiy1             = _fjsp_setzero_v2r8();
+         fiz1             = _fjsp_setzero_v2r8();
+         fix2             = _fjsp_setzero_v2r8();
+         fiy2             = _fjsp_setzero_v2r8();
+         fiz2             = _fjsp_setzero_v2r8();
+         /* Start inner kernel loop */
+         for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+         {
+             /* Get j neighbor index, and coordinate index */
+             jnrA             = jjnr[jidx];
+             jnrB             = jjnr[jidx+1];
+             j_coord_offsetA  = DIM*jnrA;
+             j_coord_offsetB  = DIM*jnrB;
+             /* load j atom coordinates */
+             gmx_fjsp_load_3rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+             /* Calculate displacement vector */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             dx01             = _fjsp_sub_v2r8(ix0,jx1);
+             dy01             = _fjsp_sub_v2r8(iy0,jy1);
+             dz01             = _fjsp_sub_v2r8(iz0,jz1);
+             dx02             = _fjsp_sub_v2r8(ix0,jx2);
+             dy02             = _fjsp_sub_v2r8(iy0,jy2);
+             dz02             = _fjsp_sub_v2r8(iz0,jz2);
+             dx10             = _fjsp_sub_v2r8(ix1,jx0);
+             dy10             = _fjsp_sub_v2r8(iy1,jy0);
+             dz10             = _fjsp_sub_v2r8(iz1,jz0);
+             dx11             = _fjsp_sub_v2r8(ix1,jx1);
+             dy11             = _fjsp_sub_v2r8(iy1,jy1);
+             dz11             = _fjsp_sub_v2r8(iz1,jz1);
+             dx12             = _fjsp_sub_v2r8(ix1,jx2);
+             dy12             = _fjsp_sub_v2r8(iy1,jy2);
+             dz12             = _fjsp_sub_v2r8(iz1,jz2);
+             dx20             = _fjsp_sub_v2r8(ix2,jx0);
+             dy20             = _fjsp_sub_v2r8(iy2,jy0);
+             dz20             = _fjsp_sub_v2r8(iz2,jz0);
+             dx21             = _fjsp_sub_v2r8(ix2,jx1);
+             dy21             = _fjsp_sub_v2r8(iy2,jy1);
+             dz21             = _fjsp_sub_v2r8(iz2,jz1);
+             dx22             = _fjsp_sub_v2r8(ix2,jx2);
+             dy22             = _fjsp_sub_v2r8(iy2,jy2);
+             dz22             = _fjsp_sub_v2r8(iz2,jz2);
+             /* Calculate squared distance and things based on it */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rsq01            = gmx_fjsp_calc_rsq_v2r8(dx01,dy01,dz01);
+             rsq02            = gmx_fjsp_calc_rsq_v2r8(dx02,dy02,dz02);
+             rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+             rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+             rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+             rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+             rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+             rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+             rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+             rinv01           = gmx_fjsp_invsqrt_v2r8(rsq01);
+             rinv02           = gmx_fjsp_invsqrt_v2r8(rsq02);
+             rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+             rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+             rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+             rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+             rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+             rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+             rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+             rinvsq01         = _fjsp_mul_v2r8(rinv01,rinv01);
+             rinvsq02         = _fjsp_mul_v2r8(rinv02,rinv02);
+             rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+             rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+             rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+             rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+             rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+             rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+             fjx0             = _fjsp_setzero_v2r8();
+             fjy0             = _fjsp_setzero_v2r8();
+             fjz0             = _fjsp_setzero_v2r8();
+             fjx1             = _fjsp_setzero_v2r8();
+             fjy1             = _fjsp_setzero_v2r8();
+             fjz1             = _fjsp_setzero_v2r8();
+             fjx2             = _fjsp_setzero_v2r8();
+             fjy2             = _fjsp_setzero_v2r8();
+             fjz2             = _fjsp_setzero_v2r8();
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+             /* Calculate table index by multiplying r with table scale and truncate to integer */
+             rt               = _fjsp_mul_v2r8(r00,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 8;
+             vfconv.i[1]     *= 8;
+             /* REACTION-FIELD ELECTROSTATICS */
+             felec            = _fjsp_mul_v2r8(qq00,_fjsp_msub_v2r8(rinv00,rinvsq00,krf2));
+             /* CUBIC SPLINE TABLE DISPERSION */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 2 );
+             H                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 2 );
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+             FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+             fvdw6            = _fjsp_mul_v2r8(c6_00,FF);
+             /* CUBIC SPLINE TABLE REPULSION */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 4 );
+             F                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 4 );
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 6 );
+             H                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 6 );
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+             FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+             fvdw12           = _fjsp_mul_v2r8(c12_00,FF);
+             fvdw             = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_add_v2r8(fvdw6,fvdw12),_fjsp_mul_v2r8(vftabscale,rinv00)));
+             fscal            = _fjsp_add_v2r8(felec,fvdw);
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             
+             fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* REACTION-FIELD ELECTROSTATICS */
+             felec            = _fjsp_mul_v2r8(qq01,_fjsp_msub_v2r8(rinv01,rinvsq01,krf2));
+             fscal            = felec;
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx01,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy01,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz01,fscal,fiz0);
+             
+             fjx1             = _fjsp_madd_v2r8(dx01,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy01,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz01,fscal,fjz1);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* REACTION-FIELD ELECTROSTATICS */
+             felec            = _fjsp_mul_v2r8(qq02,_fjsp_msub_v2r8(rinv02,rinvsq02,krf2));
+             fscal            = felec;
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx02,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy02,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz02,fscal,fiz0);
+             
+             fjx2             = _fjsp_madd_v2r8(dx02,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy02,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz02,fscal,fjz2);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* REACTION-FIELD ELECTROSTATICS */
+             felec            = _fjsp_mul_v2r8(qq10,_fjsp_msub_v2r8(rinv10,rinvsq10,krf2));
+             fscal            = felec;
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+             
+             fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* REACTION-FIELD ELECTROSTATICS */
+             felec            = _fjsp_mul_v2r8(qq11,_fjsp_msub_v2r8(rinv11,rinvsq11,krf2));
+             fscal            = felec;
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+             
+             fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* REACTION-FIELD ELECTROSTATICS */
+             felec            = _fjsp_mul_v2r8(qq12,_fjsp_msub_v2r8(rinv12,rinvsq12,krf2));
+             fscal            = felec;
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+             
+             fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* REACTION-FIELD ELECTROSTATICS */
+             felec            = _fjsp_mul_v2r8(qq20,_fjsp_msub_v2r8(rinv20,rinvsq20,krf2));
+             fscal            = felec;
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+             
+             fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* REACTION-FIELD ELECTROSTATICS */
+             felec            = _fjsp_mul_v2r8(qq21,_fjsp_msub_v2r8(rinv21,rinvsq21,krf2));
+             fscal            = felec;
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+             
+             fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* REACTION-FIELD ELECTROSTATICS */
+             felec            = _fjsp_mul_v2r8(qq22,_fjsp_msub_v2r8(rinv22,rinvsq22,krf2));
+             fscal            = felec;
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+             
+             fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+             gmx_fjsp_decrement_3rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+             /* Inner loop uses 297 flops */
+         }
+         if(jidx<j_index_end)
+         {
+             jnrA             = jjnr[jidx];
+             j_coord_offsetA  = DIM*jnrA;
+             /* load j atom coordinates */
+             gmx_fjsp_load_3rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+             /* Calculate displacement vector */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             dx01             = _fjsp_sub_v2r8(ix0,jx1);
+             dy01             = _fjsp_sub_v2r8(iy0,jy1);
+             dz01             = _fjsp_sub_v2r8(iz0,jz1);
+             dx02             = _fjsp_sub_v2r8(ix0,jx2);
+             dy02             = _fjsp_sub_v2r8(iy0,jy2);
+             dz02             = _fjsp_sub_v2r8(iz0,jz2);
+             dx10             = _fjsp_sub_v2r8(ix1,jx0);
+             dy10             = _fjsp_sub_v2r8(iy1,jy0);
+             dz10             = _fjsp_sub_v2r8(iz1,jz0);
+             dx11             = _fjsp_sub_v2r8(ix1,jx1);
+             dy11             = _fjsp_sub_v2r8(iy1,jy1);
+             dz11             = _fjsp_sub_v2r8(iz1,jz1);
+             dx12             = _fjsp_sub_v2r8(ix1,jx2);
+             dy12             = _fjsp_sub_v2r8(iy1,jy2);
+             dz12             = _fjsp_sub_v2r8(iz1,jz2);
+             dx20             = _fjsp_sub_v2r8(ix2,jx0);
+             dy20             = _fjsp_sub_v2r8(iy2,jy0);
+             dz20             = _fjsp_sub_v2r8(iz2,jz0);
+             dx21             = _fjsp_sub_v2r8(ix2,jx1);
+             dy21             = _fjsp_sub_v2r8(iy2,jy1);
+             dz21             = _fjsp_sub_v2r8(iz2,jz1);
+             dx22             = _fjsp_sub_v2r8(ix2,jx2);
+             dy22             = _fjsp_sub_v2r8(iy2,jy2);
+             dz22             = _fjsp_sub_v2r8(iz2,jz2);
+             /* Calculate squared distance and things based on it */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rsq01            = gmx_fjsp_calc_rsq_v2r8(dx01,dy01,dz01);
+             rsq02            = gmx_fjsp_calc_rsq_v2r8(dx02,dy02,dz02);
+             rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+             rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+             rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+             rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+             rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+             rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+             rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+             rinv01           = gmx_fjsp_invsqrt_v2r8(rsq01);
+             rinv02           = gmx_fjsp_invsqrt_v2r8(rsq02);
+             rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+             rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+             rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+             rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+             rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+             rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+             rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+             rinvsq01         = _fjsp_mul_v2r8(rinv01,rinv01);
+             rinvsq02         = _fjsp_mul_v2r8(rinv02,rinv02);
+             rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+             rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+             rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+             rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+             rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+             rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+             fjx0             = _fjsp_setzero_v2r8();
+             fjy0             = _fjsp_setzero_v2r8();
+             fjz0             = _fjsp_setzero_v2r8();
+             fjx1             = _fjsp_setzero_v2r8();
+             fjy1             = _fjsp_setzero_v2r8();
+             fjz1             = _fjsp_setzero_v2r8();
+             fjx2             = _fjsp_setzero_v2r8();
+             fjy2             = _fjsp_setzero_v2r8();
+             fjz2             = _fjsp_setzero_v2r8();
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+             /* Calculate table index by multiplying r with table scale and truncate to integer */
+             rt               = _fjsp_mul_v2r8(r00,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 8;
+             vfconv.i[1]     *= 8;
+             /* REACTION-FIELD ELECTROSTATICS */
+             felec            = _fjsp_mul_v2r8(qq00,_fjsp_msub_v2r8(rinv00,rinvsq00,krf2));
+             /* CUBIC SPLINE TABLE DISPERSION */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 2 );
+             H                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+             FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+             fvdw6            = _fjsp_mul_v2r8(c6_00,FF);
+             /* CUBIC SPLINE TABLE REPULSION */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 4 );
+             F                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 6 );
+             H                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+             FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+             fvdw12           = _fjsp_mul_v2r8(c12_00,FF);
+             fvdw             = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_add_v2r8(fvdw6,fvdw12),_fjsp_mul_v2r8(vftabscale,rinv00)));
+             fscal            = _fjsp_add_v2r8(felec,fvdw);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             
+             fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* REACTION-FIELD ELECTROSTATICS */
+             felec            = _fjsp_mul_v2r8(qq01,_fjsp_msub_v2r8(rinv01,rinvsq01,krf2));
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx01,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy01,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz01,fscal,fiz0);
+             
+             fjx1             = _fjsp_madd_v2r8(dx01,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy01,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz01,fscal,fjz1);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* REACTION-FIELD ELECTROSTATICS */
+             felec            = _fjsp_mul_v2r8(qq02,_fjsp_msub_v2r8(rinv02,rinvsq02,krf2));
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx02,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy02,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz02,fscal,fiz0);
+             
+             fjx2             = _fjsp_madd_v2r8(dx02,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy02,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz02,fscal,fjz2);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* REACTION-FIELD ELECTROSTATICS */
+             felec            = _fjsp_mul_v2r8(qq10,_fjsp_msub_v2r8(rinv10,rinvsq10,krf2));
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+             
+             fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* REACTION-FIELD ELECTROSTATICS */
+             felec            = _fjsp_mul_v2r8(qq11,_fjsp_msub_v2r8(rinv11,rinvsq11,krf2));
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+             
+             fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* REACTION-FIELD ELECTROSTATICS */
+             felec            = _fjsp_mul_v2r8(qq12,_fjsp_msub_v2r8(rinv12,rinvsq12,krf2));
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+             
+             fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* REACTION-FIELD ELECTROSTATICS */
+             felec            = _fjsp_mul_v2r8(qq20,_fjsp_msub_v2r8(rinv20,rinvsq20,krf2));
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+             
+             fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* REACTION-FIELD ELECTROSTATICS */
+             felec            = _fjsp_mul_v2r8(qq21,_fjsp_msub_v2r8(rinv21,rinvsq21,krf2));
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+             
+             fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* REACTION-FIELD ELECTROSTATICS */
+             felec            = _fjsp_mul_v2r8(qq22,_fjsp_msub_v2r8(rinv22,rinvsq22,krf2));
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+             
+             fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+             gmx_fjsp_decrement_3rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+             /* Inner loop uses 297 flops */
+         }
+         /* End of innermost loop */
+         gmx_fjsp_update_iforce_3atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                               f+i_coord_offset,fshift+i_shift_offset);
+         /* Increment number of inner iterations */
+         inneriter                  += j_index_end - j_index_start;
+         /* Outer loop uses 18 flops */
+     }
+     /* Increment number of outer iterations */
+     outeriter        += nri;
+     /* Update outer/inner flops */
+     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_F,outeriter*18 + inneriter*297);
+ }
index 0000000000000000000000000000000000000000,22d31112a2e194b6b9097fc524ac85e0127fd3be..22d31112a2e194b6b9097fc524ac85e0127fd3be
mode 000000,100644..100644
--- /dev/null
@@@ -1,0 -1,1097 +1,1097 @@@
+ /*
+  * This file is part of the GROMACS molecular simulation package.
+  *
+  * Copyright (c) 2012, by the GROMACS development team, led by
+  * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+  * others, as listed in the AUTHORS file in the top-level source
+  * directory and at http://www.gromacs.org.
+  *
+  * GROMACS is free software; you can redistribute it and/or
+  * modify it under the terms of the GNU Lesser General Public License
+  * as published by the Free Software Foundation; either version 2.1
+  * of the License, or (at your option) any later version.
+  *
+  * GROMACS is distributed in the hope that it will be useful,
+  * but WITHOUT ANY WARRANTY; without even the implied warranty of
+  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  * Lesser General Public License for more details.
+  *
+  * You should have received a copy of the GNU Lesser General Public
+  * License along with GROMACS; if not, see
+  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+  *
+  * If you want to redistribute modifications to GROMACS, please
+  * consider that scientific software is very special. Version
+  * control is crucial - bugs must be traceable. We will be happy to
+  * consider code for inclusion in the official distribution, but
+  * derived work must not be called official GROMACS. Details are found
+  * in the README & COPYING files - if they are missing, get the
+  * official version at http://www.gromacs.org.
+  *
+  * To help us fund GROMACS development, we humbly ask that you cite
+  * the research papers on the package. Check out http://www.gromacs.org.
+  */
+ /*
+  * Note: this file was generated by the GROMACS sparc64_hpc_ace_double kernel generator.
+  */
+ #ifdef HAVE_CONFIG_H
+ #include <config.h>
+ #endif
+ #include <math.h>
+ #include "../nb_kernel.h"
+ #include "types/simple.h"
+ #include "vec.h"
+ #include "nrnb.h"
+ #include "kernelutil_sparc64_hpc_ace_double.h"
+ /*
+  * Gromacs nonbonded kernel:   nb_kernel_ElecRF_VdwCSTab_GeomW4P1_VF_sparc64_hpc_ace_double
+  * Electrostatics interaction: ReactionField
+  * VdW interaction:            CubicSplineTable
+  * Geometry:                   Water4-Particle
+  * Calculate force/pot:        PotentialAndForce
+  *
+  * For each entry of the neighbor list, four i-site positions (suffixes 0-3) are
+  * loaded and paired with every j particle of that entry. Site 0 only gets the
+  * tabulated VdW term; sites 1-3 only get reaction-field electrostatics.
+  * The accumulated i and j forces are handed to the gmx_fjsp_update_iforce_* and
+  * gmx_fjsp_decrement_* helpers together with pointers into ff and fr->fshift, and
+  * the summed potentials are handed to gmx_fjsp_update_1pot_v2r8() together with
+  * the per-group energy arrays in kernel_data.
+  *
+  * Parameters:
+  *   nlist        neighbor list; nri, iinr, jindex, jjnr, shift and gid are read
+  *   xx           coordinates, addressed here as a flat real array with DIM reals per atom
+  *   ff           forces, addressed the same way as xx
+  *   fr           provides shift_vec, fshift, epsfac, ic->k_rf, ic->c_rf, ntype and nbfp
+  *   mdatoms      provides chargeA and typeA
+  *   kernel_data  provides table_vdw (data, scale) and energygrp_elec/energygrp_vdw
+  *   nrnb         flop accounting, updated once through inc_nrnb() at the end
+  *
+  * NOTE(review): the i-site charges and the VdW type offset are read once from
+  * nlist->iinr[0] and reused for every list entry, so all i waters in the list are
+  * presumably expected to share parameters; that read also happens when nri is 0.
+  * Confirm against the callers.
+  */
+ void
+ nb_kernel_ElecRF_VdwCSTab_GeomW4P1_VF_sparc64_hpc_ace_double
+                     (t_nblist * gmx_restrict                nlist,
+                      rvec * gmx_restrict                    xx,
+                      rvec * gmx_restrict                    ff,
+                      t_forcerec * gmx_restrict              fr,
+                      t_mdatoms * gmx_restrict               mdatoms,
+                      nb_kernel_data_t * gmx_restrict        kernel_data,
+                      t_nrnb * gmx_restrict                  nrnb)
+ {
+     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+      * just 0 for non-waters.
+      * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+      * jnr indices corresponding to data put in the two positions in the SIMD register.
+      */
+     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+     int              jnrA,jnrB;
+     int              j_coord_offsetA,j_coord_offsetB;
+     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+     real             rcutoff_scalar;
+     real             *shiftvec,*fshift,*x,*f;
+     _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+     int              vdwioffset0;
+     _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+     int              vdwioffset1;
+     _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+     int              vdwioffset2;
+     _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+     int              vdwioffset3;
+     _fjsp_v2r8       ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+     int              vdwjidx0A,vdwjidx0B;
+     _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+     _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+     _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+     _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+     _fjsp_v2r8       dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30;
+     _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+     real             *charge;
+     int              nvdwtype;
+     _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+     int              *vdwtype;
+     real             *vdwparam;
+     _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+     _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+     _fjsp_v2r8       rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF,twovfeps;
+     real             *vftab;
+     _fjsp_v2r8       itab_tmp;
+     _fjsp_v2r8       dummy_mask,cutoff_mask;
+     _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+     _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+     union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv; /* only vfconv is used below: it exposes the two table indices stored from a SIMD register as integers */
+     x                = xx[0]; /* flat real view of the coordinates */
+     f                = ff[0]; /* flat real view of the forces */
+     nri              = nlist->nri;
+     iinr             = nlist->iinr;
+     jindex           = nlist->jindex;
+     jjnr             = nlist->jjnr;
+     shiftidx         = nlist->shift;
+     gid              = nlist->gid;
+     shiftvec         = fr->shift_vec[0];
+     fshift           = fr->fshift[0];
+     facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+     charge           = mdatoms->chargeA;
+     krf              = gmx_fjsp_set1_v2r8(fr->ic->k_rf);
+     krf2             = gmx_fjsp_set1_v2r8(fr->ic->k_rf*2.0);
+     crf              = gmx_fjsp_set1_v2r8(fr->ic->c_rf);
+     nvdwtype         = fr->ntype;
+     vdwparam         = fr->nbfp;
+     vdwtype          = mdatoms->typeA;
+     vftab            = kernel_data->table_vdw->data;
+     vftabscale       = gmx_fjsp_set1_v2r8(kernel_data->table_vdw->scale);
+     /* Setup water-specific parameters, taken from the first i entry of the list:
+      * sites 1-3 get their charge pre-multiplied by facel, site 0 gets the offset
+      * of its row in the VdW parameter array (two reals per type pair).
+      */
+     inr              = nlist->iinr[0];
+     iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+     iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+     iq3              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+3]));
+     vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+     /* Give these a defined value up front to silence compiler warnings about possibly uninitialized use */
+     jnrA = jnrB = 0;
+     j_coord_offsetA = 0;
+     j_coord_offsetB = 0;
+     outeriter        = 0;
+     inneriter        = 0;
+     /* Start outer loop over neighborlists */
+     for(iidx=0; iidx<nri; iidx++)
+     {
+         /* Load shift vector for this list */
+         i_shift_offset   = DIM*shiftidx[iidx];
+         /* Load limits for loop over neighbors */
+         j_index_start    = jindex[iidx];
+         j_index_end      = jindex[iidx+1];
+         /* Get outer coordinate index */
+         inr              = iinr[iidx];
+         i_coord_offset   = DIM*inr;
+         /* Load i particle coords and add shift vector */
+         gmx_fjsp_load_shift_and_4rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                  &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+         fix0             = _fjsp_setzero_v2r8();
+         fiy0             = _fjsp_setzero_v2r8();
+         fiz0             = _fjsp_setzero_v2r8();
+         fix1             = _fjsp_setzero_v2r8();
+         fiy1             = _fjsp_setzero_v2r8();
+         fiz1             = _fjsp_setzero_v2r8();
+         fix2             = _fjsp_setzero_v2r8();
+         fiy2             = _fjsp_setzero_v2r8();
+         fiz2             = _fjsp_setzero_v2r8();
+         fix3             = _fjsp_setzero_v2r8();
+         fiy3             = _fjsp_setzero_v2r8();
+         fiz3             = _fjsp_setzero_v2r8();
+         /* Reset potential sums */
+         velecsum         = _fjsp_setzero_v2r8();
+         vvdwsum          = _fjsp_setzero_v2r8();
+         /* Start inner kernel loop: j particles are processed two at a time (A and B) */
+         for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+         {
+             /* Get j neighbor index, and coordinate index */
+             jnrA             = jjnr[jidx];
+             jnrB             = jjnr[jidx+1];
+             j_coord_offsetA  = DIM*jnrA;
+             j_coord_offsetB  = DIM*jnrB;
+             /* load j atom coordinates */
+             gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                               &jx0,&jy0,&jz0);
+             /* Calculate displacement vector */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             dx10             = _fjsp_sub_v2r8(ix1,jx0);
+             dy10             = _fjsp_sub_v2r8(iy1,jy0);
+             dz10             = _fjsp_sub_v2r8(iz1,jz0);
+             dx20             = _fjsp_sub_v2r8(ix2,jx0);
+             dy20             = _fjsp_sub_v2r8(iy2,jy0);
+             dz20             = _fjsp_sub_v2r8(iz2,jz0);
+             dx30             = _fjsp_sub_v2r8(ix3,jx0);
+             dy30             = _fjsp_sub_v2r8(iy3,jy0);
+             dz30             = _fjsp_sub_v2r8(iz3,jz0);
+             /* Calculate squared distance and things based on it */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+             rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+             rsq30            = gmx_fjsp_calc_rsq_v2r8(dx30,dy30,dz30);
+             rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+             rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+             rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+             rinv30           = gmx_fjsp_invsqrt_v2r8(rsq30);
+             rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+             rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+             rinvsq30         = _fjsp_mul_v2r8(rinv30,rinv30);
+             /* Load parameters for j particles */
+             jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+             vdwjidx0A        = 2*vdwtype[jnrA+0];
+             vdwjidx0B        = 2*vdwtype[jnrB+0];
+             fjx0             = _fjsp_setzero_v2r8();
+             fjy0             = _fjsp_setzero_v2r8();
+             fjz0             = _fjsp_setzero_v2r8();
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+             /* Compute parameters for interactions between i and j atoms */
+             gmx_fjsp_load_2pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,
+                                          vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
+             /* Calculate table index by multiplying r with table scale and truncate to integer */
+             rt               = _fjsp_mul_v2r8(r00,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 8; /* 8 reals per table point: offsets 0-3 are read for dispersion, 4-7 for repulsion */
+             vfconv.i[1]     *= 8;
+             /* CUBIC SPLINE TABLE DISPERSION */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 2 );
+             H                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 2 );
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+             VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+             vvdw6            = _fjsp_mul_v2r8(c6_00,VV);
+             FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+             fvdw6            = _fjsp_mul_v2r8(c6_00,FF);
+             /* CUBIC SPLINE TABLE REPULSION */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 4 );
+             F                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 4 );
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 6 );
+             H                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 6 );
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+             VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+             vvdw12           = _fjsp_mul_v2r8(c12_00,VV);
+             FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+             fvdw12           = _fjsp_mul_v2r8(c12_00,FF);
+             vvdw             = _fjsp_add_v2r8(vvdw12,vvdw6);
+             fvdw             = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_add_v2r8(fvdw6,fvdw12),_fjsp_mul_v2r8(vftabscale,rinv00)));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+             fscal            = fvdw;
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             
+             fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* Compute parameters for interactions between i and j atoms */
+             qq10             = _fjsp_mul_v2r8(iq1,jq0);
+             /* REACTION-FIELD ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq10,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq10,rinv10),crf));
+             felec            = _fjsp_mul_v2r8(qq10,_fjsp_msub_v2r8(rinv10,rinvsq10,krf2));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+             
+             fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* Compute parameters for interactions between i and j atoms */
+             qq20             = _fjsp_mul_v2r8(iq2,jq0);
+             /* REACTION-FIELD ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq20,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq20,rinv20),crf));
+             felec            = _fjsp_mul_v2r8(qq20,_fjsp_msub_v2r8(rinv20,rinvsq20,krf2));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+             
+             fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* Compute parameters for interactions between i and j atoms */
+             qq30             = _fjsp_mul_v2r8(iq3,jq0);
+             /* REACTION-FIELD ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq30,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq30,rinv30),crf));
+             felec            = _fjsp_mul_v2r8(qq30,_fjsp_msub_v2r8(rinv30,rinvsq30,krf2));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             /* Update vectorial force */
+             fix3             = _fjsp_madd_v2r8(dx30,fscal,fix3);
+             fiy3             = _fjsp_madd_v2r8(dy30,fscal,fiy3);
+             fiz3             = _fjsp_madd_v2r8(dz30,fscal,fiz3);
+             
+             fjx0             = _fjsp_madd_v2r8(dx30,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy30,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz30,fscal,fjz0);
+             gmx_fjsp_decrement_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0);
+             /* Inner loop uses 167 flops */
+         }
+         if(jidx<j_index_end) /* an odd j count leaves one unpaired j particle; handle it on its own */
+         {
+             jnrA             = jjnr[jidx];
+             j_coord_offsetA  = DIM*jnrA;
+             /* load j atom coordinates */
+             gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                               &jx0,&jy0,&jz0);
+             /* Calculate displacement vector */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             dx10             = _fjsp_sub_v2r8(ix1,jx0);
+             dy10             = _fjsp_sub_v2r8(iy1,jy0);
+             dz10             = _fjsp_sub_v2r8(iz1,jz0);
+             dx20             = _fjsp_sub_v2r8(ix2,jx0);
+             dy20             = _fjsp_sub_v2r8(iy2,jy0);
+             dz20             = _fjsp_sub_v2r8(iz2,jz0);
+             dx30             = _fjsp_sub_v2r8(ix3,jx0);
+             dy30             = _fjsp_sub_v2r8(iy3,jy0);
+             dz30             = _fjsp_sub_v2r8(iz3,jz0);
+             /* Calculate squared distance and things based on it */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+             rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+             rsq30            = gmx_fjsp_calc_rsq_v2r8(dx30,dy30,dz30);
+             rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+             rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+             rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+             rinv30           = gmx_fjsp_invsqrt_v2r8(rsq30);
+             rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+             rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+             rinvsq30         = _fjsp_mul_v2r8(rinv30,rinv30);
+             /* Load parameters for j particles */
+             jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0); /* presumably fills one lane with the charge and leaves the other zero -- TODO confirm */
+             vdwjidx0A        = 2*vdwtype[jnrA+0];
+             fjx0             = _fjsp_setzero_v2r8();
+             fjy0             = _fjsp_setzero_v2r8();
+             fjz0             = _fjsp_setzero_v2r8();
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+             /* Compute parameters for interactions between i and j atoms */
+             gmx_fjsp_load_1pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
+             /* Calculate table index by multiplying r with table scale and truncate to integer */
+             rt               = _fjsp_mul_v2r8(r00,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 8;
+             vfconv.i[1]     *= 8; /* not read by the table loads in this single-particle branch */
+             /* CUBIC SPLINE TABLE DISPERSION */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_setzero_v2r8(); /* zeros stand in for the table row of the missing second j particle */
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 2 );
+             H                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+             VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+             vvdw6            = _fjsp_mul_v2r8(c6_00,VV);
+             FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+             fvdw6            = _fjsp_mul_v2r8(c6_00,FF);
+             /* CUBIC SPLINE TABLE REPULSION */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 4 );
+             F                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 6 );
+             H                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+             VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+             vvdw12           = _fjsp_mul_v2r8(c12_00,VV);
+             FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+             fvdw12           = _fjsp_mul_v2r8(c12_00,FF);
+             vvdw             = _fjsp_add_v2r8(vvdw12,vvdw6);
+             fvdw             = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_add_v2r8(fvdw6,fvdw12),_fjsp_mul_v2r8(vftabscale,rinv00)));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             vvdw             = _fjsp_unpacklo_v2r8(vvdw,_fjsp_setzero_v2r8()); /* presumably clears the lane that has no j particle before summing -- TODO confirm */
+             vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+             fscal            = fvdw;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             
+             fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* Compute parameters for interactions between i and j atoms */
+             qq10             = _fjsp_mul_v2r8(iq1,jq0);
+             /* REACTION-FIELD ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq10,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq10,rinv10),crf));
+             felec            = _fjsp_mul_v2r8(qq10,_fjsp_msub_v2r8(rinv10,rinvsq10,krf2));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+             
+             fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* Compute parameters for interactions between i and j atoms */
+             qq20             = _fjsp_mul_v2r8(iq2,jq0);
+             /* REACTION-FIELD ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq20,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq20,rinv20),crf));
+             felec            = _fjsp_mul_v2r8(qq20,_fjsp_msub_v2r8(rinv20,rinvsq20,krf2));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+             
+             fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* Compute parameters for interactions between i and j atoms */
+             qq30             = _fjsp_mul_v2r8(iq3,jq0);
+             /* REACTION-FIELD ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq30,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq30,rinv30),crf));
+             felec            = _fjsp_mul_v2r8(qq30,_fjsp_msub_v2r8(rinv30,rinvsq30,krf2));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix3             = _fjsp_madd_v2r8(dx30,fscal,fix3);
+             fiy3             = _fjsp_madd_v2r8(dy30,fscal,fiy3);
+             fiz3             = _fjsp_madd_v2r8(dz30,fscal,fiz3);
+             
+             fjx0             = _fjsp_madd_v2r8(dx30,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy30,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz30,fscal,fjz0);
+             gmx_fjsp_decrement_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0);
+             /* Inner loop uses 167 flops */
+         }
+         /* End of innermost loop */
+         gmx_fjsp_update_iforce_4atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                               f+i_coord_offset,fshift+i_shift_offset);
+         ggid                        = gid[iidx];
+         /* Update potential energies */
+         gmx_fjsp_update_1pot_v2r8(velecsum,kernel_data->energygrp_elec+ggid);
+         gmx_fjsp_update_1pot_v2r8(vvdwsum,kernel_data->energygrp_vdw+ggid);
+         /* Increment number of inner iterations */
+         inneriter                  += j_index_end - j_index_start;
+         /* Outer loop uses 26 flops */
+     }
+     /* Increment number of outer iterations */
+     outeriter        += nri;
+     /* Update outer/inner flops, using the per-iteration counts quoted in the loop comments (26 outer, 167 inner) */
+     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_VF,outeriter*26 + inneriter*167);
+ }
+ /*
+  * Gromacs nonbonded kernel:   nb_kernel_ElecRF_VdwCSTab_GeomW4P1_F_sparc64_hpc_ace_double
+  * Electrostatics interaction: ReactionField
+  * VdW interaction:            CubicSplineTable
+  * Geometry:                   Water4-Particle
+  * Calculate force/pot:        Force
+  *
+  * Force-only kernel: for every i entry of the neighborlist it walks the j entries
+  * two at a time (plus a single-j remainder), accumulates forces for the four i sites
+  * and for the j particles, and finally reports a flop count through inc_nrnb().
+  * In this kernel i site 0 only takes part in the tabulated VdW interaction, while
+  * i sites 1-3 only take part in the reaction-field electrostatics.
+  * No potential energies are accumulated or written.
+  *
+  * nlist        neighborlist; nri, iinr, jindex, jjnr, shift and gid are read from it
+  * xx           coordinates, accessed as a flat real array through xx[0]
+  * ff           forces, handed (through ff[0]) to the helpers that apply the i and j forces
+  * fr           supplies shift_vec, fshift, epsfac, ic->k_rf, ic->c_rf, ntype and nbfp
+  * mdatoms      supplies chargeA and typeA
+  * kernel_data  supplies the VdW table (table_vdw->data and table_vdw->scale)
+  * nrnb         counter structure passed to inc_nrnb() at the end
+  */
+ void
+ nb_kernel_ElecRF_VdwCSTab_GeomW4P1_F_sparc64_hpc_ace_double
+                     (t_nblist * gmx_restrict                nlist,
+                      rvec * gmx_restrict                    xx,
+                      rvec * gmx_restrict                    ff,
+                      t_forcerec * gmx_restrict              fr,
+                      t_mdatoms * gmx_restrict               mdatoms,
+                      nb_kernel_data_t * gmx_restrict        kernel_data,
+                      t_nrnb * gmx_restrict                  nrnb)
+ {
+     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+      * just 0 for non-waters.
+      * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+      * jnr indices corresponding to data put in the two positions in the SIMD register.
+      * Note that a number of the locals declared below (e.g. the cutoff, mask and potential
+      * variables) are never referenced in this force-only kernel.
+      */
+     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+     int              jnrA,jnrB;
+     int              j_coord_offsetA,j_coord_offsetB;
+     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+     real             rcutoff_scalar;
+     real             *shiftvec,*fshift,*x,*f;
+     _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+     int              vdwioffset0;
+     _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+     int              vdwioffset1;
+     _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+     int              vdwioffset2;
+     _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+     int              vdwioffset3;
+     _fjsp_v2r8       ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+     int              vdwjidx0A,vdwjidx0B;
+     _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+     _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+     _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+     _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+     _fjsp_v2r8       dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30;
+     _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+     real             *charge;
+     int              nvdwtype;
+     _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+     int              *vdwtype;
+     real             *vdwparam;
+     _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+     _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+     _fjsp_v2r8       rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF,twovfeps;
+     real             *vftab;
+     _fjsp_v2r8       itab_tmp;
+     _fjsp_v2r8       dummy_mask,cutoff_mask;
+     _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+     _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+     union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv; /* only vfconv is used in this kernel */
+     x                = xx[0];
+     f                = ff[0];
+     nri              = nlist->nri;
+     iinr             = nlist->iinr;
+     jindex           = nlist->jindex;
+     jjnr             = nlist->jjnr;
+     shiftidx         = nlist->shift;
+     gid              = nlist->gid; /* set but not referenced again in this force-only kernel */
+     shiftvec         = fr->shift_vec[0];
+     fshift           = fr->fshift[0];
+     facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+     charge           = mdatoms->chargeA;
+     krf              = gmx_fjsp_set1_v2r8(fr->ic->k_rf); /* set but not referenced again; only krf2 enters the force */
+     krf2             = gmx_fjsp_set1_v2r8(fr->ic->k_rf*2.0);
+     crf              = gmx_fjsp_set1_v2r8(fr->ic->c_rf); /* set but not referenced again in this force-only kernel */
+     nvdwtype         = fr->ntype;
+     vdwparam         = fr->nbfp;
+     vdwtype          = mdatoms->typeA;
+     vftab            = kernel_data->table_vdw->data;
+     vftabscale       = gmx_fjsp_set1_v2r8(kernel_data->table_vdw->scale);
+     /* Setup water-specific parameters.
+      * The charges of sites 1-3 (pre-multiplied by facel) and the VdW offset of site 0 are taken
+      * from the first i entry only and reused for every i entry in the outer loop.
+      * NOTE(review): presumably all i molecules in the list share charges and VdW type - confirm.
+      * NOTE(review): iinr[0] is read before nri is checked - confirm callers never pass an empty list.
+      */
+     inr              = nlist->iinr[0];
+     iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+     iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+     iq3              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+3]));
+     vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+     /* Pre-initialize the j indices and offsets so compilers do not warn about possibly uninitialized use */
+     jnrA = jnrB = 0;
+     j_coord_offsetA = 0;
+     j_coord_offsetB = 0;
+     outeriter        = 0;
+     inneriter        = 0;
+     /* Start outer loop over neighborlists */
+     for(iidx=0; iidx<nri; iidx++)
+     {
+         /* Load shift vector for this list */
+         i_shift_offset   = DIM*shiftidx[iidx];
+         /* Load limits for loop over neighbors */
+         j_index_start    = jindex[iidx];
+         j_index_end      = jindex[iidx+1];
+         /* Get outer coordinate index */
+         inr              = iinr[iidx];
+         i_coord_offset   = DIM*inr;
+         /* Load i particle coords and add shift vector */
+         gmx_fjsp_load_shift_and_4rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                  &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+         fix0             = _fjsp_setzero_v2r8(); /* clear the force accumulators of the four i sites for this i entry */
+         fiy0             = _fjsp_setzero_v2r8();
+         fiz0             = _fjsp_setzero_v2r8();
+         fix1             = _fjsp_setzero_v2r8();
+         fiy1             = _fjsp_setzero_v2r8();
+         fiz1             = _fjsp_setzero_v2r8();
+         fix2             = _fjsp_setzero_v2r8();
+         fiy2             = _fjsp_setzero_v2r8();
+         fiz2             = _fjsp_setzero_v2r8();
+         fix3             = _fjsp_setzero_v2r8();
+         fiy3             = _fjsp_setzero_v2r8();
+         fiz3             = _fjsp_setzero_v2r8();
+         /* Start inner kernel loop: two j entries (A and B) per iteration.
+          * The loop bound leaves an odd last entry to the single-j block after the loop.
+          */
+         for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+         {
+             /* Get j neighbor index, and coordinate index */
+             jnrA             = jjnr[jidx];
+             jnrB             = jjnr[jidx+1];
+             j_coord_offsetA  = DIM*jnrA;
+             j_coord_offsetB  = DIM*jnrB;
+             /* load j atom coordinates */
+             gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                               &jx0,&jy0,&jz0);
+             /* Calculate displacement vector */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             dx10             = _fjsp_sub_v2r8(ix1,jx0);
+             dy10             = _fjsp_sub_v2r8(iy1,jy0);
+             dz10             = _fjsp_sub_v2r8(iz1,jz0);
+             dx20             = _fjsp_sub_v2r8(ix2,jx0);
+             dy20             = _fjsp_sub_v2r8(iy2,jy0);
+             dz20             = _fjsp_sub_v2r8(iz2,jz0);
+             dx30             = _fjsp_sub_v2r8(ix3,jx0);
+             dy30             = _fjsp_sub_v2r8(iy3,jy0);
+             dz30             = _fjsp_sub_v2r8(iz3,jz0);
+             /* Calculate squared distance and things based on it.
+              * rinvsq is only formed for pairs 10, 20 and 30; pair 00 uses the table and needs r and rinv only.
+              */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+             rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+             rsq30            = gmx_fjsp_calc_rsq_v2r8(dx30,dy30,dz30);
+             rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+             rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+             rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+             rinv30           = gmx_fjsp_invsqrt_v2r8(rsq30);
+             rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+             rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+             rinvsq30         = _fjsp_mul_v2r8(rinv30,rinv30);
+             /* Load parameters for j particles */
+             jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+             vdwjidx0A        = 2*vdwtype[jnrA+0];
+             vdwjidx0B        = 2*vdwtype[jnrB+0];
+             fjx0             = _fjsp_setzero_v2r8(); /* j-force accumulators, restarted for every pair of j entries */
+             fjy0             = _fjsp_setzero_v2r8();
+             fjz0             = _fjsp_setzero_v2r8();
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r00              = _fjsp_mul_v2r8(rsq00,rinv00); /* r = rsq*rinv */
+             /* Compute parameters for interactions between i site 0 and the j atoms (VdW only) */
+             gmx_fjsp_load_2pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,
+                                          vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
+             /* Calculate table index by multiplying r with table scale and truncate to integer */
+             rt               = _fjsp_mul_v2r8(r00,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 8; /* 8 reals per table point: offsets 0-3 are read for dispersion, 4-7 for repulsion */
+             vfconv.i[1]     *= 8;
+             /* CUBIC SPLINE TABLE DISPERSION
+              * Each load fetches two neighbouring table values for one j entry (index i[0] or i[1]);
+              * the transpose macro then regroups them per coefficient across the two j entries.
+              * Y is not referenced after the transpose in this force-only kernel.
+              * NOTE(review): assuming _fjsp_madd_v2r8(a,b,c) evaluates a*b+c, the two lines below
+              * give FF = F + 2*G*eps + 3*H*eps^2 - confirm against the intrinsic documentation.
+              */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 2 );
+             H                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 2 );
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+             FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+             fvdw6            = _fjsp_mul_v2r8(c6_00,FF);
+             /* CUBIC SPLINE TABLE REPULSION (same scheme, table offsets 4-7, scaled by c12) */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 4 );
+             F                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 4 );
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 6 );
+             H                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 6 );
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+             FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+             fvdw12           = _fjsp_mul_v2r8(c12_00,FF);
+             fvdw             = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_add_v2r8(fvdw6,fvdw12),_fjsp_mul_v2r8(vftabscale,rinv00)));
+             fscal            = fvdw;
+             /* Update vectorial force: the same d*fscal term is accumulated for the i site and for the j atoms;
+              * the j sum is handed to the "decrement" helper at the end of the iteration.
+              */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             
+             fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* Compute parameters for interactions between i site 1 and the j atoms (electrostatics only) */
+             qq10             = _fjsp_mul_v2r8(iq1,jq0);
+             /* REACTION-FIELD ELECTROSTATICS (force only).
+              * NOTE(review): assuming _fjsp_msub_v2r8(a,b,c) evaluates a*b-c, this is
+              * felec = qq*(rinv*rinvsq - krf2) - confirm against the intrinsic documentation.
+              */
+             felec            = _fjsp_mul_v2r8(qq10,_fjsp_msub_v2r8(rinv10,rinvsq10,krf2));
+             fscal            = felec;
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+             
+             fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* Compute parameters for interactions between i site 2 and the j atoms (electrostatics only) */
+             qq20             = _fjsp_mul_v2r8(iq2,jq0);
+             /* REACTION-FIELD ELECTROSTATICS (force only, same expression as for site 1) */
+             felec            = _fjsp_mul_v2r8(qq20,_fjsp_msub_v2r8(rinv20,rinvsq20,krf2));
+             fscal            = felec;
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+             
+             fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* Compute parameters for interactions between i site 3 and the j atoms (electrostatics only) */
+             qq30             = _fjsp_mul_v2r8(iq3,jq0);
+             /* REACTION-FIELD ELECTROSTATICS (force only, same expression as for site 1) */
+             felec            = _fjsp_mul_v2r8(qq30,_fjsp_msub_v2r8(rinv30,rinvsq30,krf2));
+             fscal            = felec;
+             /* Update vectorial force */
+             fix3             = _fjsp_madd_v2r8(dx30,fscal,fix3);
+             fiy3             = _fjsp_madd_v2r8(dy30,fscal,fiy3);
+             fiz3             = _fjsp_madd_v2r8(dz30,fscal,fiz3);
+             
+             fjx0             = _fjsp_madd_v2r8(dx30,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy30,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz30,fscal,fjz0);
+             gmx_fjsp_decrement_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0);
+             /* Inner loop uses 144 flops */
+         }
+         if(jidx<j_index_end) /* one j entry is left over when the number of j entries is odd */
+         {
+             jnrA             = jjnr[jidx];
+             j_coord_offsetA  = DIM*jnrA;
+             /* load j atom coordinates */
+             gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                               &jx0,&jy0,&jz0);
+             /* Calculate displacement vector */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             dx10             = _fjsp_sub_v2r8(ix1,jx0);
+             dy10             = _fjsp_sub_v2r8(iy1,jy0);
+             dz10             = _fjsp_sub_v2r8(iz1,jz0);
+             dx20             = _fjsp_sub_v2r8(ix2,jx0);
+             dy20             = _fjsp_sub_v2r8(iy2,jy0);
+             dz20             = _fjsp_sub_v2r8(iz2,jz0);
+             dx30             = _fjsp_sub_v2r8(ix3,jx0);
+             dy30             = _fjsp_sub_v2r8(iy3,jy0);
+             dz30             = _fjsp_sub_v2r8(iz3,jz0);
+             /* Calculate squared distance and things based on it */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+             rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+             rsq30            = gmx_fjsp_calc_rsq_v2r8(dx30,dy30,dz30);
+             rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+             rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+             rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+             rinv30           = gmx_fjsp_invsqrt_v2r8(rsq30);
+             rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+             rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+             rinvsq30         = _fjsp_mul_v2r8(rinv30,rinv30);
+             /* Load parameters for the single j particle.
+              * NOTE(review): _fjsp_loadl_v2r8 presumably fills one lane from memory and keeps the zero in the other - confirm.
+              */
+             jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
+             vdwjidx0A        = 2*vdwtype[jnrA+0];
+             fjx0             = _fjsp_setzero_v2r8();
+             fjy0             = _fjsp_setzero_v2r8();
+             fjz0             = _fjsp_setzero_v2r8();
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r00              = _fjsp_mul_v2r8(rsq00,rinv00); /* r = rsq*rinv */
+             /* Compute parameters for interactions between i site 0 and the j atom (VdW only) */
+             gmx_fjsp_load_1pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
+             /* Calculate table index by multiplying r with table scale and truncate to integer */
+             rt               = _fjsp_mul_v2r8(r00,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 8;
+             vfconv.i[1]     *= 8; /* i[1] is not used for any table load in this single-j block */
+             /* CUBIC SPLINE TABLE DISPERSION
+              * Only the table entry at index i[0] is loaded; the second transpose operand is set to zero instead.
+              */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 2 );
+             H                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+             FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+             fvdw6            = _fjsp_mul_v2r8(c6_00,FF);
+             /* CUBIC SPLINE TABLE REPULSION (same scheme, table offsets 4-7, scaled by c12) */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 4 );
+             F                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 6 );
+             H                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+             FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+             fvdw12           = _fjsp_mul_v2r8(c12_00,FF);
+             fvdw             = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_add_v2r8(fvdw6,fvdw12),_fjsp_mul_v2r8(vftabscale,rinv00)));
+             fscal            = fvdw;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8()); /* NOTE(review): presumably keeps one lane and zeroes the unused one so it adds no force - confirm */
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             
+             fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* Compute parameters for interactions between i site 1 and the j atom (electrostatics only) */
+             qq10             = _fjsp_mul_v2r8(iq1,jq0);
+             /* REACTION-FIELD ELECTROSTATICS (force only) */
+             felec            = _fjsp_mul_v2r8(qq10,_fjsp_msub_v2r8(rinv10,rinvsq10,krf2));
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+             
+             fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* Compute parameters for interactions between i site 2 and the j atom (electrostatics only) */
+             qq20             = _fjsp_mul_v2r8(iq2,jq0);
+             /* REACTION-FIELD ELECTROSTATICS (force only) */
+             felec            = _fjsp_mul_v2r8(qq20,_fjsp_msub_v2r8(rinv20,rinvsq20,krf2));
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+             
+             fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* Compute parameters for interactions between i site 3 and the j atom (electrostatics only) */
+             qq30             = _fjsp_mul_v2r8(iq3,jq0);
+             /* REACTION-FIELD ELECTROSTATICS (force only) */
+             felec            = _fjsp_mul_v2r8(qq30,_fjsp_msub_v2r8(rinv30,rinvsq30,krf2));
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix3             = _fjsp_madd_v2r8(dx30,fscal,fix3);
+             fiy3             = _fjsp_madd_v2r8(dy30,fscal,fiy3);
+             fiz3             = _fjsp_madd_v2r8(dz30,fscal,fiz3);
+             
+             fjx0             = _fjsp_madd_v2r8(dx30,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy30,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz30,fscal,fjz0);
+             gmx_fjsp_decrement_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0);
+             /* Inner loop uses 144 flops */
+         }
+         /* End of innermost loop: hand the accumulated forces of the four i sites to the helper,
+          * together with the force and shift-force locations of this i entry.
+          */
+         gmx_fjsp_update_iforce_4atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                               f+i_coord_offset,fshift+i_shift_offset);
+         /* Increment number of inner iterations (counted per j entry, not per SIMD iteration) */
+         inneriter                  += j_index_end - j_index_start;
+         /* Outer loop uses 24 flops */
+     }
+     /* Increment number of outer iterations */
+     outeriter        += nri;
+     /* Update outer/inner flops, using the per-iteration counts quoted in the loop comments above */
+     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_F,outeriter*24 + inneriter*144);
+ }
index 0000000000000000000000000000000000000000,1688019c967439396ee56eb7391fb760929706eb..1688019c967439396ee56eb7391fb760929706eb
mode 000000,100644..100644
--- /dev/null
@@@ -1,0 -1,1779 +1,1779 @@@
+ /*
+  * This file is part of the GROMACS molecular simulation package.
+  *
+  * Copyright (c) 2012, by the GROMACS development team, led by
+  * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+  * others, as listed in the AUTHORS file in the top-level source
+  * directory and at http://www.gromacs.org.
+  *
+  * GROMACS is free software; you can redistribute it and/or
+  * modify it under the terms of the GNU Lesser General Public License
+  * as published by the Free Software Foundation; either version 2.1
+  * of the License, or (at your option) any later version.
+  *
+  * GROMACS is distributed in the hope that it will be useful,
+  * but WITHOUT ANY WARRANTY; without even the implied warranty of
+  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  * Lesser General Public License for more details.
+  *
+  * You should have received a copy of the GNU Lesser General Public
+  * License along with GROMACS; if not, see
+  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+  *
+  * If you want to redistribute modifications to GROMACS, please
+  * consider that scientific software is very special. Version
+  * control is crucial - bugs must be traceable. We will be happy to
+  * consider code for inclusion in the official distribution, but
+  * derived work must not be called official GROMACS. Details are found
+  * in the README & COPYING files - if they are missing, get the
+  * official version at http://www.gromacs.org.
+  *
+  * To help us fund GROMACS development, we humbly ask that you cite
+  * the research papers on the package. Check out http://www.gromacs.org.
+  */
+ /*
+  * Note: this file was generated by the GROMACS sparc64_hpc_ace_double kernel generator.
+  */
+ #ifdef HAVE_CONFIG_H
+ #include <config.h>
+ #endif
+ #include <math.h>
+ #include "../nb_kernel.h"
+ #include "types/simple.h"
+ #include "vec.h"
+ #include "nrnb.h"
+ #include "kernelutil_sparc64_hpc_ace_double.h"
+ /*
+  * Gromacs nonbonded kernel:   nb_kernel_ElecRF_VdwCSTab_GeomW4W4_VF_sparc64_hpc_ace_double
+  * Electrostatics interaction: ReactionField
+  * VdW interaction:            CubicSplineTable
+  * Geometry:                   Water4-Water4
+  * Calculate force/pot:        PotentialAndForce
+  */
+ void
+ nb_kernel_ElecRF_VdwCSTab_GeomW4W4_VF_sparc64_hpc_ace_double
+                     (t_nblist * gmx_restrict                nlist,
+                      rvec * gmx_restrict                    xx,
+                      rvec * gmx_restrict                    ff,
+                      t_forcerec * gmx_restrict              fr,
+                      t_mdatoms * gmx_restrict               mdatoms,
+                      nb_kernel_data_t * gmx_restrict        kernel_data,
+                      t_nrnb * gmx_restrict                  nrnb)
+ {
+     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+      * just 0 for non-waters.
+      * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+      * jnr indices corresponding to data put in the four positions in the SIMD register.
+      */
+     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+     int              jnrA,jnrB;
+     int              j_coord_offsetA,j_coord_offsetB;
+     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+     real             rcutoff_scalar;
+     real             *shiftvec,*fshift,*x,*f;
+     _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+     int              vdwioffset0;
+     _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+     int              vdwioffset1;
+     _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+     int              vdwioffset2;
+     _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+     int              vdwioffset3;
+     _fjsp_v2r8       ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+     int              vdwjidx0A,vdwjidx0B;
+     _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+     int              vdwjidx1A,vdwjidx1B;
+     _fjsp_v2r8       jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+     int              vdwjidx2A,vdwjidx2B;
+     _fjsp_v2r8       jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+     int              vdwjidx3A,vdwjidx3B;
+     _fjsp_v2r8       jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
+     _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+     _fjsp_v2r8       dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+     _fjsp_v2r8       dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+     _fjsp_v2r8       dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
+     _fjsp_v2r8       dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+     _fjsp_v2r8       dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+     _fjsp_v2r8       dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
+     _fjsp_v2r8       dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
+     _fjsp_v2r8       dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
+     _fjsp_v2r8       dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
+     _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+     real             *charge;
+     int              nvdwtype;
+     _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+     int              *vdwtype;
+     real             *vdwparam;
+     _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+     _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+     _fjsp_v2r8       rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF,twovfeps;
+     real             *vftab;
+     _fjsp_v2r8       itab_tmp;
+     _fjsp_v2r8       dummy_mask,cutoff_mask;
+     _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+     _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+     union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+     x                = xx[0];
+     f                = ff[0];
+     nri              = nlist->nri;
+     iinr             = nlist->iinr;
+     jindex           = nlist->jindex;
+     jjnr             = nlist->jjnr;
+     shiftidx         = nlist->shift;
+     gid              = nlist->gid;
+     shiftvec         = fr->shift_vec[0];
+     fshift           = fr->fshift[0];
+     facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+     charge           = mdatoms->chargeA;
+     krf              = gmx_fjsp_set1_v2r8(fr->ic->k_rf);
+     krf2             = gmx_fjsp_set1_v2r8(fr->ic->k_rf*2.0);
+     crf              = gmx_fjsp_set1_v2r8(fr->ic->c_rf);
+     nvdwtype         = fr->ntype;
+     vdwparam         = fr->nbfp;
+     vdwtype          = mdatoms->typeA;
+     vftab            = kernel_data->table_vdw->data;
+     vftabscale       = gmx_fjsp_set1_v2r8(kernel_data->table_vdw->scale);
+     /* Setup water-specific parameters */
+     inr              = nlist->iinr[0];
+     iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+     iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+     iq3              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+3]));
+     vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+     jq1              = gmx_fjsp_set1_v2r8(charge[inr+1]);
+     jq2              = gmx_fjsp_set1_v2r8(charge[inr+2]);
+     jq3              = gmx_fjsp_set1_v2r8(charge[inr+3]);
+     vdwjidx0A        = 2*vdwtype[inr+0];
+     c6_00            = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset0+vdwjidx0A]);
+     c12_00           = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset0+vdwjidx0A+1]);
+     qq11             = _fjsp_mul_v2r8(iq1,jq1);
+     qq12             = _fjsp_mul_v2r8(iq1,jq2);
+     qq13             = _fjsp_mul_v2r8(iq1,jq3);
+     qq21             = _fjsp_mul_v2r8(iq2,jq1);
+     qq22             = _fjsp_mul_v2r8(iq2,jq2);
+     qq23             = _fjsp_mul_v2r8(iq2,jq3);
+     qq31             = _fjsp_mul_v2r8(iq3,jq1);
+     qq32             = _fjsp_mul_v2r8(iq3,jq2);
+     qq33             = _fjsp_mul_v2r8(iq3,jq3);
+     /* Avoid stupid compiler warnings */
+     jnrA = jnrB = 0;
+     j_coord_offsetA = 0;
+     j_coord_offsetB = 0;
+     outeriter        = 0;
+     inneriter        = 0;
+     /* Start outer loop over neighborlists */
+     for(iidx=0; iidx<nri; iidx++)
+     {
+         /* Load shift vector for this list */
+         i_shift_offset   = DIM*shiftidx[iidx];
+         /* Load limits for loop over neighbors */
+         j_index_start    = jindex[iidx];
+         j_index_end      = jindex[iidx+1];
+         /* Get outer coordinate index */
+         inr              = iinr[iidx];
+         i_coord_offset   = DIM*inr;
+         /* Load i particle coords and add shift vector */
+         gmx_fjsp_load_shift_and_4rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                  &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+         fix0             = _fjsp_setzero_v2r8();
+         fiy0             = _fjsp_setzero_v2r8();
+         fiz0             = _fjsp_setzero_v2r8();
+         fix1             = _fjsp_setzero_v2r8();
+         fiy1             = _fjsp_setzero_v2r8();
+         fiz1             = _fjsp_setzero_v2r8();
+         fix2             = _fjsp_setzero_v2r8();
+         fiy2             = _fjsp_setzero_v2r8();
+         fiz2             = _fjsp_setzero_v2r8();
+         fix3             = _fjsp_setzero_v2r8();
+         fiy3             = _fjsp_setzero_v2r8();
+         fiz3             = _fjsp_setzero_v2r8();
+         /* Reset potential sums */
+         velecsum         = _fjsp_setzero_v2r8();
+         vvdwsum          = _fjsp_setzero_v2r8();
+         /* Start inner kernel loop */
+         for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+         {
+             /* Get j neighbor index, and coordinate index */
+             jnrA             = jjnr[jidx];
+             jnrB             = jjnr[jidx+1];
+             j_coord_offsetA  = DIM*jnrA;
+             j_coord_offsetB  = DIM*jnrB;
+             /* load j atom coordinates */
+             gmx_fjsp_load_4rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
+                                               &jy2,&jz2,&jx3,&jy3,&jz3);
+             /* Calculate displacement vector */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             dx11             = _fjsp_sub_v2r8(ix1,jx1);
+             dy11             = _fjsp_sub_v2r8(iy1,jy1);
+             dz11             = _fjsp_sub_v2r8(iz1,jz1);
+             dx12             = _fjsp_sub_v2r8(ix1,jx2);
+             dy12             = _fjsp_sub_v2r8(iy1,jy2);
+             dz12             = _fjsp_sub_v2r8(iz1,jz2);
+             dx13             = _fjsp_sub_v2r8(ix1,jx3);
+             dy13             = _fjsp_sub_v2r8(iy1,jy3);
+             dz13             = _fjsp_sub_v2r8(iz1,jz3);
+             dx21             = _fjsp_sub_v2r8(ix2,jx1);
+             dy21             = _fjsp_sub_v2r8(iy2,jy1);
+             dz21             = _fjsp_sub_v2r8(iz2,jz1);
+             dx22             = _fjsp_sub_v2r8(ix2,jx2);
+             dy22             = _fjsp_sub_v2r8(iy2,jy2);
+             dz22             = _fjsp_sub_v2r8(iz2,jz2);
+             dx23             = _fjsp_sub_v2r8(ix2,jx3);
+             dy23             = _fjsp_sub_v2r8(iy2,jy3);
+             dz23             = _fjsp_sub_v2r8(iz2,jz3);
+             dx31             = _fjsp_sub_v2r8(ix3,jx1);
+             dy31             = _fjsp_sub_v2r8(iy3,jy1);
+             dz31             = _fjsp_sub_v2r8(iz3,jz1);
+             dx32             = _fjsp_sub_v2r8(ix3,jx2);
+             dy32             = _fjsp_sub_v2r8(iy3,jy2);
+             dz32             = _fjsp_sub_v2r8(iz3,jz2);
+             dx33             = _fjsp_sub_v2r8(ix3,jx3);
+             dy33             = _fjsp_sub_v2r8(iy3,jy3);
+             dz33             = _fjsp_sub_v2r8(iz3,jz3);
+             /* Calculate squared distance and things based on it */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+             rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+             rsq13            = gmx_fjsp_calc_rsq_v2r8(dx13,dy13,dz13);
+             rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+             rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+             rsq23            = gmx_fjsp_calc_rsq_v2r8(dx23,dy23,dz23);
+             rsq31            = gmx_fjsp_calc_rsq_v2r8(dx31,dy31,dz31);
+             rsq32            = gmx_fjsp_calc_rsq_v2r8(dx32,dy32,dz32);
+             rsq33            = gmx_fjsp_calc_rsq_v2r8(dx33,dy33,dz33);
+             rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+             rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+             rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+             rinv13           = gmx_fjsp_invsqrt_v2r8(rsq13);
+             rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+             rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+             rinv23           = gmx_fjsp_invsqrt_v2r8(rsq23);
+             rinv31           = gmx_fjsp_invsqrt_v2r8(rsq31);
+             rinv32           = gmx_fjsp_invsqrt_v2r8(rsq32);
+             rinv33           = gmx_fjsp_invsqrt_v2r8(rsq33);
+             rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+             rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+             rinvsq13         = _fjsp_mul_v2r8(rinv13,rinv13);
+             rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+             rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+             rinvsq23         = _fjsp_mul_v2r8(rinv23,rinv23);
+             rinvsq31         = _fjsp_mul_v2r8(rinv31,rinv31);
+             rinvsq32         = _fjsp_mul_v2r8(rinv32,rinv32);
+             rinvsq33         = _fjsp_mul_v2r8(rinv33,rinv33);
+             fjx0             = _fjsp_setzero_v2r8();
+             fjy0             = _fjsp_setzero_v2r8();
+             fjz0             = _fjsp_setzero_v2r8();
+             fjx1             = _fjsp_setzero_v2r8();
+             fjy1             = _fjsp_setzero_v2r8();
+             fjz1             = _fjsp_setzero_v2r8();
+             fjx2             = _fjsp_setzero_v2r8();
+             fjy2             = _fjsp_setzero_v2r8();
+             fjz2             = _fjsp_setzero_v2r8();
+             fjx3             = _fjsp_setzero_v2r8();
+             fjy3             = _fjsp_setzero_v2r8();
+             fjz3             = _fjsp_setzero_v2r8();
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+             /* Calculate table index by multiplying r with table scale and truncate to integer */
+             rt               = _fjsp_mul_v2r8(r00,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 8;
+             vfconv.i[1]     *= 8;
+             /* CUBIC SPLINE TABLE DISPERSION */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 2 );
+             H                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 2 );
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+             VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+             vvdw6            = _fjsp_mul_v2r8(c6_00,VV);
+             FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+             fvdw6            = _fjsp_mul_v2r8(c6_00,FF);
+             /* CUBIC SPLINE TABLE REPULSION */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 4 );
+             F                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 4 );
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 6 );
+             H                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 6 );
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+             VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+             vvdw12           = _fjsp_mul_v2r8(c12_00,VV);
+             FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+             fvdw12           = _fjsp_mul_v2r8(c12_00,FF);
+             vvdw             = _fjsp_add_v2r8(vvdw12,vvdw6);
+             fvdw             = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_add_v2r8(fvdw6,fvdw12),_fjsp_mul_v2r8(vftabscale,rinv00)));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+             fscal            = fvdw;
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             
+             fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* REACTION-FIELD ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq11,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq11,rinv11),crf));
+             felec            = _fjsp_mul_v2r8(qq11,_fjsp_msub_v2r8(rinv11,rinvsq11,krf2));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+             
+             fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* REACTION-FIELD ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq12,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq12,rinv12),crf));
+             felec            = _fjsp_mul_v2r8(qq12,_fjsp_msub_v2r8(rinv12,rinvsq12,krf2));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+             
+             fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* REACTION-FIELD ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq13,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq13,rinv13),crf));
+             felec            = _fjsp_mul_v2r8(qq13,_fjsp_msub_v2r8(rinv13,rinvsq13,krf2));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx13,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy13,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz13,fscal,fiz1);
+             
+             fjx3             = _fjsp_madd_v2r8(dx13,fscal,fjx3);
+             fjy3             = _fjsp_madd_v2r8(dy13,fscal,fjy3);
+             fjz3             = _fjsp_madd_v2r8(dz13,fscal,fjz3);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* REACTION-FIELD ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq21,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq21,rinv21),crf));
+             felec            = _fjsp_mul_v2r8(qq21,_fjsp_msub_v2r8(rinv21,rinvsq21,krf2));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+             
+             fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* REACTION-FIELD ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq22,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq22,rinv22),crf));
+             felec            = _fjsp_mul_v2r8(qq22,_fjsp_msub_v2r8(rinv22,rinvsq22,krf2));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+             
+             fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* REACTION-FIELD ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq23,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq23,rinv23),crf));
+             felec            = _fjsp_mul_v2r8(qq23,_fjsp_msub_v2r8(rinv23,rinvsq23,krf2));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx23,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy23,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz23,fscal,fiz2);
+             
+             fjx3             = _fjsp_madd_v2r8(dx23,fscal,fjx3);
+             fjy3             = _fjsp_madd_v2r8(dy23,fscal,fjy3);
+             fjz3             = _fjsp_madd_v2r8(dz23,fscal,fjz3);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* REACTION-FIELD ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq31,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq31,rinv31),crf));
+             felec            = _fjsp_mul_v2r8(qq31,_fjsp_msub_v2r8(rinv31,rinvsq31,krf2));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             /* Update vectorial force */
+             fix3             = _fjsp_madd_v2r8(dx31,fscal,fix3);
+             fiy3             = _fjsp_madd_v2r8(dy31,fscal,fiy3);
+             fiz3             = _fjsp_madd_v2r8(dz31,fscal,fiz3);
+             
+             fjx1             = _fjsp_madd_v2r8(dx31,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy31,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz31,fscal,fjz1);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* REACTION-FIELD ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq32,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq32,rinv32),crf));
+             felec            = _fjsp_mul_v2r8(qq32,_fjsp_msub_v2r8(rinv32,rinvsq32,krf2));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             /* Update vectorial force */
+             fix3             = _fjsp_madd_v2r8(dx32,fscal,fix3);
+             fiy3             = _fjsp_madd_v2r8(dy32,fscal,fiy3);
+             fiz3             = _fjsp_madd_v2r8(dz32,fscal,fiz3);
+             
+             fjx2             = _fjsp_madd_v2r8(dx32,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy32,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz32,fscal,fjz2);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* REACTION-FIELD ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq33,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq33,rinv33),crf));
+             felec            = _fjsp_mul_v2r8(qq33,_fjsp_msub_v2r8(rinv33,rinvsq33,krf2));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             /* Update vectorial force */
+             fix3             = _fjsp_madd_v2r8(dx33,fscal,fix3);
+             fiy3             = _fjsp_madd_v2r8(dy33,fscal,fiy3);
+             fiz3             = _fjsp_madd_v2r8(dz33,fscal,fiz3);
+             
+             fjx3             = _fjsp_madd_v2r8(dx33,fscal,fjx3);
+             fjy3             = _fjsp_madd_v2r8(dy33,fscal,fjy3);
+             fjz3             = _fjsp_madd_v2r8(dz33,fscal,fjz3);
+             gmx_fjsp_decrement_4rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+             /* Inner loop uses 377 flops */
+         }
+         if(jidx<j_index_end)
+         {
+             jnrA             = jjnr[jidx];
+             j_coord_offsetA  = DIM*jnrA;
+             /* load j atom coordinates */
+             gmx_fjsp_load_4rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
+                                               &jy2,&jz2,&jx3,&jy3,&jz3);
+             /* Calculate displacement vector */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             dx11             = _fjsp_sub_v2r8(ix1,jx1);
+             dy11             = _fjsp_sub_v2r8(iy1,jy1);
+             dz11             = _fjsp_sub_v2r8(iz1,jz1);
+             dx12             = _fjsp_sub_v2r8(ix1,jx2);
+             dy12             = _fjsp_sub_v2r8(iy1,jy2);
+             dz12             = _fjsp_sub_v2r8(iz1,jz2);
+             dx13             = _fjsp_sub_v2r8(ix1,jx3);
+             dy13             = _fjsp_sub_v2r8(iy1,jy3);
+             dz13             = _fjsp_sub_v2r8(iz1,jz3);
+             dx21             = _fjsp_sub_v2r8(ix2,jx1);
+             dy21             = _fjsp_sub_v2r8(iy2,jy1);
+             dz21             = _fjsp_sub_v2r8(iz2,jz1);
+             dx22             = _fjsp_sub_v2r8(ix2,jx2);
+             dy22             = _fjsp_sub_v2r8(iy2,jy2);
+             dz22             = _fjsp_sub_v2r8(iz2,jz2);
+             dx23             = _fjsp_sub_v2r8(ix2,jx3);
+             dy23             = _fjsp_sub_v2r8(iy2,jy3);
+             dz23             = _fjsp_sub_v2r8(iz2,jz3);
+             dx31             = _fjsp_sub_v2r8(ix3,jx1);
+             dy31             = _fjsp_sub_v2r8(iy3,jy1);
+             dz31             = _fjsp_sub_v2r8(iz3,jz1);
+             dx32             = _fjsp_sub_v2r8(ix3,jx2);
+             dy32             = _fjsp_sub_v2r8(iy3,jy2);
+             dz32             = _fjsp_sub_v2r8(iz3,jz2);
+             dx33             = _fjsp_sub_v2r8(ix3,jx3);
+             dy33             = _fjsp_sub_v2r8(iy3,jy3);
+             dz33             = _fjsp_sub_v2r8(iz3,jz3);
+             /* Calculate squared distance and things based on it */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+             rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+             rsq13            = gmx_fjsp_calc_rsq_v2r8(dx13,dy13,dz13);
+             rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+             rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+             rsq23            = gmx_fjsp_calc_rsq_v2r8(dx23,dy23,dz23);
+             rsq31            = gmx_fjsp_calc_rsq_v2r8(dx31,dy31,dz31);
+             rsq32            = gmx_fjsp_calc_rsq_v2r8(dx32,dy32,dz32);
+             rsq33            = gmx_fjsp_calc_rsq_v2r8(dx33,dy33,dz33);
+             rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+             rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+             rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+             rinv13           = gmx_fjsp_invsqrt_v2r8(rsq13);
+             rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+             rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+             rinv23           = gmx_fjsp_invsqrt_v2r8(rsq23);
+             rinv31           = gmx_fjsp_invsqrt_v2r8(rsq31);
+             rinv32           = gmx_fjsp_invsqrt_v2r8(rsq32);
+             rinv33           = gmx_fjsp_invsqrt_v2r8(rsq33);
+             rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+             rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+             rinvsq13         = _fjsp_mul_v2r8(rinv13,rinv13);
+             rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+             rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+             rinvsq23         = _fjsp_mul_v2r8(rinv23,rinv23);
+             rinvsq31         = _fjsp_mul_v2r8(rinv31,rinv31);
+             rinvsq32         = _fjsp_mul_v2r8(rinv32,rinv32);
+             rinvsq33         = _fjsp_mul_v2r8(rinv33,rinv33);
+             fjx0             = _fjsp_setzero_v2r8();
+             fjy0             = _fjsp_setzero_v2r8();
+             fjz0             = _fjsp_setzero_v2r8();
+             fjx1             = _fjsp_setzero_v2r8();
+             fjy1             = _fjsp_setzero_v2r8();
+             fjz1             = _fjsp_setzero_v2r8();
+             fjx2             = _fjsp_setzero_v2r8();
+             fjy2             = _fjsp_setzero_v2r8();
+             fjz2             = _fjsp_setzero_v2r8();
+             fjx3             = _fjsp_setzero_v2r8();
+             fjy3             = _fjsp_setzero_v2r8();
+             fjz3             = _fjsp_setzero_v2r8();
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+             /* Calculate table index by multiplying r with table scale and truncate to integer */
+             rt               = _fjsp_mul_v2r8(r00,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 8;
+             vfconv.i[1]     *= 8;
+             /* CUBIC SPLINE TABLE DISPERSION */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 2 );
+             H                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+             VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+             vvdw6            = _fjsp_mul_v2r8(c6_00,VV);
+             FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+             fvdw6            = _fjsp_mul_v2r8(c6_00,FF);
+             /* CUBIC SPLINE TABLE REPULSION */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 4 );
+             F                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 6 );
+             H                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+             VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+             vvdw12           = _fjsp_mul_v2r8(c12_00,VV);
+             FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+             fvdw12           = _fjsp_mul_v2r8(c12_00,FF);
+             vvdw             = _fjsp_add_v2r8(vvdw12,vvdw6);
+             fvdw             = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_add_v2r8(fvdw6,fvdw12),_fjsp_mul_v2r8(vftabscale,rinv00)));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             vvdw             = _fjsp_unpacklo_v2r8(vvdw,_fjsp_setzero_v2r8());
+             vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+             fscal            = fvdw;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             
+             fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* REACTION-FIELD ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq11,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq11,rinv11),crf));
+             felec            = _fjsp_mul_v2r8(qq11,_fjsp_msub_v2r8(rinv11,rinvsq11,krf2));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+             
+             fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* REACTION-FIELD ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq12,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq12,rinv12),crf));
+             felec            = _fjsp_mul_v2r8(qq12,_fjsp_msub_v2r8(rinv12,rinvsq12,krf2));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+             
+             fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* REACTION-FIELD ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq13,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq13,rinv13),crf));
+             felec            = _fjsp_mul_v2r8(qq13,_fjsp_msub_v2r8(rinv13,rinvsq13,krf2));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx13,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy13,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz13,fscal,fiz1);
+             
+             fjx3             = _fjsp_madd_v2r8(dx13,fscal,fjx3);
+             fjy3             = _fjsp_madd_v2r8(dy13,fscal,fjy3);
+             fjz3             = _fjsp_madd_v2r8(dz13,fscal,fjz3);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* REACTION-FIELD ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq21,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq21,rinv21),crf));
+             felec            = _fjsp_mul_v2r8(qq21,_fjsp_msub_v2r8(rinv21,rinvsq21,krf2));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+             
+             fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* REACTION-FIELD ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq22,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq22,rinv22),crf));
+             felec            = _fjsp_mul_v2r8(qq22,_fjsp_msub_v2r8(rinv22,rinvsq22,krf2));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+             
+             fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* REACTION-FIELD ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq23,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq23,rinv23),crf));
+             felec            = _fjsp_mul_v2r8(qq23,_fjsp_msub_v2r8(rinv23,rinvsq23,krf2));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx23,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy23,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz23,fscal,fiz2);
+             
+             fjx3             = _fjsp_madd_v2r8(dx23,fscal,fjx3);
+             fjy3             = _fjsp_madd_v2r8(dy23,fscal,fjy3);
+             fjz3             = _fjsp_madd_v2r8(dz23,fscal,fjz3);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* REACTION-FIELD ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq31,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq31,rinv31),crf));
+             felec            = _fjsp_mul_v2r8(qq31,_fjsp_msub_v2r8(rinv31,rinvsq31,krf2));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix3             = _fjsp_madd_v2r8(dx31,fscal,fix3);
+             fiy3             = _fjsp_madd_v2r8(dy31,fscal,fiy3);
+             fiz3             = _fjsp_madd_v2r8(dz31,fscal,fiz3);
+             
+             fjx1             = _fjsp_madd_v2r8(dx31,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy31,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz31,fscal,fjz1);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* REACTION-FIELD ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq32,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq32,rinv32),crf));
+             felec            = _fjsp_mul_v2r8(qq32,_fjsp_msub_v2r8(rinv32,rinvsq32,krf2));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix3             = _fjsp_madd_v2r8(dx32,fscal,fix3);
+             fiy3             = _fjsp_madd_v2r8(dy32,fscal,fiy3);
+             fiz3             = _fjsp_madd_v2r8(dz32,fscal,fiz3);
+             
+             fjx2             = _fjsp_madd_v2r8(dx32,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy32,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz32,fscal,fjz2);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* REACTION-FIELD ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq33,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq33,rinv33),crf));
+             felec            = _fjsp_mul_v2r8(qq33,_fjsp_msub_v2r8(rinv33,rinvsq33,krf2));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix3             = _fjsp_madd_v2r8(dx33,fscal,fix3);
+             fiy3             = _fjsp_madd_v2r8(dy33,fscal,fiy3);
+             fiz3             = _fjsp_madd_v2r8(dz33,fscal,fiz3);
+             
+             fjx3             = _fjsp_madd_v2r8(dx33,fscal,fjx3);
+             fjy3             = _fjsp_madd_v2r8(dy33,fscal,fjy3);
+             fjz3             = _fjsp_madd_v2r8(dz33,fscal,fjz3);
+             gmx_fjsp_decrement_4rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+             /* Inner loop uses 377 flops */
+         }
+         /* End of innermost loop */
+         gmx_fjsp_update_iforce_4atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                               f+i_coord_offset,fshift+i_shift_offset);
+         ggid                        = gid[iidx];
+         /* Update potential energies */
+         gmx_fjsp_update_1pot_v2r8(velecsum,kernel_data->energygrp_elec+ggid);
+         gmx_fjsp_update_1pot_v2r8(vvdwsum,kernel_data->energygrp_vdw+ggid);
+         /* Increment number of inner iterations */
+         inneriter                  += j_index_end - j_index_start;
+         /* Outer loop uses 26 flops */
+     }
+     /* Increment number of outer iterations */
+     outeriter        += nri;
+     /* Update outer/inner flops */
+     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_VF,outeriter*26 + inneriter*377);
+ }
+ /*
+  * Gromacs nonbonded kernel:   nb_kernel_ElecRF_VdwCSTab_GeomW4W4_F_sparc64_hpc_ace_double
+  * Electrostatics interaction: ReactionField
+  * VdW interaction:            CubicSplineTable
+  * Geometry:                   Water4-Water4
+  * Calculate force/pot:        Force
+  */
+ void
+ nb_kernel_ElecRF_VdwCSTab_GeomW4W4_F_sparc64_hpc_ace_double
+                     (t_nblist * gmx_restrict                nlist,
+                      rvec * gmx_restrict                    xx,
+                      rvec * gmx_restrict                    ff,
+                      t_forcerec * gmx_restrict              fr,
+                      t_mdatoms * gmx_restrict               mdatoms,
+                      nb_kernel_data_t * gmx_restrict        kernel_data,
+                      t_nrnb * gmx_restrict                  nrnb)
+ {
+     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+      * just 0 for non-waters.
+      * Suffixes A,B refer to j loop unrolling done with double precision SIMD, i.e. the two different
+      * jnr indices corresponding to data put in the two positions in the SIMD register.
+      */
+     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+     int              jnrA,jnrB;
+     int              j_coord_offsetA,j_coord_offsetB;
+     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+     real             rcutoff_scalar;
+     real             *shiftvec,*fshift,*x,*f;
+     _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+     int              vdwioffset0;
+     _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+     int              vdwioffset1;
+     _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+     int              vdwioffset2;
+     _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+     int              vdwioffset3;
+     _fjsp_v2r8       ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+     int              vdwjidx0A,vdwjidx0B;
+     _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+     int              vdwjidx1A,vdwjidx1B;
+     _fjsp_v2r8       jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+     int              vdwjidx2A,vdwjidx2B;
+     _fjsp_v2r8       jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+     int              vdwjidx3A,vdwjidx3B;
+     _fjsp_v2r8       jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
+     _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+     _fjsp_v2r8       dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+     _fjsp_v2r8       dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+     _fjsp_v2r8       dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
+     _fjsp_v2r8       dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+     _fjsp_v2r8       dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+     _fjsp_v2r8       dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
+     _fjsp_v2r8       dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
+     _fjsp_v2r8       dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
+     _fjsp_v2r8       dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
+     _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+     real             *charge;
+     int              nvdwtype;
+     _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+     int              *vdwtype;
+     real             *vdwparam;
+     _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+     _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+     _fjsp_v2r8       rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF,twovfeps;
+     real             *vftab;
+     _fjsp_v2r8       itab_tmp;
+     _fjsp_v2r8       dummy_mask,cutoff_mask;
+     _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+     _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+     union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+     x                = xx[0];
+     f                = ff[0];
+     nri              = nlist->nri;
+     iinr             = nlist->iinr;
+     jindex           = nlist->jindex;
+     jjnr             = nlist->jjnr;
+     shiftidx         = nlist->shift;
+     gid              = nlist->gid;
+     shiftvec         = fr->shift_vec[0];
+     fshift           = fr->fshift[0];
+     facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+     charge           = mdatoms->chargeA;
+     krf              = gmx_fjsp_set1_v2r8(fr->ic->k_rf);
+     krf2             = gmx_fjsp_set1_v2r8(fr->ic->k_rf*2.0);
+     crf              = gmx_fjsp_set1_v2r8(fr->ic->c_rf);
+     nvdwtype         = fr->ntype;
+     vdwparam         = fr->nbfp;
+     vdwtype          = mdatoms->typeA;
+     vftab            = kernel_data->table_vdw->data;
+     vftabscale       = gmx_fjsp_set1_v2r8(kernel_data->table_vdw->scale);
+     /* Setup water-specific parameters */
+     inr              = nlist->iinr[0];
+     iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+     iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+     iq3              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+3]));
+     vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+     jq1              = gmx_fjsp_set1_v2r8(charge[inr+1]);
+     jq2              = gmx_fjsp_set1_v2r8(charge[inr+2]);
+     jq3              = gmx_fjsp_set1_v2r8(charge[inr+3]);
+     vdwjidx0A        = 2*vdwtype[inr+0];
+     c6_00            = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset0+vdwjidx0A]);
+     c12_00           = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset0+vdwjidx0A+1]);
+     qq11             = _fjsp_mul_v2r8(iq1,jq1);
+     qq12             = _fjsp_mul_v2r8(iq1,jq2);
+     qq13             = _fjsp_mul_v2r8(iq1,jq3);
+     qq21             = _fjsp_mul_v2r8(iq2,jq1);
+     qq22             = _fjsp_mul_v2r8(iq2,jq2);
+     qq23             = _fjsp_mul_v2r8(iq2,jq3);
+     qq31             = _fjsp_mul_v2r8(iq3,jq1);
+     qq32             = _fjsp_mul_v2r8(iq3,jq2);
+     qq33             = _fjsp_mul_v2r8(iq3,jq3);
+     /* Avoid stupid compiler warnings */
+     jnrA = jnrB = 0;
+     j_coord_offsetA = 0;
+     j_coord_offsetB = 0;
+     outeriter        = 0;
+     inneriter        = 0;
+     /* Start outer loop over neighborlists */
+     for(iidx=0; iidx<nri; iidx++)
+     {
+         /* Load shift vector for this list */
+         i_shift_offset   = DIM*shiftidx[iidx];
+         /* Load limits for loop over neighbors */
+         j_index_start    = jindex[iidx];
+         j_index_end      = jindex[iidx+1];
+         /* Get outer coordinate index */
+         inr              = iinr[iidx];
+         i_coord_offset   = DIM*inr;
+         /* Load i particle coords and add shift vector */
+         gmx_fjsp_load_shift_and_4rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                  &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+         fix0             = _fjsp_setzero_v2r8();
+         fiy0             = _fjsp_setzero_v2r8();
+         fiz0             = _fjsp_setzero_v2r8();
+         fix1             = _fjsp_setzero_v2r8();
+         fiy1             = _fjsp_setzero_v2r8();
+         fiz1             = _fjsp_setzero_v2r8();
+         fix2             = _fjsp_setzero_v2r8();
+         fiy2             = _fjsp_setzero_v2r8();
+         fiz2             = _fjsp_setzero_v2r8();
+         fix3             = _fjsp_setzero_v2r8();
+         fiy3             = _fjsp_setzero_v2r8();
+         fiz3             = _fjsp_setzero_v2r8();
+         /* Start inner kernel loop */
+         for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+         {
+             /* Get j neighbor index, and coordinate index */
+             jnrA             = jjnr[jidx];
+             jnrB             = jjnr[jidx+1];
+             j_coord_offsetA  = DIM*jnrA;
+             j_coord_offsetB  = DIM*jnrB;
+             /* load j atom coordinates */
+             gmx_fjsp_load_4rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
+                                               &jy2,&jz2,&jx3,&jy3,&jz3);
+             /* Calculate displacement vector */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             dx11             = _fjsp_sub_v2r8(ix1,jx1);
+             dy11             = _fjsp_sub_v2r8(iy1,jy1);
+             dz11             = _fjsp_sub_v2r8(iz1,jz1);
+             dx12             = _fjsp_sub_v2r8(ix1,jx2);
+             dy12             = _fjsp_sub_v2r8(iy1,jy2);
+             dz12             = _fjsp_sub_v2r8(iz1,jz2);
+             dx13             = _fjsp_sub_v2r8(ix1,jx3);
+             dy13             = _fjsp_sub_v2r8(iy1,jy3);
+             dz13             = _fjsp_sub_v2r8(iz1,jz3);
+             dx21             = _fjsp_sub_v2r8(ix2,jx1);
+             dy21             = _fjsp_sub_v2r8(iy2,jy1);
+             dz21             = _fjsp_sub_v2r8(iz2,jz1);
+             dx22             = _fjsp_sub_v2r8(ix2,jx2);
+             dy22             = _fjsp_sub_v2r8(iy2,jy2);
+             dz22             = _fjsp_sub_v2r8(iz2,jz2);
+             dx23             = _fjsp_sub_v2r8(ix2,jx3);
+             dy23             = _fjsp_sub_v2r8(iy2,jy3);
+             dz23             = _fjsp_sub_v2r8(iz2,jz3);
+             dx31             = _fjsp_sub_v2r8(ix3,jx1);
+             dy31             = _fjsp_sub_v2r8(iy3,jy1);
+             dz31             = _fjsp_sub_v2r8(iz3,jz1);
+             dx32             = _fjsp_sub_v2r8(ix3,jx2);
+             dy32             = _fjsp_sub_v2r8(iy3,jy2);
+             dz32             = _fjsp_sub_v2r8(iz3,jz2);
+             dx33             = _fjsp_sub_v2r8(ix3,jx3);
+             dy33             = _fjsp_sub_v2r8(iy3,jy3);
+             dz33             = _fjsp_sub_v2r8(iz3,jz3);
+             /* Calculate squared distance and things based on it */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+             rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+             rsq13            = gmx_fjsp_calc_rsq_v2r8(dx13,dy13,dz13);
+             rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+             rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+             rsq23            = gmx_fjsp_calc_rsq_v2r8(dx23,dy23,dz23);
+             rsq31            = gmx_fjsp_calc_rsq_v2r8(dx31,dy31,dz31);
+             rsq32            = gmx_fjsp_calc_rsq_v2r8(dx32,dy32,dz32);
+             rsq33            = gmx_fjsp_calc_rsq_v2r8(dx33,dy33,dz33);
+             rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+             rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+             rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+             rinv13           = gmx_fjsp_invsqrt_v2r8(rsq13);
+             rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+             rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+             rinv23           = gmx_fjsp_invsqrt_v2r8(rsq23);
+             rinv31           = gmx_fjsp_invsqrt_v2r8(rsq31);
+             rinv32           = gmx_fjsp_invsqrt_v2r8(rsq32);
+             rinv33           = gmx_fjsp_invsqrt_v2r8(rsq33);
+             rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+             rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+             rinvsq13         = _fjsp_mul_v2r8(rinv13,rinv13);
+             rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+             rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+             rinvsq23         = _fjsp_mul_v2r8(rinv23,rinv23);
+             rinvsq31         = _fjsp_mul_v2r8(rinv31,rinv31);
+             rinvsq32         = _fjsp_mul_v2r8(rinv32,rinv32);
+             rinvsq33         = _fjsp_mul_v2r8(rinv33,rinv33);
+             fjx0             = _fjsp_setzero_v2r8();
+             fjy0             = _fjsp_setzero_v2r8();
+             fjz0             = _fjsp_setzero_v2r8();
+             fjx1             = _fjsp_setzero_v2r8();
+             fjy1             = _fjsp_setzero_v2r8();
+             fjz1             = _fjsp_setzero_v2r8();
+             fjx2             = _fjsp_setzero_v2r8();
+             fjy2             = _fjsp_setzero_v2r8();
+             fjz2             = _fjsp_setzero_v2r8();
+             fjx3             = _fjsp_setzero_v2r8();
+             fjy3             = _fjsp_setzero_v2r8();
+             fjz3             = _fjsp_setzero_v2r8();
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+             /* Calculate table index by multiplying r with table scale and truncate to integer */
+             rt               = _fjsp_mul_v2r8(r00,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 8;
+             vfconv.i[1]     *= 8;
+             /* CUBIC SPLINE TABLE DISPERSION */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 2 );
+             H                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 2 );
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+             FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+             fvdw6            = _fjsp_mul_v2r8(c6_00,FF);
+             /* CUBIC SPLINE TABLE REPULSION */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 4 );
+             F                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 4 );
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 6 );
+             H                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 6 );
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+             FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+             fvdw12           = _fjsp_mul_v2r8(c12_00,FF);
+             fvdw             = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_add_v2r8(fvdw6,fvdw12),_fjsp_mul_v2r8(vftabscale,rinv00)));
+             fscal            = fvdw;
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             
+             fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* REACTION-FIELD ELECTROSTATICS */
+             felec            = _fjsp_mul_v2r8(qq11,_fjsp_msub_v2r8(rinv11,rinvsq11,krf2));
+             fscal            = felec;
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+             
+             fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* REACTION-FIELD ELECTROSTATICS */
+             felec            = _fjsp_mul_v2r8(qq12,_fjsp_msub_v2r8(rinv12,rinvsq12,krf2));
+             fscal            = felec;
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+             
+             fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* REACTION-FIELD ELECTROSTATICS */
+             felec            = _fjsp_mul_v2r8(qq13,_fjsp_msub_v2r8(rinv13,rinvsq13,krf2));
+             fscal            = felec;
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx13,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy13,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz13,fscal,fiz1);
+             
+             fjx3             = _fjsp_madd_v2r8(dx13,fscal,fjx3);
+             fjy3             = _fjsp_madd_v2r8(dy13,fscal,fjy3);
+             fjz3             = _fjsp_madd_v2r8(dz13,fscal,fjz3);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* REACTION-FIELD ELECTROSTATICS */
+             felec            = _fjsp_mul_v2r8(qq21,_fjsp_msub_v2r8(rinv21,rinvsq21,krf2));
+             fscal            = felec;
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+             
+             fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* REACTION-FIELD ELECTROSTATICS */
+             felec            = _fjsp_mul_v2r8(qq22,_fjsp_msub_v2r8(rinv22,rinvsq22,krf2));
+             fscal            = felec;
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+             
+             fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* REACTION-FIELD ELECTROSTATICS */
+             felec            = _fjsp_mul_v2r8(qq23,_fjsp_msub_v2r8(rinv23,rinvsq23,krf2));
+             fscal            = felec;
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx23,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy23,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz23,fscal,fiz2);
+             
+             fjx3             = _fjsp_madd_v2r8(dx23,fscal,fjx3);
+             fjy3             = _fjsp_madd_v2r8(dy23,fscal,fjy3);
+             fjz3             = _fjsp_madd_v2r8(dz23,fscal,fjz3);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* REACTION-FIELD ELECTROSTATICS */
+             felec            = _fjsp_mul_v2r8(qq31,_fjsp_msub_v2r8(rinv31,rinvsq31,krf2));
+             fscal            = felec;
+             /* Update vectorial force */
+             fix3             = _fjsp_madd_v2r8(dx31,fscal,fix3);
+             fiy3             = _fjsp_madd_v2r8(dy31,fscal,fiy3);
+             fiz3             = _fjsp_madd_v2r8(dz31,fscal,fiz3);
+             
+             fjx1             = _fjsp_madd_v2r8(dx31,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy31,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz31,fscal,fjz1);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* REACTION-FIELD ELECTROSTATICS */
+             felec            = _fjsp_mul_v2r8(qq32,_fjsp_msub_v2r8(rinv32,rinvsq32,krf2));
+             fscal            = felec;
+             /* Update vectorial force */
+             fix3             = _fjsp_madd_v2r8(dx32,fscal,fix3);
+             fiy3             = _fjsp_madd_v2r8(dy32,fscal,fiy3);
+             fiz3             = _fjsp_madd_v2r8(dz32,fscal,fiz3);
+             
+             fjx2             = _fjsp_madd_v2r8(dx32,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy32,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz32,fscal,fjz2);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* REACTION-FIELD ELECTROSTATICS */
+             felec            = _fjsp_mul_v2r8(qq33,_fjsp_msub_v2r8(rinv33,rinvsq33,krf2));
+             fscal            = felec;
+             /* Update vectorial force */
+             fix3             = _fjsp_madd_v2r8(dx33,fscal,fix3);
+             fiy3             = _fjsp_madd_v2r8(dy33,fscal,fiy3);
+             fiz3             = _fjsp_madd_v2r8(dz33,fscal,fiz3);
+             
+             fjx3             = _fjsp_madd_v2r8(dx33,fscal,fjx3);
+             fjy3             = _fjsp_madd_v2r8(dy33,fscal,fjy3);
+             fjz3             = _fjsp_madd_v2r8(dz33,fscal,fjz3);
+             gmx_fjsp_decrement_4rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+             /* Inner loop uses 324 flops */
+         }
+         if(jidx<j_index_end)
+         {
+             jnrA             = jjnr[jidx];
+             j_coord_offsetA  = DIM*jnrA;
+             /* load j atom coordinates */
+             gmx_fjsp_load_4rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
+                                               &jy2,&jz2,&jx3,&jy3,&jz3);
+             /* Calculate displacement vector */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             dx11             = _fjsp_sub_v2r8(ix1,jx1);
+             dy11             = _fjsp_sub_v2r8(iy1,jy1);
+             dz11             = _fjsp_sub_v2r8(iz1,jz1);
+             dx12             = _fjsp_sub_v2r8(ix1,jx2);
+             dy12             = _fjsp_sub_v2r8(iy1,jy2);
+             dz12             = _fjsp_sub_v2r8(iz1,jz2);
+             dx13             = _fjsp_sub_v2r8(ix1,jx3);
+             dy13             = _fjsp_sub_v2r8(iy1,jy3);
+             dz13             = _fjsp_sub_v2r8(iz1,jz3);
+             dx21             = _fjsp_sub_v2r8(ix2,jx1);
+             dy21             = _fjsp_sub_v2r8(iy2,jy1);
+             dz21             = _fjsp_sub_v2r8(iz2,jz1);
+             dx22             = _fjsp_sub_v2r8(ix2,jx2);
+             dy22             = _fjsp_sub_v2r8(iy2,jy2);
+             dz22             = _fjsp_sub_v2r8(iz2,jz2);
+             dx23             = _fjsp_sub_v2r8(ix2,jx3);
+             dy23             = _fjsp_sub_v2r8(iy2,jy3);
+             dz23             = _fjsp_sub_v2r8(iz2,jz3);
+             dx31             = _fjsp_sub_v2r8(ix3,jx1);
+             dy31             = _fjsp_sub_v2r8(iy3,jy1);
+             dz31             = _fjsp_sub_v2r8(iz3,jz1);
+             dx32             = _fjsp_sub_v2r8(ix3,jx2);
+             dy32             = _fjsp_sub_v2r8(iy3,jy2);
+             dz32             = _fjsp_sub_v2r8(iz3,jz2);
+             dx33             = _fjsp_sub_v2r8(ix3,jx3);
+             dy33             = _fjsp_sub_v2r8(iy3,jy3);
+             dz33             = _fjsp_sub_v2r8(iz3,jz3);
+             /* Calculate squared distance and things based on it */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+             rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+             rsq13            = gmx_fjsp_calc_rsq_v2r8(dx13,dy13,dz13);
+             rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+             rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+             rsq23            = gmx_fjsp_calc_rsq_v2r8(dx23,dy23,dz23);
+             rsq31            = gmx_fjsp_calc_rsq_v2r8(dx31,dy31,dz31);
+             rsq32            = gmx_fjsp_calc_rsq_v2r8(dx32,dy32,dz32);
+             rsq33            = gmx_fjsp_calc_rsq_v2r8(dx33,dy33,dz33);
+             rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+             rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+             rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+             rinv13           = gmx_fjsp_invsqrt_v2r8(rsq13);
+             rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+             rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+             rinv23           = gmx_fjsp_invsqrt_v2r8(rsq23);
+             rinv31           = gmx_fjsp_invsqrt_v2r8(rsq31);
+             rinv32           = gmx_fjsp_invsqrt_v2r8(rsq32);
+             rinv33           = gmx_fjsp_invsqrt_v2r8(rsq33);
+             rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+             rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+             rinvsq13         = _fjsp_mul_v2r8(rinv13,rinv13);
+             rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+             rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+             rinvsq23         = _fjsp_mul_v2r8(rinv23,rinv23);
+             rinvsq31         = _fjsp_mul_v2r8(rinv31,rinv31);
+             rinvsq32         = _fjsp_mul_v2r8(rinv32,rinv32);
+             rinvsq33         = _fjsp_mul_v2r8(rinv33,rinv33);
+             fjx0             = _fjsp_setzero_v2r8();
+             fjy0             = _fjsp_setzero_v2r8();
+             fjz0             = _fjsp_setzero_v2r8();
+             fjx1             = _fjsp_setzero_v2r8();
+             fjy1             = _fjsp_setzero_v2r8();
+             fjz1             = _fjsp_setzero_v2r8();
+             fjx2             = _fjsp_setzero_v2r8();
+             fjy2             = _fjsp_setzero_v2r8();
+             fjz2             = _fjsp_setzero_v2r8();
+             fjx3             = _fjsp_setzero_v2r8();
+             fjy3             = _fjsp_setzero_v2r8();
+             fjz3             = _fjsp_setzero_v2r8();
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+             /* Calculate table index by multiplying r with table scale and truncate to integer */
+             rt               = _fjsp_mul_v2r8(r00,vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             vfconv.i[0]     *= 8;
+             vfconv.i[1]     *= 8;
+             /* CUBIC SPLINE TABLE DISPERSION */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             F                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 2 );
+             H                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+             FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+             fvdw6            = _fjsp_mul_v2r8(c6_00,FF);
+             /* CUBIC SPLINE TABLE REPULSION */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 4 );
+             F                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 6 );
+             H                = _fjsp_setzero_v2r8();
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+             FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+             fvdw12           = _fjsp_mul_v2r8(c12_00,FF);
+             fvdw             = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_add_v2r8(fvdw6,fvdw12),_fjsp_mul_v2r8(vftabscale,rinv00)));
+             fscal            = fvdw;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             
+             fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* REACTION-FIELD ELECTROSTATICS */
+             felec            = _fjsp_mul_v2r8(qq11,_fjsp_msub_v2r8(rinv11,rinvsq11,krf2));
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+             
+             fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* REACTION-FIELD ELECTROSTATICS */
+             felec            = _fjsp_mul_v2r8(qq12,_fjsp_msub_v2r8(rinv12,rinvsq12,krf2));
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+             
+             fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* REACTION-FIELD ELECTROSTATICS */
+             felec            = _fjsp_mul_v2r8(qq13,_fjsp_msub_v2r8(rinv13,rinvsq13,krf2));
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx13,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy13,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz13,fscal,fiz1);
+             
+             fjx3             = _fjsp_madd_v2r8(dx13,fscal,fjx3);
+             fjy3             = _fjsp_madd_v2r8(dy13,fscal,fjy3);
+             fjz3             = _fjsp_madd_v2r8(dz13,fscal,fjz3);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* REACTION-FIELD ELECTROSTATICS */
+             felec            = _fjsp_mul_v2r8(qq21,_fjsp_msub_v2r8(rinv21,rinvsq21,krf2));
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+             
+             fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* REACTION-FIELD ELECTROSTATICS */
+             felec            = _fjsp_mul_v2r8(qq22,_fjsp_msub_v2r8(rinv22,rinvsq22,krf2));
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+             
+             fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* REACTION-FIELD ELECTROSTATICS */
+             felec            = _fjsp_mul_v2r8(qq23,_fjsp_msub_v2r8(rinv23,rinvsq23,krf2));
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx23,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy23,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz23,fscal,fiz2);
+             
+             fjx3             = _fjsp_madd_v2r8(dx23,fscal,fjx3);
+             fjy3             = _fjsp_madd_v2r8(dy23,fscal,fjy3);
+             fjz3             = _fjsp_madd_v2r8(dz23,fscal,fjz3);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* REACTION-FIELD ELECTROSTATICS */
+             felec            = _fjsp_mul_v2r8(qq31,_fjsp_msub_v2r8(rinv31,rinvsq31,krf2));
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix3             = _fjsp_madd_v2r8(dx31,fscal,fix3);
+             fiy3             = _fjsp_madd_v2r8(dy31,fscal,fiy3);
+             fiz3             = _fjsp_madd_v2r8(dz31,fscal,fiz3);
+             
+             fjx1             = _fjsp_madd_v2r8(dx31,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy31,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz31,fscal,fjz1);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* REACTION-FIELD ELECTROSTATICS */
+             felec            = _fjsp_mul_v2r8(qq32,_fjsp_msub_v2r8(rinv32,rinvsq32,krf2));
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix3             = _fjsp_madd_v2r8(dx32,fscal,fix3);
+             fiy3             = _fjsp_madd_v2r8(dy32,fscal,fiy3);
+             fiz3             = _fjsp_madd_v2r8(dz32,fscal,fiz3);
+             
+             fjx2             = _fjsp_madd_v2r8(dx32,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy32,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz32,fscal,fjz2);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* REACTION-FIELD ELECTROSTATICS */
+             felec            = _fjsp_mul_v2r8(qq33,_fjsp_msub_v2r8(rinv33,rinvsq33,krf2));
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix3             = _fjsp_madd_v2r8(dx33,fscal,fix3);
+             fiy3             = _fjsp_madd_v2r8(dy33,fscal,fiy3);
+             fiz3             = _fjsp_madd_v2r8(dz33,fscal,fiz3);
+             
+             fjx3             = _fjsp_madd_v2r8(dx33,fscal,fjx3);
+             fjy3             = _fjsp_madd_v2r8(dy33,fscal,fjy3);
+             fjz3             = _fjsp_madd_v2r8(dz33,fscal,fjz3);
+             gmx_fjsp_decrement_4rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+             /* Inner loop uses 324 flops */
+         }
+         /* End of innermost loop */
+         gmx_fjsp_update_iforce_4atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                               f+i_coord_offset,fshift+i_shift_offset);
+         /* Increment number of inner iterations */
+         inneriter                  += j_index_end - j_index_start;
+         /* Outer loop uses 24 flops */
+     }
+     /* Increment number of outer iterations */
+     outeriter        += nri;
+     /* Update outer/inner flops */
+     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_F,outeriter*24 + inneriter*324);
+ }
index 0000000000000000000000000000000000000000,dc847538941f00c962b6ddafcc15e99bd294abcc..dc847538941f00c962b6ddafcc15e99bd294abcc
mode 000000,100644..100644
--- /dev/null
@@@ -1,0 -1,549 +1,549 @@@
+ /*
+  * This file is part of the GROMACS molecular simulation package.
+  *
+  * Copyright (c) 2012, by the GROMACS development team, led by
+  * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+  * others, as listed in the AUTHORS file in the top-level source
+  * directory and at http://www.gromacs.org.
+  *
+  * GROMACS is free software; you can redistribute it and/or
+  * modify it under the terms of the GNU Lesser General Public License
+  * as published by the Free Software Foundation; either version 2.1
+  * of the License, or (at your option) any later version.
+  *
+  * GROMACS is distributed in the hope that it will be useful,
+  * but WITHOUT ANY WARRANTY; without even the implied warranty of
+  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  * Lesser General Public License for more details.
+  *
+  * You should have received a copy of the GNU Lesser General Public
+  * License along with GROMACS; if not, see
+  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+  *
+  * If you want to redistribute modifications to GROMACS, please
+  * consider that scientific software is very special. Version
+  * control is crucial - bugs must be traceable. We will be happy to
+  * consider code for inclusion in the official distribution, but
+  * derived work must not be called official GROMACS. Details are found
+  * in the README & COPYING files - if they are missing, get the
+  * official version at http://www.gromacs.org.
+  *
+  * To help us fund GROMACS development, we humbly ask that you cite
+  * the research papers on the package. Check out http://www.gromacs.org.
+  */
+ /*
+  * Note: this file was generated by the GROMACS sparc64_hpc_ace_double kernel generator.
+  */
+ #ifdef HAVE_CONFIG_H
+ #include <config.h>
+ #endif
+ #include <math.h>
+ #include "../nb_kernel.h"
+ #include "types/simple.h"
+ #include "vec.h"
+ #include "nrnb.h"
+ #include "kernelutil_sparc64_hpc_ace_double.h"
+ /*
+  * Gromacs nonbonded kernel:   nb_kernel_ElecRF_VdwLJ_GeomP1P1_VF_sparc64_hpc_ace_double
+  * Electrostatics interaction: ReactionField
+  * VdW interaction:            LennardJones
+  * Geometry:                   Particle-Particle
+  * Calculate force/pot:        PotentialAndForce
+  *
+  * For every entry of the neighbor list this kernel loops over the j particles of
+  * one i particle, evaluates reaction-field electrostatics and Lennard-Jones for
+  * each i-j pair, accumulates the i force and the electrostatic/VdW potential sums,
+  * and passes the per-pair force to the j-force helper. j particles are processed
+  * two per iteration (A and B); a single leftover j particle is handled separately.
+  *
+  * Arguments, as used by the code below:
+  *   nlist       - neighbor list; nri, iinr, jindex, jjnr, shift and gid are read
+  *   xx          - particle coordinates, read through xx[0]
+  *   ff          - particle forces, updated through ff[0]
+  *   fr          - shift_vec, fshift, epsfac, ic->k_rf, ic->c_rf, ntype and nbfp are used
+  *   mdatoms     - chargeA and typeA are read
+  *   kernel_data - energygrp_elec and energygrp_vdw receive the potential sums
+  *   nrnb        - flop accounting, incremented once at the end via inc_nrnb()
+  */
+ void
+ nb_kernel_ElecRF_VdwLJ_GeomP1P1_VF_sparc64_hpc_ace_double
+                     (t_nblist * gmx_restrict                nlist,
+                      rvec * gmx_restrict                    xx,
+                      rvec * gmx_restrict                    ff,
+                      t_forcerec * gmx_restrict              fr,
+                      t_mdatoms * gmx_restrict               mdatoms,
+                      nb_kernel_data_t * gmx_restrict        kernel_data,
+                      t_nrnb * gmx_restrict                  nrnb)
+ {
+     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+      * just 0 for non-waters.
+      * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+      * jnr indices corresponding to data put in the two positions in the SIMD register.
+      *
+      * NOTE(review): a number of the locals declared below (for example rcutoff_scalar,
+      * tx/ty/tz, rcutoff, dummy_mask, cutoff_mask, one, two, vfconv/gbconv/ewconv) are
+      * never referenced in this kernel; presumably the generator emits one common
+      * declaration set for all kernel flavours.
+      */
+     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+     int              jnrA,jnrB;
+     int              j_coord_offsetA,j_coord_offsetB;
+     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+     real             rcutoff_scalar;
+     real             *shiftvec,*fshift,*x,*f;
+     _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+     int              vdwioffset0;
+     _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+     int              vdwjidx0A,vdwjidx0B;
+     _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+     _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+     _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+     real             *charge;
+     int              nvdwtype;
+     _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+     int              *vdwtype;
+     real             *vdwparam;
+     _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+     _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+     _fjsp_v2r8       itab_tmp;
+     _fjsp_v2r8       dummy_mask,cutoff_mask;
+     _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+     _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+     union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+     x                = xx[0];
+     f                = ff[0];
+     nri              = nlist->nri;
+     iinr             = nlist->iinr;
+     jindex           = nlist->jindex;
+     jjnr             = nlist->jjnr;
+     shiftidx         = nlist->shift;
+     gid              = nlist->gid;
+     shiftvec         = fr->shift_vec[0];
+     fshift           = fr->fshift[0];
+     facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+     charge           = mdatoms->chargeA;
+     krf              = gmx_fjsp_set1_v2r8(fr->ic->k_rf);
+     krf2             = gmx_fjsp_set1_v2r8(fr->ic->k_rf*2.0);
+     crf              = gmx_fjsp_set1_v2r8(fr->ic->c_rf);
+     nvdwtype         = fr->ntype;
+     vdwparam         = fr->nbfp;
+     vdwtype          = mdatoms->typeA;
+     /* Avoid stupid compiler warnings */
+     jnrA = jnrB = 0;
+     j_coord_offsetA = 0;
+     j_coord_offsetB = 0;
+     outeriter        = 0;
+     inneriter        = 0;
+     /* Start outer loop over neighborlists */
+     for(iidx=0; iidx<nri; iidx++)
+     {
+         /* Load shift vector for this list */
+         i_shift_offset   = DIM*shiftidx[iidx];
+         /* Load limits for loop over neighbors: the j entries for this i particle
+          * are jjnr[j_index_start] .. jjnr[j_index_end-1]. */
+         j_index_start    = jindex[iidx];
+         j_index_end      = jindex[iidx+1];
+         /* Get outer coordinate index */
+         inr              = iinr[iidx];
+         i_coord_offset   = DIM*inr;
+         /* Load i particle coords and add shift vector */
+         gmx_fjsp_load_shift_and_1rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
+         fix0             = _fjsp_setzero_v2r8();
+         fiy0             = _fjsp_setzero_v2r8();
+         fiz0             = _fjsp_setzero_v2r8();
+         /* Load parameters for i particles: the i charge is multiplied by facel
+          * (fr->epsfac) once here, and vdwioffset0 is the offset into vdwparam
+          * derived from this particle's VdW type. */
+         iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_load1_v2r8(charge+inr+0));
+         vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+         /* Reset potential sums */
+         velecsum         = _fjsp_setzero_v2r8();
+         vvdwsum          = _fjsp_setzero_v2r8();
+         /* Start inner kernel loop: j particles are taken two at a time (A and B);
+          * a leftover single j particle is handled after the loop. */
+         for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+         {
+             /* Get j neighbor index, and coordinate index */
+             jnrA             = jjnr[jidx];
+             jnrB             = jjnr[jidx+1];
+             j_coord_offsetA  = DIM*jnrA;
+             j_coord_offsetB  = DIM*jnrB;
+             /* load j atom coordinates */
+             gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                               &jx0,&jy0,&jz0);
+             /* Calculate displacement vector */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             /* Calculate squared distance and things based on it */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+             rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+             /* Load parameters for j particles */
+             jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+             vdwjidx0A        = 2*vdwtype[jnrA+0];
+             vdwjidx0B        = 2*vdwtype[jnrB+0];
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* Compute parameters for interactions between i and j atoms */
+             qq00             = _fjsp_mul_v2r8(iq0,jq0);
+             gmx_fjsp_load_2pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,
+                                          vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
+             /* REACTION-FIELD ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq00,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq00,rinv00),crf));
+             felec            = _fjsp_mul_v2r8(qq00,_fjsp_msub_v2r8(rinv00,rinvsq00,krf2));
+             /* LENNARD-JONES DISPERSION/REPULSION
+              * NOTE(review): the potential rescales the c12 and c6 terms by 1/12 and 1/6
+              * while the force uses them unscaled, so c6/c12 in vdwparam are presumably
+              * stored pre-multiplied by 6 and 12 - confirm where fr->nbfp is filled. */
+             rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+             vvdw6            = _fjsp_mul_v2r8(c6_00,rinvsix);
+             vvdw12           = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
+             vvdw             = _fjsp_msub_v2r8( vvdw12,one_twelfth, _fjsp_mul_v2r8(vvdw6,one_sixth) );
+             fvdw             = _fjsp_mul_v2r8(_fjsp_sub_v2r8(vvdw12,vvdw6),rinvsq00);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+             fscal            = _fjsp_add_v2r8(felec,fvdw);
+             /* Update vectorial force: fscal times the displacement is added to the
+              * i-force accumulators; the same fscal and displacement are handed to the
+              * j-force helper (presumably subtracting from f, going by its name). */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             
+             gmx_fjsp_decrement_fma_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fscal,dx00,dy00,dz00);
+             /* Inner loop uses 47 flops */
+         }
+         if(jidx<j_index_end) /* exactly one unpaired j particle remains */
+         {
+             jnrA             = jjnr[jidx];
+             j_coord_offsetA  = DIM*jnrA;
+             /* load j atom coordinates */
+             gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                               &jx0,&jy0,&jz0);
+             /* Calculate displacement vector */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             /* Calculate squared distance and things based on it */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+             rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+             /* Load parameters for j particles (only jnrA; the charge is loaded on top
+              * of a zero vector) */
+             jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
+             vdwjidx0A        = 2*vdwtype[jnrA+0];
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* Compute parameters for interactions between i and j atoms */
+             qq00             = _fjsp_mul_v2r8(iq0,jq0);
+             gmx_fjsp_load_1pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
+             /* REACTION-FIELD ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq00,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq00,rinv00),crf));
+             felec            = _fjsp_mul_v2r8(qq00,_fjsp_msub_v2r8(rinv00,rinvsq00,krf2));
+             /* LENNARD-JONES DISPERSION/REPULSION */
+             rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+             vvdw6            = _fjsp_mul_v2r8(c6_00,rinvsix);
+             vvdw12           = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
+             vvdw             = _fjsp_msub_v2r8( vvdw12,one_twelfth, _fjsp_mul_v2r8(vvdw6,one_sixth) );
+             fvdw             = _fjsp_mul_v2r8(_fjsp_sub_v2r8(vvdw12,vvdw6),rinvsq00);
+             /* Update potential sum for this i atom from the interaction with this j atom.
+              * Only one SIMD element corresponds to a real j particle here; combining each
+              * contribution with a zero vector via unpacklo is presumably what clears the
+              * unused element before it is accumulated. */
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             vvdw             = _fjsp_unpacklo_v2r8(vvdw,_fjsp_setzero_v2r8());
+             vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+             fscal            = _fjsp_add_v2r8(felec,fvdw);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             
+             gmx_fjsp_decrement_fma_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fscal,dx00,dy00,dz00);
+             /* Inner loop uses 47 flops */
+         }
+         /* End of innermost loop: hand the accumulated i force to the helper together
+          * with this particle's slots in f and fshift. */
+         gmx_fjsp_update_iforce_1atom_swizzle_v2r8(fix0,fiy0,fiz0,
+                                               f+i_coord_offset,fshift+i_shift_offset);
+         ggid                        = gid[iidx];
+         /* Update potential energies in the energy-group slot selected by gid[iidx] */
+         gmx_fjsp_update_1pot_v2r8(velecsum,kernel_data->energygrp_elec+ggid);
+         gmx_fjsp_update_1pot_v2r8(vvdwsum,kernel_data->energygrp_vdw+ggid);
+         /* Increment number of inner iterations */
+         inneriter                  += j_index_end - j_index_start;
+         /* Outer loop uses 9 flops */
+     }
+     /* Increment number of outer iterations */
+     outeriter        += nri;
+     /* Update outer/inner flops, using the per-iteration counts quoted above
+      * (9 per outer iteration, 47 per inner iteration) */
+     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_VF,outeriter*9 + inneriter*47);
+ }
+ /*
+  * Gromacs nonbonded kernel:   nb_kernel_ElecRF_VdwLJ_GeomP1P1_F_sparc64_hpc_ace_double
+  * Electrostatics interaction: ReactionField
+  * VdW interaction:            LennardJones
+  * Geometry:                   Particle-Particle
+  * Calculate force/pot:        Force
+  */
+ void
+ nb_kernel_ElecRF_VdwLJ_GeomP1P1_F_sparc64_hpc_ace_double
+                     (t_nblist * gmx_restrict                nlist,
+                      rvec * gmx_restrict                    xx,
+                      rvec * gmx_restrict                    ff,
+                      t_forcerec * gmx_restrict              fr,
+                      t_mdatoms * gmx_restrict               mdatoms,
+                      nb_kernel_data_t * gmx_restrict        kernel_data,
+                      t_nrnb * gmx_restrict                  nrnb)
+ {
+     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+      * just 0 for non-waters.
+      * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+      * jnr indices corresponding to data put in the four positions in the SIMD register.
+      */
+     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+     int              jnrA,jnrB;
+     int              j_coord_offsetA,j_coord_offsetB;
+     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+     real             rcutoff_scalar;
+     real             *shiftvec,*fshift,*x,*f;
+     _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+     int              vdwioffset0;
+     _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+     int              vdwjidx0A,vdwjidx0B;
+     _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+     _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+     _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+     real             *charge;
+     int              nvdwtype;
+     _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+     int              *vdwtype;
+     real             *vdwparam;
+     _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+     _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+     _fjsp_v2r8       itab_tmp;
+     _fjsp_v2r8       dummy_mask,cutoff_mask;
+     _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+     _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+     union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+     x                = xx[0];
+     f                = ff[0];
+     nri              = nlist->nri;
+     iinr             = nlist->iinr;
+     jindex           = nlist->jindex;
+     jjnr             = nlist->jjnr;
+     shiftidx         = nlist->shift;
+     gid              = nlist->gid;
+     shiftvec         = fr->shift_vec[0];
+     fshift           = fr->fshift[0];
+     facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+     charge           = mdatoms->chargeA;
+     krf              = gmx_fjsp_set1_v2r8(fr->ic->k_rf);
+     krf2             = gmx_fjsp_set1_v2r8(fr->ic->k_rf*2.0);
+     crf              = gmx_fjsp_set1_v2r8(fr->ic->c_rf);
+     nvdwtype         = fr->ntype;
+     vdwparam         = fr->nbfp;
+     vdwtype          = mdatoms->typeA;
+     /* Avoid stupid compiler warnings */
+     jnrA = jnrB = 0;
+     j_coord_offsetA = 0;
+     j_coord_offsetB = 0;
+     outeriter        = 0;
+     inneriter        = 0;
+     /* Start outer loop over neighborlists */
+     for(iidx=0; iidx<nri; iidx++)
+     {
+         /* Load shift vector for this list */
+         i_shift_offset   = DIM*shiftidx[iidx];
+         /* Load limits for loop over neighbors */
+         j_index_start    = jindex[iidx];
+         j_index_end      = jindex[iidx+1];
+         /* Get outer coordinate index */
+         inr              = iinr[iidx];
+         i_coord_offset   = DIM*inr;
+         /* Load i particle coords and add shift vector */
+         gmx_fjsp_load_shift_and_1rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
+         fix0             = _fjsp_setzero_v2r8();
+         fiy0             = _fjsp_setzero_v2r8();
+         fiz0             = _fjsp_setzero_v2r8();
+         /* Load parameters for i particles */
+         iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_load1_v2r8(charge+inr+0));
+         vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+         /* Start inner kernel loop */
+         for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+         {
+             /* Get j neighbor index, and coordinate index */
+             jnrA             = jjnr[jidx];
+             jnrB             = jjnr[jidx+1];
+             j_coord_offsetA  = DIM*jnrA;
+             j_coord_offsetB  = DIM*jnrB;
+             /* load j atom coordinates */
+             gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                               &jx0,&jy0,&jz0);
+             /* Calculate displacement vector */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             /* Calculate squared distance and things based on it */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+             rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+             /* Load parameters for j particles */
+             jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+             vdwjidx0A        = 2*vdwtype[jnrA+0];
+             vdwjidx0B        = 2*vdwtype[jnrB+0];
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* Compute parameters for interactions between i and j atoms */
+             qq00             = _fjsp_mul_v2r8(iq0,jq0);
+             gmx_fjsp_load_2pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,
+                                          vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
+             /* REACTION-FIELD ELECTROSTATICS */
+             felec            = _fjsp_mul_v2r8(qq00,_fjsp_msub_v2r8(rinv00,rinvsq00,krf2));
+             /* LENNARD-JONES DISPERSION/REPULSION */
+             rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+             fvdw             = _fjsp_mul_v2r8(_fjsp_msub_v2r8(c12_00,rinvsix,c6_00),_fjsp_mul_v2r8(rinvsix,rinvsq00));
+             fscal            = _fjsp_add_v2r8(felec,fvdw);
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             
+             gmx_fjsp_decrement_fma_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fscal,dx00,dy00,dz00);
+             /* Inner loop uses 37 flops */
+         }
+         if(jidx<j_index_end)
+         {
+             jnrA             = jjnr[jidx];
+             j_coord_offsetA  = DIM*jnrA;
+             /* load j atom coordinates */
+             gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                               &jx0,&jy0,&jz0);
+             /* Calculate displacement vector */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             /* Calculate squared distance and things based on it */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+             rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+             /* Load parameters for j particles */
+             jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
+             vdwjidx0A        = 2*vdwtype[jnrA+0];
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* Compute parameters for interactions between i and j atoms */
+             qq00             = _fjsp_mul_v2r8(iq0,jq0);
+             gmx_fjsp_load_1pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
+             /* REACTION-FIELD ELECTROSTATICS */
+             felec            = _fjsp_mul_v2r8(qq00,_fjsp_msub_v2r8(rinv00,rinvsq00,krf2));
+             /* LENNARD-JONES DISPERSION/REPULSION */
+             rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+             fvdw             = _fjsp_mul_v2r8(_fjsp_msub_v2r8(c12_00,rinvsix,c6_00),_fjsp_mul_v2r8(rinvsix,rinvsq00));
+             fscal            = _fjsp_add_v2r8(felec,fvdw);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             
+             gmx_fjsp_decrement_fma_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fscal,dx00,dy00,dz00);
+             /* Inner loop uses 37 flops */
+         }
+         /* End of innermost loop */
+         gmx_fjsp_update_iforce_1atom_swizzle_v2r8(fix0,fiy0,fiz0,
+                                               f+i_coord_offset,fshift+i_shift_offset);
+         /* Increment number of inner iterations */
+         inneriter                  += j_index_end - j_index_start;
+         /* Outer loop uses 7 flops */
+     }
+     /* Increment number of outer iterations */
+     outeriter        += nri;
+     /* Update outer/inner flops */
+     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_F,outeriter*7 + inneriter*37);
+ }
index 0000000000000000000000000000000000000000,13978d1e0684d0632682a3768dfb194f80a14203..13978d1e0684d0632682a3768dfb194f80a14203
mode 000000,100644..100644
--- /dev/null
@@@ -1,0 -1,855 +1,855 @@@
+ /*
+  * This file is part of the GROMACS molecular simulation package.
+  *
+  * Copyright (c) 2012, by the GROMACS development team, led by
+  * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+  * others, as listed in the AUTHORS file in the top-level source
+  * directory and at http://www.gromacs.org.
+  *
+  * GROMACS is free software; you can redistribute it and/or
+  * modify it under the terms of the GNU Lesser General Public License
+  * as published by the Free Software Foundation; either version 2.1
+  * of the License, or (at your option) any later version.
+  *
+  * GROMACS is distributed in the hope that it will be useful,
+  * but WITHOUT ANY WARRANTY; without even the implied warranty of
+  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  * Lesser General Public License for more details.
+  *
+  * You should have received a copy of the GNU Lesser General Public
+  * License along with GROMACS; if not, see
+  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+  *
+  * If you want to redistribute modifications to GROMACS, please
+  * consider that scientific software is very special. Version
+  * control is crucial - bugs must be traceable. We will be happy to
+  * consider code for inclusion in the official distribution, but
+  * derived work must not be called official GROMACS. Details are found
+  * in the README & COPYING files - if they are missing, get the
+  * official version at http://www.gromacs.org.
+  *
+  * To help us fund GROMACS development, we humbly ask that you cite
+  * the research papers on the package. Check out http://www.gromacs.org.
+  */
+ /*
+  * Note: this file was generated by the GROMACS sparc64_hpc_ace_double kernel generator.
+  */
+ #ifdef HAVE_CONFIG_H
+ #include <config.h>
+ #endif
+ #include <math.h>
+ #include "../nb_kernel.h"
+ #include "types/simple.h"
+ #include "vec.h"
+ #include "nrnb.h"
+ #include "kernelutil_sparc64_hpc_ace_double.h"
+ /*
+  * Gromacs nonbonded kernel:   nb_kernel_ElecRF_VdwLJ_GeomW3P1_VF_sparc64_hpc_ace_double
+  * Electrostatics interaction: ReactionField
+  * VdW interaction:            LennardJones
+  * Geometry:                   Water3-Particle
+  * Calculate force/pot:        PotentialAndForce
+  *
+  * For every entry of the neighbour list this kernel loads three i-site
+  * coordinates together with the entry's shift vector, walks the j neighbours
+  * two at a time (with a single-j epilogue when one neighbour is left over) and
+  * evaluates reaction-field electrostatics between each of the three i sites
+  * and the j particle. Lennard-Jones is evaluated only between i site 0 and
+  * the j particle. Force contributions are handed to the gmx_fjsp_decrement_*
+  * and gmx_fjsp_update_iforce_* helpers together with pointers into ff and
+  * fr->fshift; the summed energies are handed to gmx_fjsp_update_1pot_v2r8()
+  * together with kernel_data->energygrp_elec / energygrp_vdw offset by
+  * nlist->gid[iidx]. A flop estimate is reported through inc_nrnb().
+  *
+  * Parameters:
+  *   nlist        neighbour list; the fields nri, iinr, jindex, jjnr, shift
+  *                and gid are read.
+  *   xx           particle coordinates, addressed as a flat array of reals
+  *                with DIM reals per particle.
+  *   ff           force array, addressed the same way as xx.
+  *   fr           supplies shift_vec, fshift, epsfac, ic->k_rf, ic->c_rf,
+  *                ntype and nbfp.
+  *   mdatoms      supplies chargeA and typeA.
+  *   kernel_data  supplies energygrp_elec and energygrp_vdw.
+  *   nrnb         counter structure passed on to inc_nrnb().
+  */
+ void
+ nb_kernel_ElecRF_VdwLJ_GeomW3P1_VF_sparc64_hpc_ace_double
+                     (t_nblist * gmx_restrict                nlist,
+                      rvec * gmx_restrict                    xx,
+                      rvec * gmx_restrict                    ff,
+                      t_forcerec * gmx_restrict              fr,
+                      t_mdatoms * gmx_restrict               mdatoms,
+                      nb_kernel_data_t * gmx_restrict        kernel_data,
+                      t_nrnb * gmx_restrict                  nrnb)
+ {
+     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+      * just 0 for non-waters.
+      * Suffixes A,B refer to j loop unrolling done with double precision SIMD, i.e. the two different
+      * jnr indices whose data is put in the two positions of the SIMD register.
+      *
+      * Many of the locals declared below are never referenced in this kernel
+      * (for example rcutoff_scalar, tx/ty/tz, rcutoff, rcutoff2, jidxall, the
+      * isai/isaj values, vdwioffset1/2, r00/r10/r20, itab_tmp, the mask
+      * variables, one, two and the vfconv/gbconv/ewconv unions); they appear to
+      * be emitted unconditionally by the kernel generator.
+      */
+     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+     int              jnrA,jnrB;
+     int              j_coord_offsetA,j_coord_offsetB;
+     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+     real             rcutoff_scalar;
+     real             *shiftvec,*fshift,*x,*f;
+     _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+     int              vdwioffset0;
+     _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+     int              vdwioffset1;
+     _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+     int              vdwioffset2;
+     _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+     int              vdwjidx0A,vdwjidx0B;
+     _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+     _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+     _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+     _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+     _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+     real             *charge;
+     int              nvdwtype;
+     _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+     int              *vdwtype;
+     real             *vdwparam;
+     _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+     _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+     _fjsp_v2r8       itab_tmp;
+     _fjsp_v2r8       dummy_mask,cutoff_mask;
+     _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+     _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+     union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+     x                = xx[0];
+     f                = ff[0];
+     nri              = nlist->nri;
+     iinr             = nlist->iinr;
+     jindex           = nlist->jindex;
+     jjnr             = nlist->jjnr;
+     shiftidx         = nlist->shift;
+     gid              = nlist->gid;
+     shiftvec         = fr->shift_vec[0];
+     fshift           = fr->fshift[0];
+     facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+     charge           = mdatoms->chargeA;
+     krf              = gmx_fjsp_set1_v2r8(fr->ic->k_rf);
+     krf2             = gmx_fjsp_set1_v2r8(fr->ic->k_rf*2.0);
+     crf              = gmx_fjsp_set1_v2r8(fr->ic->c_rf);
+     nvdwtype         = fr->ntype;
+     vdwparam         = fr->nbfp;
+     vdwtype          = mdatoms->typeA;
+     /* Setup water-specific parameters.
+      * The three i-site charges (pre-multiplied by facel) and the vdW row offset
+      * of i site 0 are read once, from the first list entry, and reused for
+      * every entry of the outer loop.
+      * NOTE(review): this presumes every i molecule in the list carries the same
+      * charges and vdW type, and it reads nlist->iinr[0] before nri is checked -
+      * confirm that callers never pass a list with nri == 0 or mixed i types.
+      */
+     inr              = nlist->iinr[0];
+     iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+0]));
+     iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+     iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+     vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+     /* Give these a defined value up front so compilers do not warn about
+      * possibly uninitialised use; they are overwritten before being read. */
+     jnrA = jnrB = 0;
+     j_coord_offsetA = 0;
+     j_coord_offsetB = 0;
+     outeriter        = 0;
+     inneriter        = 0;
+     /* Start outer loop over neighborlists: one iteration per i entry */
+     for(iidx=0; iidx<nri; iidx++)
+     {
+         /* Offset (in reals) of this entry's shift vector; used for both shiftvec and fshift */
+         i_shift_offset   = DIM*shiftidx[iidx];
+         /* Load limits for loop over neighbors: j entries are jjnr[j_index_start .. j_index_end-1] */
+         j_index_start    = jindex[iidx];
+         j_index_end      = jindex[iidx+1];
+         /* Get outer coordinate index: offset (in reals) of i site 0 in x and f */
+         inr              = iinr[iidx];
+         i_coord_offset   = DIM*inr;
+         /* Load i particle coords and add shift vector (three consecutive sites starting at inr) */
+         gmx_fjsp_load_shift_and_3rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                  &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+         fix0             = _fjsp_setzero_v2r8();
+         fiy0             = _fjsp_setzero_v2r8();
+         fiz0             = _fjsp_setzero_v2r8();
+         fix1             = _fjsp_setzero_v2r8();
+         fiy1             = _fjsp_setzero_v2r8();
+         fiz1             = _fjsp_setzero_v2r8();
+         fix2             = _fjsp_setzero_v2r8();
+         fiy2             = _fjsp_setzero_v2r8();
+         fiz2             = _fjsp_setzero_v2r8();
+         /* Reset potential sums for this i entry */
+         velecsum         = _fjsp_setzero_v2r8();
+         vvdwsum          = _fjsp_setzero_v2r8();
+         /* Start inner kernel loop: two j particles (A and B) per iteration.
+          * The bound j_index_end-1 leaves a possible odd last neighbour to the
+          * single-j block that follows the loop. */
+         for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+         {
+             /* Get j neighbor index, and coordinate index */
+             jnrA             = jjnr[jidx];
+             jnrB             = jjnr[jidx+1];
+             j_coord_offsetA  = DIM*jnrA;
+             j_coord_offsetB  = DIM*jnrB;
+             /* load j atom coordinates for both j particles */
+             gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                               &jx0,&jy0,&jz0);
+             /* Calculate displacement vector from each of the three i sites to j */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             dx10             = _fjsp_sub_v2r8(ix1,jx0);
+             dy10             = _fjsp_sub_v2r8(iy1,jy0);
+             dz10             = _fjsp_sub_v2r8(iz1,jz0);
+             dx20             = _fjsp_sub_v2r8(ix2,jx0);
+             dy20             = _fjsp_sub_v2r8(iy2,jy0);
+             dz20             = _fjsp_sub_v2r8(iz2,jz0);
+             /* Calculate squared distance and things based on it (1/r and 1/r^2) */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+             rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+             rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+             rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+             rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+             rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+             rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+             rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+             /* Load parameters for j particles: charges, vdW column offsets; clear the j force accumulator */
+             jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+             vdwjidx0A        = 2*vdwtype[jnrA+0];
+             vdwjidx0B        = 2*vdwtype[jnrB+0];
+             fjx0             = _fjsp_setzero_v2r8();
+             fjy0             = _fjsp_setzero_v2r8();
+             fjz0             = _fjsp_setzero_v2r8();
+             /**************************
+              * CALCULATE INTERACTIONS *
+              * i site 0 - j: RF + LJ  *
+              **************************/
+             /* Compute parameters for interactions between i and j atoms:
+              * charge product and the c6/c12 pair taken from vdwparam */
+             qq00             = _fjsp_mul_v2r8(iq0,jq0);
+             gmx_fjsp_load_2pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,
+                                          vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
+             /* REACTION-FIELD ELECTROSTATICS
+              * Presumably velec = qq*(krf*rsq + rinv - crf) and
+              * felec = qq*(rinv*rinvsq - krf2), assuming madd(a,b,c) = a*b+c and
+              * msub(a,b,c) = a*b-c - TODO confirm against the intrinsic headers. */
+             velec            = _fjsp_mul_v2r8(qq00,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq00,rinv00),crf));
+             felec            = _fjsp_mul_v2r8(qq00,_fjsp_msub_v2r8(rinv00,rinvsq00,krf2));
+             /* LENNARD-JONES DISPERSION/REPULSION
+              * vvdw6 = c6*rinv^6 and vvdw12 = c12*rinv^12; the potential combines
+              * them with the 1/12 and 1/6 factors, the scalar force is
+              * (vvdw12 - vvdw6)*rinvsq. */
+             rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+             vvdw6            = _fjsp_mul_v2r8(c6_00,rinvsix);
+             vvdw12           = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
+             vvdw             = _fjsp_msub_v2r8( vvdw12,one_twelfth, _fjsp_mul_v2r8(vvdw6,one_sixth) );
+             fvdw             = _fjsp_mul_v2r8(_fjsp_sub_v2r8(vvdw12,vvdw6),rinvsq00);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+             fscal            = _fjsp_add_v2r8(felec,fvdw);
+             /* Update vectorial force: the same d*fscal term goes into the i-site and the j accumulators */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             
+             fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              * i site 1 - j: RF only  *
+              **************************/
+             /* Compute parameters for interactions between i and j atoms */
+             qq10             = _fjsp_mul_v2r8(iq1,jq0);
+             /* REACTION-FIELD ELECTROSTATICS (same expressions as for site 0) */
+             velec            = _fjsp_mul_v2r8(qq10,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq10,rinv10),crf));
+             felec            = _fjsp_mul_v2r8(qq10,_fjsp_msub_v2r8(rinv10,rinvsq10,krf2));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+             
+             fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              * i site 2 - j: RF only  *
+              **************************/
+             /* Compute parameters for interactions between i and j atoms */
+             qq20             = _fjsp_mul_v2r8(iq2,jq0);
+             /* REACTION-FIELD ELECTROSTATICS (same expressions as for site 0) */
+             velec            = _fjsp_mul_v2r8(qq20,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq20,rinv20),crf));
+             felec            = _fjsp_mul_v2r8(qq20,_fjsp_msub_v2r8(rinv20,rinvsq20,krf2));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+             
+             fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+             gmx_fjsp_decrement_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0);
+             /* Inner loop uses 120 flops (the per-j weight used in the inc_nrnb() call below) */
+         }
+         if(jidx<j_index_end)
+         {
+             /* Epilogue: exactly one j particle is left over. Only the A slot
+              * carries real data; every energy and scalar force is passed through
+              * _fjsp_unpacklo_v2r8(..., zero) before it is accumulated, presumably
+              * to clear the unused upper lane - TODO confirm intrinsic semantics. */
+             jnrA             = jjnr[jidx];
+             j_coord_offsetA  = DIM*jnrA;
+             /* load j atom coordinates for the single remaining j particle */
+             gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                               &jx0,&jy0,&jz0);
+             /* Calculate displacement vector from each of the three i sites to j */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             dx10             = _fjsp_sub_v2r8(ix1,jx0);
+             dy10             = _fjsp_sub_v2r8(iy1,jy0);
+             dz10             = _fjsp_sub_v2r8(iz1,jz0);
+             dx20             = _fjsp_sub_v2r8(ix2,jx0);
+             dy20             = _fjsp_sub_v2r8(iy2,jy0);
+             dz20             = _fjsp_sub_v2r8(iz2,jz0);
+             /* Calculate squared distance and things based on it (1/r and 1/r^2) */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+             rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+             rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+             rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+             rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+             rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+             rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+             rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+             /* Load parameters for j particles: one charge combined with a zero vector, one vdW column offset */
+             jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
+             vdwjidx0A        = 2*vdwtype[jnrA+0];
+             fjx0             = _fjsp_setzero_v2r8();
+             fjy0             = _fjsp_setzero_v2r8();
+             fjz0             = _fjsp_setzero_v2r8();
+             /**************************
+              * CALCULATE INTERACTIONS *
+              * i site 0 - j: RF + LJ  *
+              **************************/
+             /* Compute parameters for interactions between i and j atoms */
+             qq00             = _fjsp_mul_v2r8(iq0,jq0);
+             gmx_fjsp_load_1pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
+             /* REACTION-FIELD ELECTROSTATICS (same expressions as in the unrolled loop) */
+             velec            = _fjsp_mul_v2r8(qq00,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq00,rinv00),crf));
+             felec            = _fjsp_mul_v2r8(qq00,_fjsp_msub_v2r8(rinv00,rinvsq00,krf2));
+             /* LENNARD-JONES DISPERSION/REPULSION (same expressions as in the unrolled loop) */
+             rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+             vvdw6            = _fjsp_mul_v2r8(c6_00,rinvsix);
+             vvdw12           = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
+             vvdw             = _fjsp_msub_v2r8( vvdw12,one_twelfth, _fjsp_mul_v2r8(vvdw6,one_sixth) );
+             fvdw             = _fjsp_mul_v2r8(_fjsp_sub_v2r8(vvdw12,vvdw6),rinvsq00);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             vvdw             = _fjsp_unpacklo_v2r8(vvdw,_fjsp_setzero_v2r8());
+             vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+             fscal            = _fjsp_add_v2r8(felec,fvdw);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             
+             fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              * i site 1 - j: RF only  *
+              **************************/
+             /* Compute parameters for interactions between i and j atoms */
+             qq10             = _fjsp_mul_v2r8(iq1,jq0);
+             /* REACTION-FIELD ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq10,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq10,rinv10),crf));
+             felec            = _fjsp_mul_v2r8(qq10,_fjsp_msub_v2r8(rinv10,rinvsq10,krf2));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+             
+             fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              * i site 2 - j: RF only  *
+              **************************/
+             /* Compute parameters for interactions between i and j atoms */
+             qq20             = _fjsp_mul_v2r8(iq2,jq0);
+             /* REACTION-FIELD ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq20,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq20,rinv20),crf));
+             felec            = _fjsp_mul_v2r8(qq20,_fjsp_msub_v2r8(rinv20,rinvsq20,krf2));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+             
+             fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+             gmx_fjsp_decrement_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0);
+             /* Inner loop uses 120 flops (same per-j weight as the unrolled loop) */
+         }
+         /* End of innermost loop: pass the accumulated i-site forces on, together
+          * with this entry's slots in f and fshift */
+         gmx_fjsp_update_iforce_3atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                               f+i_coord_offset,fshift+i_shift_offset);
+         ggid                        = gid[iidx];
+         /* Update potential energies in the slot selected by this entry's gid value */
+         gmx_fjsp_update_1pot_v2r8(velecsum,kernel_data->energygrp_elec+ggid);
+         gmx_fjsp_update_1pot_v2r8(vvdwsum,kernel_data->energygrp_vdw+ggid);
+         /* Increment number of inner iterations by the number of j entries, including an odd leftover */
+         inneriter                  += j_index_end - j_index_start;
+         /* Outer loop uses 20 flops (the per-i weight used in the inc_nrnb() call below) */
+     }
+     /* Increment number of outer iterations */
+     outeriter        += nri;
+     /* Update outer/inner flops: 20 per i entry plus 120 per j entry */
+     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3_VF,outeriter*20 + inneriter*120);
+ }
+ /*
+  * Gromacs nonbonded kernel:   nb_kernel_ElecRF_VdwLJ_GeomW3P1_F_sparc64_hpc_ace_double
+  * Electrostatics interaction: ReactionField
+  * VdW interaction:            LennardJones
+  * Geometry:                   Water3-Particle
+  * Calculate force/pot:        Force
+  */
+ void
+ nb_kernel_ElecRF_VdwLJ_GeomW3P1_F_sparc64_hpc_ace_double
+                     (t_nblist * gmx_restrict                nlist,
+                      rvec * gmx_restrict                    xx,
+                      rvec * gmx_restrict                    ff,
+                      t_forcerec * gmx_restrict              fr,
+                      t_mdatoms * gmx_restrict               mdatoms,
+                      nb_kernel_data_t * gmx_restrict        kernel_data,
+                      t_nrnb * gmx_restrict                  nrnb)
+ {
+     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+      * just 0 for non-waters.
+      * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+      * jnr indices corresponding to data put in the four positions in the SIMD register.
+      */
+     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+     int              jnrA,jnrB;
+     int              j_coord_offsetA,j_coord_offsetB;
+     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+     real             rcutoff_scalar;
+     real             *shiftvec,*fshift,*x,*f;
+     _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+     int              vdwioffset0;
+     _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+     int              vdwioffset1;
+     _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+     int              vdwioffset2;
+     _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+     int              vdwjidx0A,vdwjidx0B;
+     _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+     _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+     _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+     _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+     _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+     real             *charge;
+     int              nvdwtype;
+     _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+     int              *vdwtype;
+     real             *vdwparam;
+     _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+     _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+     _fjsp_v2r8       itab_tmp;
+     _fjsp_v2r8       dummy_mask,cutoff_mask;
+     _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+     _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+     union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+     x                = xx[0];
+     f                = ff[0];
+     nri              = nlist->nri;
+     iinr             = nlist->iinr;
+     jindex           = nlist->jindex;
+     jjnr             = nlist->jjnr;
+     shiftidx         = nlist->shift;
+     gid              = nlist->gid;
+     shiftvec         = fr->shift_vec[0];
+     fshift           = fr->fshift[0];
+     facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+     charge           = mdatoms->chargeA;
+     krf              = gmx_fjsp_set1_v2r8(fr->ic->k_rf);
+     krf2             = gmx_fjsp_set1_v2r8(fr->ic->k_rf*2.0);
+     crf              = gmx_fjsp_set1_v2r8(fr->ic->c_rf);
+     nvdwtype         = fr->ntype;
+     vdwparam         = fr->nbfp;
+     vdwtype          = mdatoms->typeA;
+     /* Setup water-specific parameters */
+     inr              = nlist->iinr[0];
+     iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+0]));
+     iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+     iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+     vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+     /* Avoid stupid compiler warnings */
+     jnrA = jnrB = 0;
+     j_coord_offsetA = 0;
+     j_coord_offsetB = 0;
+     outeriter        = 0;
+     inneriter        = 0;
+     /* Start outer loop over neighborlists */
+     for(iidx=0; iidx<nri; iidx++)
+     {
+         /* Load shift vector for this list */
+         i_shift_offset   = DIM*shiftidx[iidx];
+         /* Load limits for loop over neighbors */
+         j_index_start    = jindex[iidx];
+         j_index_end      = jindex[iidx+1];
+         /* Get outer coordinate index */
+         inr              = iinr[iidx];
+         i_coord_offset   = DIM*inr;
+         /* Load i particle coords and add shift vector */
+         gmx_fjsp_load_shift_and_3rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                  &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+         fix0             = _fjsp_setzero_v2r8();
+         fiy0             = _fjsp_setzero_v2r8();
+         fiz0             = _fjsp_setzero_v2r8();
+         fix1             = _fjsp_setzero_v2r8();
+         fiy1             = _fjsp_setzero_v2r8();
+         fiz1             = _fjsp_setzero_v2r8();
+         fix2             = _fjsp_setzero_v2r8();
+         fiy2             = _fjsp_setzero_v2r8();
+         fiz2             = _fjsp_setzero_v2r8();
+         /* Start inner kernel loop */
+         for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+         {
+             /* Get j neighbor index, and coordinate index */
+             jnrA             = jjnr[jidx];
+             jnrB             = jjnr[jidx+1];
+             j_coord_offsetA  = DIM*jnrA;
+             j_coord_offsetB  = DIM*jnrB;
+             /* load j atom coordinates */
+             gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                               &jx0,&jy0,&jz0);
+             /* Calculate displacement vector */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             dx10             = _fjsp_sub_v2r8(ix1,jx0);
+             dy10             = _fjsp_sub_v2r8(iy1,jy0);
+             dz10             = _fjsp_sub_v2r8(iz1,jz0);
+             dx20             = _fjsp_sub_v2r8(ix2,jx0);
+             dy20             = _fjsp_sub_v2r8(iy2,jy0);
+             dz20             = _fjsp_sub_v2r8(iz2,jz0);
+             /* Calculate squared distance and things based on it */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+             rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+             rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+             rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+             rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+             rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+             rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+             rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+             /* Load parameters for j particles */
+             jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+             vdwjidx0A        = 2*vdwtype[jnrA+0];
+             vdwjidx0B        = 2*vdwtype[jnrB+0];
+             fjx0             = _fjsp_setzero_v2r8();
+             fjy0             = _fjsp_setzero_v2r8();
+             fjz0             = _fjsp_setzero_v2r8();
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* Compute parameters for interactions between i and j atoms */
+             qq00             = _fjsp_mul_v2r8(iq0,jq0);
+             gmx_fjsp_load_2pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,
+                                          vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
+             /* REACTION-FIELD ELECTROSTATICS */
+             felec            = _fjsp_mul_v2r8(qq00,_fjsp_msub_v2r8(rinv00,rinvsq00,krf2));
+             /* LENNARD-JONES DISPERSION/REPULSION */
+             rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+             fvdw             = _fjsp_mul_v2r8(_fjsp_msub_v2r8(c12_00,rinvsix,c6_00),_fjsp_mul_v2r8(rinvsix,rinvsq00));
+             fscal            = _fjsp_add_v2r8(felec,fvdw);
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             
+             fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* Compute parameters for interactions between i and j atoms */
+             qq10             = _fjsp_mul_v2r8(iq1,jq0);
+             /* REACTION-FIELD ELECTROSTATICS */
+             felec            = _fjsp_mul_v2r8(qq10,_fjsp_msub_v2r8(rinv10,rinvsq10,krf2));
+             fscal            = felec;
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+             
+             fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* Compute parameters for interactions between i and j atoms */
+             qq20             = _fjsp_mul_v2r8(iq2,jq0);
+             /* REACTION-FIELD ELECTROSTATICS */
+             felec            = _fjsp_mul_v2r8(qq20,_fjsp_msub_v2r8(rinv20,rinvsq20,krf2));
+             fscal            = felec;
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+             
+             fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+             gmx_fjsp_decrement_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0);
+             /* Inner loop uses 100 flops */
+         }
+         if(jidx<j_index_end)
+         {
+             jnrA             = jjnr[jidx];
+             j_coord_offsetA  = DIM*jnrA;
+             /* load j atom coordinates */
+             gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                               &jx0,&jy0,&jz0);
+             /* Calculate displacement vector */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             dx10             = _fjsp_sub_v2r8(ix1,jx0);
+             dy10             = _fjsp_sub_v2r8(iy1,jy0);
+             dz10             = _fjsp_sub_v2r8(iz1,jz0);
+             dx20             = _fjsp_sub_v2r8(ix2,jx0);
+             dy20             = _fjsp_sub_v2r8(iy2,jy0);
+             dz20             = _fjsp_sub_v2r8(iz2,jz0);
+             /* Calculate squared distance and things based on it */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+             rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+             rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+             rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+             rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+             rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+             rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+             rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+             /* Load parameters for j particles */
+             jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
+             vdwjidx0A        = 2*vdwtype[jnrA+0];
+             fjx0             = _fjsp_setzero_v2r8();
+             fjy0             = _fjsp_setzero_v2r8();
+             fjz0             = _fjsp_setzero_v2r8();
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* Compute parameters for interactions between i and j atoms */
+             qq00             = _fjsp_mul_v2r8(iq0,jq0);
+             gmx_fjsp_load_1pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
+             /* REACTION-FIELD ELECTROSTATICS */
+             felec            = _fjsp_mul_v2r8(qq00,_fjsp_msub_v2r8(rinv00,rinvsq00,krf2));
+             /* LENNARD-JONES DISPERSION/REPULSION */
+             rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+             fvdw             = _fjsp_mul_v2r8(_fjsp_msub_v2r8(c12_00,rinvsix,c6_00),_fjsp_mul_v2r8(rinvsix,rinvsq00));
+             fscal            = _fjsp_add_v2r8(felec,fvdw);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             
+             fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* Compute parameters for interactions between i and j atoms */
+             qq10             = _fjsp_mul_v2r8(iq1,jq0);
+             /* REACTION-FIELD ELECTROSTATICS */
+             felec            = _fjsp_mul_v2r8(qq10,_fjsp_msub_v2r8(rinv10,rinvsq10,krf2));
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+             
+             fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* Compute parameters for interactions between i and j atoms */
+             qq20             = _fjsp_mul_v2r8(iq2,jq0);
+             /* REACTION-FIELD ELECTROSTATICS */
+             felec            = _fjsp_mul_v2r8(qq20,_fjsp_msub_v2r8(rinv20,rinvsq20,krf2));
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+             
+             fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+             gmx_fjsp_decrement_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0);
+             /* Inner loop uses 100 flops */
+         }
+         /* End of innermost loop */
+         gmx_fjsp_update_iforce_3atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                               f+i_coord_offset,fshift+i_shift_offset);
+         /* Increment number of inner iterations */
+         inneriter                  += j_index_end - j_index_start;
+         /* Outer loop uses 18 flops */
+     }
+     /* Increment number of outer iterations */
+     outeriter        += nri;
+     /* Update outer/inner flops */
+     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3_F,outeriter*18 + inneriter*100);
+ }
index 0000000000000000000000000000000000000000,b419e70d21a867bef45ca801964656f9a99c5178..b419e70d21a867bef45ca801964656f9a99c5178
mode 000000,100644..100644
--- /dev/null
@@@ -1,0 -1,1525 +1,1525 @@@
+ /*
+  * This file is part of the GROMACS molecular simulation package.
+  *
+  * Copyright (c) 2012, by the GROMACS development team, led by
+  * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+  * others, as listed in the AUTHORS file in the top-level source
+  * directory and at http://www.gromacs.org.
+  *
+  * GROMACS is free software; you can redistribute it and/or
+  * modify it under the terms of the GNU Lesser General Public License
+  * as published by the Free Software Foundation; either version 2.1
+  * of the License, or (at your option) any later version.
+  *
+  * GROMACS is distributed in the hope that it will be useful,
+  * but WITHOUT ANY WARRANTY; without even the implied warranty of
+  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  * Lesser General Public License for more details.
+  *
+  * You should have received a copy of the GNU Lesser General Public
+  * License along with GROMACS; if not, see
+  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+  *
+  * If you want to redistribute modifications to GROMACS, please
+  * consider that scientific software is very special. Version
+  * control is crucial - bugs must be traceable. We will be happy to
+  * consider code for inclusion in the official distribution, but
+  * derived work must not be called official GROMACS. Details are found
+  * in the README & COPYING files - if they are missing, get the
+  * official version at http://www.gromacs.org.
+  *
+  * To help us fund GROMACS development, we humbly ask that you cite
+  * the research papers on the package. Check out http://www.gromacs.org.
+  */
+ /*
+  * Note: this file was generated by the GROMACS sparc64_hpc_ace_double kernel generator.
+  */
+ #ifdef HAVE_CONFIG_H
+ #include <config.h>
+ #endif
+ #include <math.h>
+ #include "../nb_kernel.h"
+ #include "types/simple.h"
+ #include "vec.h"
+ #include "nrnb.h"
+ #include "kernelutil_sparc64_hpc_ace_double.h"
+ /*
+  * Gromacs nonbonded kernel:   nb_kernel_ElecRF_VdwLJ_GeomW3W3_VF_sparc64_hpc_ace_double
+  * Electrostatics interaction: ReactionField
+  * VdW interaction:            LennardJones
+  * Geometry:                   Water3-Water3
+  * Calculate force/pot:        PotentialAndForce
+  */
+ void
+ nb_kernel_ElecRF_VdwLJ_GeomW3W3_VF_sparc64_hpc_ace_double
+                     (t_nblist * gmx_restrict                nlist,
+                      rvec * gmx_restrict                    xx,
+                      rvec * gmx_restrict                    ff,
+                      t_forcerec * gmx_restrict              fr,
+                      t_mdatoms * gmx_restrict               mdatoms,
+                      nb_kernel_data_t * gmx_restrict        kernel_data,
+                      t_nrnb * gmx_restrict                  nrnb)
+ {
+     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+      * just 0 for non-waters.
+      * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+      * jnr indices corresponding to data put in the two positions in the SIMD register.
+      */
+     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+     int              jnrA,jnrB;
+     int              j_coord_offsetA,j_coord_offsetB;
+     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+     real             rcutoff_scalar;
+     real             *shiftvec,*fshift,*x,*f;
+     _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+     int              vdwioffset0;
+     _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+     int              vdwioffset1;
+     _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+     int              vdwioffset2;
+     _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+     int              vdwjidx0A,vdwjidx0B;
+     _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+     int              vdwjidx1A,vdwjidx1B;
+     _fjsp_v2r8       jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+     int              vdwjidx2A,vdwjidx2B;
+     _fjsp_v2r8       jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+     _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+     _fjsp_v2r8       dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
+     _fjsp_v2r8       dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
+     _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+     _fjsp_v2r8       dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+     _fjsp_v2r8       dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+     _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+     _fjsp_v2r8       dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+     _fjsp_v2r8       dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+     _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+     real             *charge;
+     int              nvdwtype;
+     _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+     int              *vdwtype;
+     real             *vdwparam;
+     _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+     _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+     _fjsp_v2r8       itab_tmp;
+     _fjsp_v2r8       dummy_mask,cutoff_mask;
+     _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+     _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+     union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+     x                = xx[0];
+     f                = ff[0];
+     nri              = nlist->nri;
+     iinr             = nlist->iinr;
+     jindex           = nlist->jindex;
+     jjnr             = nlist->jjnr;
+     shiftidx         = nlist->shift;
+     gid              = nlist->gid;
+     shiftvec         = fr->shift_vec[0];
+     fshift           = fr->fshift[0];
+     facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+     charge           = mdatoms->chargeA;
+     krf              = gmx_fjsp_set1_v2r8(fr->ic->k_rf);
+     krf2             = gmx_fjsp_set1_v2r8(fr->ic->k_rf*2.0);
+     crf              = gmx_fjsp_set1_v2r8(fr->ic->c_rf);
+     nvdwtype         = fr->ntype;
+     vdwparam         = fr->nbfp;
+     vdwtype          = mdatoms->typeA;
+     /* Setup water-specific parameters */
+     inr              = nlist->iinr[0];
+     iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+0]));
+     iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+     iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+     vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+     jq0              = gmx_fjsp_set1_v2r8(charge[inr+0]);
+     jq1              = gmx_fjsp_set1_v2r8(charge[inr+1]);
+     jq2              = gmx_fjsp_set1_v2r8(charge[inr+2]);
+     vdwjidx0A        = 2*vdwtype[inr+0];
+     qq00             = _fjsp_mul_v2r8(iq0,jq0);
+     c6_00            = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset0+vdwjidx0A]);
+     c12_00           = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset0+vdwjidx0A+1]);
+     qq01             = _fjsp_mul_v2r8(iq0,jq1);
+     qq02             = _fjsp_mul_v2r8(iq0,jq2);
+     qq10             = _fjsp_mul_v2r8(iq1,jq0);
+     qq11             = _fjsp_mul_v2r8(iq1,jq1);
+     qq12             = _fjsp_mul_v2r8(iq1,jq2);
+     qq20             = _fjsp_mul_v2r8(iq2,jq0);
+     qq21             = _fjsp_mul_v2r8(iq2,jq1);
+     qq22             = _fjsp_mul_v2r8(iq2,jq2);
+     /* Avoid stupid compiler warnings */
+     jnrA = jnrB = 0;
+     j_coord_offsetA = 0;
+     j_coord_offsetB = 0;
+     outeriter        = 0;
+     inneriter        = 0;
+     /* Start outer loop over neighborlists */
+     for(iidx=0; iidx<nri; iidx++)
+     {
+         /* Load shift vector for this list */
+         i_shift_offset   = DIM*shiftidx[iidx];
+         /* Load limits for loop over neighbors */
+         j_index_start    = jindex[iidx];
+         j_index_end      = jindex[iidx+1];
+         /* Get outer coordinate index */
+         inr              = iinr[iidx];
+         i_coord_offset   = DIM*inr;
+         /* Load i particle coords and add shift vector */
+         gmx_fjsp_load_shift_and_3rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                  &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+         fix0             = _fjsp_setzero_v2r8();
+         fiy0             = _fjsp_setzero_v2r8();
+         fiz0             = _fjsp_setzero_v2r8();
+         fix1             = _fjsp_setzero_v2r8();
+         fiy1             = _fjsp_setzero_v2r8();
+         fiz1             = _fjsp_setzero_v2r8();
+         fix2             = _fjsp_setzero_v2r8();
+         fiy2             = _fjsp_setzero_v2r8();
+         fiz2             = _fjsp_setzero_v2r8();
+         /* Reset potential sums */
+         velecsum         = _fjsp_setzero_v2r8();
+         vvdwsum          = _fjsp_setzero_v2r8();
+         /* Start inner kernel loop */
+         for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+         {
+             /* Get j neighbor index, and coordinate index */
+             jnrA             = jjnr[jidx];
+             jnrB             = jjnr[jidx+1];
+             j_coord_offsetA  = DIM*jnrA;
+             j_coord_offsetB  = DIM*jnrB;
+             /* load j atom coordinates */
+             gmx_fjsp_load_3rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+             /* Calculate displacement vector */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             dx01             = _fjsp_sub_v2r8(ix0,jx1);
+             dy01             = _fjsp_sub_v2r8(iy0,jy1);
+             dz01             = _fjsp_sub_v2r8(iz0,jz1);
+             dx02             = _fjsp_sub_v2r8(ix0,jx2);
+             dy02             = _fjsp_sub_v2r8(iy0,jy2);
+             dz02             = _fjsp_sub_v2r8(iz0,jz2);
+             dx10             = _fjsp_sub_v2r8(ix1,jx0);
+             dy10             = _fjsp_sub_v2r8(iy1,jy0);
+             dz10             = _fjsp_sub_v2r8(iz1,jz0);
+             dx11             = _fjsp_sub_v2r8(ix1,jx1);
+             dy11             = _fjsp_sub_v2r8(iy1,jy1);
+             dz11             = _fjsp_sub_v2r8(iz1,jz1);
+             dx12             = _fjsp_sub_v2r8(ix1,jx2);
+             dy12             = _fjsp_sub_v2r8(iy1,jy2);
+             dz12             = _fjsp_sub_v2r8(iz1,jz2);
+             dx20             = _fjsp_sub_v2r8(ix2,jx0);
+             dy20             = _fjsp_sub_v2r8(iy2,jy0);
+             dz20             = _fjsp_sub_v2r8(iz2,jz0);
+             dx21             = _fjsp_sub_v2r8(ix2,jx1);
+             dy21             = _fjsp_sub_v2r8(iy2,jy1);
+             dz21             = _fjsp_sub_v2r8(iz2,jz1);
+             dx22             = _fjsp_sub_v2r8(ix2,jx2);
+             dy22             = _fjsp_sub_v2r8(iy2,jy2);
+             dz22             = _fjsp_sub_v2r8(iz2,jz2);
+             /* Calculate squared distance and things based on it */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rsq01            = gmx_fjsp_calc_rsq_v2r8(dx01,dy01,dz01);
+             rsq02            = gmx_fjsp_calc_rsq_v2r8(dx02,dy02,dz02);
+             rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+             rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+             rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+             rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+             rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+             rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+             rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+             rinv01           = gmx_fjsp_invsqrt_v2r8(rsq01);
+             rinv02           = gmx_fjsp_invsqrt_v2r8(rsq02);
+             rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+             rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+             rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+             rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+             rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+             rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+             rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+             rinvsq01         = _fjsp_mul_v2r8(rinv01,rinv01);
+             rinvsq02         = _fjsp_mul_v2r8(rinv02,rinv02);
+             rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+             rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+             rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+             rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+             rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+             rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+             fjx0             = _fjsp_setzero_v2r8();
+             fjy0             = _fjsp_setzero_v2r8();
+             fjz0             = _fjsp_setzero_v2r8();
+             fjx1             = _fjsp_setzero_v2r8();
+             fjy1             = _fjsp_setzero_v2r8();
+             fjz1             = _fjsp_setzero_v2r8();
+             fjx2             = _fjsp_setzero_v2r8();
+             fjy2             = _fjsp_setzero_v2r8();
+             fjz2             = _fjsp_setzero_v2r8();
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* REACTION-FIELD ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq00,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq00,rinv00),crf));
+             felec            = _fjsp_mul_v2r8(qq00,_fjsp_msub_v2r8(rinv00,rinvsq00,krf2));
+             /* LENNARD-JONES DISPERSION/REPULSION */
+             rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+             vvdw6            = _fjsp_mul_v2r8(c6_00,rinvsix);
+             vvdw12           = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
+             vvdw             = _fjsp_msub_v2r8( vvdw12,one_twelfth, _fjsp_mul_v2r8(vvdw6,one_sixth) );
+             fvdw             = _fjsp_mul_v2r8(_fjsp_sub_v2r8(vvdw12,vvdw6),rinvsq00);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+             fscal            = _fjsp_add_v2r8(felec,fvdw);
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             
+             fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* REACTION-FIELD ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq01,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq01,rinv01),crf));
+             felec            = _fjsp_mul_v2r8(qq01,_fjsp_msub_v2r8(rinv01,rinvsq01,krf2));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx01,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy01,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz01,fscal,fiz0);
+             
+             fjx1             = _fjsp_madd_v2r8(dx01,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy01,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz01,fscal,fjz1);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* REACTION-FIELD ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq02,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq02,rinv02),crf));
+             felec            = _fjsp_mul_v2r8(qq02,_fjsp_msub_v2r8(rinv02,rinvsq02,krf2));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx02,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy02,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz02,fscal,fiz0);
+             
+             fjx2             = _fjsp_madd_v2r8(dx02,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy02,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz02,fscal,fjz2);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* REACTION-FIELD ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq10,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq10,rinv10),crf));
+             felec            = _fjsp_mul_v2r8(qq10,_fjsp_msub_v2r8(rinv10,rinvsq10,krf2));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+             
+             fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* REACTION-FIELD ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq11,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq11,rinv11),crf));
+             felec            = _fjsp_mul_v2r8(qq11,_fjsp_msub_v2r8(rinv11,rinvsq11,krf2));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+             
+             fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* REACTION-FIELD ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq12,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq12,rinv12),crf));
+             felec            = _fjsp_mul_v2r8(qq12,_fjsp_msub_v2r8(rinv12,rinvsq12,krf2));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+             
+             fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* REACTION-FIELD ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq20,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq20,rinv20),crf));
+             felec            = _fjsp_mul_v2r8(qq20,_fjsp_msub_v2r8(rinv20,rinvsq20,krf2));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+             
+             fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* REACTION-FIELD ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq21,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq21,rinv21),crf));
+             felec            = _fjsp_mul_v2r8(qq21,_fjsp_msub_v2r8(rinv21,rinvsq21,krf2));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+             
+             fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* REACTION-FIELD ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq22,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq22,rinv22),crf));
+             felec            = _fjsp_mul_v2r8(qq22,_fjsp_msub_v2r8(rinv22,rinvsq22,krf2));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+             
+             fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+             gmx_fjsp_decrement_3rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+             /* Inner loop uses 327 flops */
+         }
+         if(jidx<j_index_end)
+         {
+             jnrA             = jjnr[jidx];
+             j_coord_offsetA  = DIM*jnrA;
+             /* load j atom coordinates */
+             gmx_fjsp_load_3rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+             /* Calculate displacement vector */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             dx01             = _fjsp_sub_v2r8(ix0,jx1);
+             dy01             = _fjsp_sub_v2r8(iy0,jy1);
+             dz01             = _fjsp_sub_v2r8(iz0,jz1);
+             dx02             = _fjsp_sub_v2r8(ix0,jx2);
+             dy02             = _fjsp_sub_v2r8(iy0,jy2);
+             dz02             = _fjsp_sub_v2r8(iz0,jz2);
+             dx10             = _fjsp_sub_v2r8(ix1,jx0);
+             dy10             = _fjsp_sub_v2r8(iy1,jy0);
+             dz10             = _fjsp_sub_v2r8(iz1,jz0);
+             dx11             = _fjsp_sub_v2r8(ix1,jx1);
+             dy11             = _fjsp_sub_v2r8(iy1,jy1);
+             dz11             = _fjsp_sub_v2r8(iz1,jz1);
+             dx12             = _fjsp_sub_v2r8(ix1,jx2);
+             dy12             = _fjsp_sub_v2r8(iy1,jy2);
+             dz12             = _fjsp_sub_v2r8(iz1,jz2);
+             dx20             = _fjsp_sub_v2r8(ix2,jx0);
+             dy20             = _fjsp_sub_v2r8(iy2,jy0);
+             dz20             = _fjsp_sub_v2r8(iz2,jz0);
+             dx21             = _fjsp_sub_v2r8(ix2,jx1);
+             dy21             = _fjsp_sub_v2r8(iy2,jy1);
+             dz21             = _fjsp_sub_v2r8(iz2,jz1);
+             dx22             = _fjsp_sub_v2r8(ix2,jx2);
+             dy22             = _fjsp_sub_v2r8(iy2,jy2);
+             dz22             = _fjsp_sub_v2r8(iz2,jz2);
+             /* Calculate squared distance and things based on it */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rsq01            = gmx_fjsp_calc_rsq_v2r8(dx01,dy01,dz01);
+             rsq02            = gmx_fjsp_calc_rsq_v2r8(dx02,dy02,dz02);
+             rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+             rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+             rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+             rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+             rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+             rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+             rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+             rinv01           = gmx_fjsp_invsqrt_v2r8(rsq01);
+             rinv02           = gmx_fjsp_invsqrt_v2r8(rsq02);
+             rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+             rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+             rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+             rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+             rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+             rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+             rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+             rinvsq01         = _fjsp_mul_v2r8(rinv01,rinv01);
+             rinvsq02         = _fjsp_mul_v2r8(rinv02,rinv02);
+             rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+             rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+             rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+             rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+             rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+             rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+             fjx0             = _fjsp_setzero_v2r8();
+             fjy0             = _fjsp_setzero_v2r8();
+             fjz0             = _fjsp_setzero_v2r8();
+             fjx1             = _fjsp_setzero_v2r8();
+             fjy1             = _fjsp_setzero_v2r8();
+             fjz1             = _fjsp_setzero_v2r8();
+             fjx2             = _fjsp_setzero_v2r8();
+             fjy2             = _fjsp_setzero_v2r8();
+             fjz2             = _fjsp_setzero_v2r8();
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* REACTION-FIELD ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq00,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq00,rinv00),crf));
+             felec            = _fjsp_mul_v2r8(qq00,_fjsp_msub_v2r8(rinv00,rinvsq00,krf2));
+             /* LENNARD-JONES DISPERSION/REPULSION */
+             rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+             vvdw6            = _fjsp_mul_v2r8(c6_00,rinvsix);
+             vvdw12           = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
+             vvdw             = _fjsp_msub_v2r8( vvdw12,one_twelfth, _fjsp_mul_v2r8(vvdw6,one_sixth) );
+             fvdw             = _fjsp_mul_v2r8(_fjsp_sub_v2r8(vvdw12,vvdw6),rinvsq00);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             vvdw             = _fjsp_unpacklo_v2r8(vvdw,_fjsp_setzero_v2r8());
+             vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+             fscal            = _fjsp_add_v2r8(felec,fvdw);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             
+             fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* REACTION-FIELD ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq01,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq01,rinv01),crf));
+             felec            = _fjsp_mul_v2r8(qq01,_fjsp_msub_v2r8(rinv01,rinvsq01,krf2));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx01,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy01,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz01,fscal,fiz0);
+             
+             fjx1             = _fjsp_madd_v2r8(dx01,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy01,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz01,fscal,fjz1);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* REACTION-FIELD ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq02,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq02,rinv02),crf));
+             felec            = _fjsp_mul_v2r8(qq02,_fjsp_msub_v2r8(rinv02,rinvsq02,krf2));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx02,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy02,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz02,fscal,fiz0);
+             
+             fjx2             = _fjsp_madd_v2r8(dx02,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy02,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz02,fscal,fjz2);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* REACTION-FIELD ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq10,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq10,rinv10),crf));
+             felec            = _fjsp_mul_v2r8(qq10,_fjsp_msub_v2r8(rinv10,rinvsq10,krf2));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+             
+             fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* REACTION-FIELD ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq11,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq11,rinv11),crf));
+             felec            = _fjsp_mul_v2r8(qq11,_fjsp_msub_v2r8(rinv11,rinvsq11,krf2));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+             
+             fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* REACTION-FIELD ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq12,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq12,rinv12),crf));
+             felec            = _fjsp_mul_v2r8(qq12,_fjsp_msub_v2r8(rinv12,rinvsq12,krf2));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+             
+             fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* REACTION-FIELD ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq20,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq20,rinv20),crf));
+             felec            = _fjsp_mul_v2r8(qq20,_fjsp_msub_v2r8(rinv20,rinvsq20,krf2));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+             
+             fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* REACTION-FIELD ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq21,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq21,rinv21),crf));
+             felec            = _fjsp_mul_v2r8(qq21,_fjsp_msub_v2r8(rinv21,rinvsq21,krf2));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+             
+             fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* REACTION-FIELD ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq22,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq22,rinv22),crf));
+             felec            = _fjsp_mul_v2r8(qq22,_fjsp_msub_v2r8(rinv22,rinvsq22,krf2));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+             
+             fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+             gmx_fjsp_decrement_3rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+             /* Inner loop uses 327 flops */
+         }
+         /* End of innermost loop */
+         gmx_fjsp_update_iforce_3atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                               f+i_coord_offset,fshift+i_shift_offset);
+         ggid                        = gid[iidx];
+         /* Update potential energies */
+         gmx_fjsp_update_1pot_v2r8(velecsum,kernel_data->energygrp_elec+ggid);
+         gmx_fjsp_update_1pot_v2r8(vvdwsum,kernel_data->energygrp_vdw+ggid);
+         /* Increment number of inner iterations */
+         inneriter                  += j_index_end - j_index_start;
+         /* Outer loop uses 20 flops */
+     }
+     /* Increment number of outer iterations */
+     outeriter        += nri;
+     /* Update outer/inner flops */
+     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_VF,outeriter*20 + inneriter*327);
+ }
+ /*
+  * Gromacs nonbonded kernel:   nb_kernel_ElecRF_VdwLJ_GeomW3W3_F_sparc64_hpc_ace_double
+  * Electrostatics interaction: ReactionField
+  * VdW interaction:            LennardJones
+  * Geometry:                   Water3-Water3
+  * Calculate force/pot:        Force
+  */
+ void
+ nb_kernel_ElecRF_VdwLJ_GeomW3W3_F_sparc64_hpc_ace_double
+                     (t_nblist * gmx_restrict                nlist,
+                      rvec * gmx_restrict                    xx,
+                      rvec * gmx_restrict                    ff,
+                      t_forcerec * gmx_restrict              fr,
+                      t_mdatoms * gmx_restrict               mdatoms,
+                      nb_kernel_data_t * gmx_restrict        kernel_data,
+                      t_nrnb * gmx_restrict                  nrnb)
+ {
+     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+      * just 0 for non-waters.
+      * Suffixes A,B refer to j loop unrolling done with double precision SIMD, i.e. the two different
+      * jnr indices corresponding to data put in the two positions in the SIMD register.
+      */
+     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+     int              jnrA,jnrB;
+     int              j_coord_offsetA,j_coord_offsetB;
+     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+     real             rcutoff_scalar;
+     real             *shiftvec,*fshift,*x,*f;
+     _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+     int              vdwioffset0;
+     _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+     int              vdwioffset1;
+     _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+     int              vdwioffset2;
+     _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+     int              vdwjidx0A,vdwjidx0B;
+     _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+     int              vdwjidx1A,vdwjidx1B;
+     _fjsp_v2r8       jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+     int              vdwjidx2A,vdwjidx2B;
+     _fjsp_v2r8       jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+     _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+     _fjsp_v2r8       dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
+     _fjsp_v2r8       dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
+     _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+     _fjsp_v2r8       dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+     _fjsp_v2r8       dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+     _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+     _fjsp_v2r8       dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+     _fjsp_v2r8       dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+     _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+     real             *charge;
+     int              nvdwtype;
+     _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+     int              *vdwtype;
+     real             *vdwparam;
+     _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+     _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+     _fjsp_v2r8       itab_tmp;
+     _fjsp_v2r8       dummy_mask,cutoff_mask;
+     _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+     _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+     union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+     x                = xx[0];
+     f                = ff[0];
+     nri              = nlist->nri;
+     iinr             = nlist->iinr;
+     jindex           = nlist->jindex;
+     jjnr             = nlist->jjnr;
+     shiftidx         = nlist->shift;
+     gid              = nlist->gid;
+     shiftvec         = fr->shift_vec[0];
+     fshift           = fr->fshift[0];
+     facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+     charge           = mdatoms->chargeA;
+     krf              = gmx_fjsp_set1_v2r8(fr->ic->k_rf);
+     krf2             = gmx_fjsp_set1_v2r8(fr->ic->k_rf*2.0);
+     crf              = gmx_fjsp_set1_v2r8(fr->ic->c_rf);
+     nvdwtype         = fr->ntype;
+     vdwparam         = fr->nbfp;
+     vdwtype          = mdatoms->typeA;
+     /* Setup water-specific parameters */
+     inr              = nlist->iinr[0];
+     iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+0]));
+     iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+     iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+     vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+     jq0              = gmx_fjsp_set1_v2r8(charge[inr+0]);
+     jq1              = gmx_fjsp_set1_v2r8(charge[inr+1]);
+     jq2              = gmx_fjsp_set1_v2r8(charge[inr+2]);
+     vdwjidx0A        = 2*vdwtype[inr+0];
+     qq00             = _fjsp_mul_v2r8(iq0,jq0);
+     c6_00            = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset0+vdwjidx0A]);
+     c12_00           = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset0+vdwjidx0A+1]);
+     qq01             = _fjsp_mul_v2r8(iq0,jq1);
+     qq02             = _fjsp_mul_v2r8(iq0,jq2);
+     qq10             = _fjsp_mul_v2r8(iq1,jq0);
+     qq11             = _fjsp_mul_v2r8(iq1,jq1);
+     qq12             = _fjsp_mul_v2r8(iq1,jq2);
+     qq20             = _fjsp_mul_v2r8(iq2,jq0);
+     qq21             = _fjsp_mul_v2r8(iq2,jq1);
+     qq22             = _fjsp_mul_v2r8(iq2,jq2);
+     /* Avoid stupid compiler warnings */
+     jnrA = jnrB = 0;
+     j_coord_offsetA = 0;
+     j_coord_offsetB = 0;
+     outeriter        = 0;
+     inneriter        = 0;
+     /* Start outer loop over neighborlists */
+     for(iidx=0; iidx<nri; iidx++)
+     {
+         /* Load shift vector for this list */
+         i_shift_offset   = DIM*shiftidx[iidx];
+         /* Load limits for loop over neighbors */
+         j_index_start    = jindex[iidx];
+         j_index_end      = jindex[iidx+1];
+         /* Get outer coordinate index */
+         inr              = iinr[iidx];
+         i_coord_offset   = DIM*inr;
+         /* Load i particle coords and add shift vector */
+         gmx_fjsp_load_shift_and_3rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                  &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+         fix0             = _fjsp_setzero_v2r8();
+         fiy0             = _fjsp_setzero_v2r8();
+         fiz0             = _fjsp_setzero_v2r8();
+         fix1             = _fjsp_setzero_v2r8();
+         fiy1             = _fjsp_setzero_v2r8();
+         fiz1             = _fjsp_setzero_v2r8();
+         fix2             = _fjsp_setzero_v2r8();
+         fiy2             = _fjsp_setzero_v2r8();
+         fiz2             = _fjsp_setzero_v2r8();
+         /* Start inner kernel loop */
+         for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+         {
+             /* Get j neighbor index, and coordinate index */
+             jnrA             = jjnr[jidx];
+             jnrB             = jjnr[jidx+1];
+             j_coord_offsetA  = DIM*jnrA;
+             j_coord_offsetB  = DIM*jnrB;
+             /* load j atom coordinates */
+             gmx_fjsp_load_3rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+             /* Calculate displacement vector */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             dx01             = _fjsp_sub_v2r8(ix0,jx1);
+             dy01             = _fjsp_sub_v2r8(iy0,jy1);
+             dz01             = _fjsp_sub_v2r8(iz0,jz1);
+             dx02             = _fjsp_sub_v2r8(ix0,jx2);
+             dy02             = _fjsp_sub_v2r8(iy0,jy2);
+             dz02             = _fjsp_sub_v2r8(iz0,jz2);
+             dx10             = _fjsp_sub_v2r8(ix1,jx0);
+             dy10             = _fjsp_sub_v2r8(iy1,jy0);
+             dz10             = _fjsp_sub_v2r8(iz1,jz0);
+             dx11             = _fjsp_sub_v2r8(ix1,jx1);
+             dy11             = _fjsp_sub_v2r8(iy1,jy1);
+             dz11             = _fjsp_sub_v2r8(iz1,jz1);
+             dx12             = _fjsp_sub_v2r8(ix1,jx2);
+             dy12             = _fjsp_sub_v2r8(iy1,jy2);
+             dz12             = _fjsp_sub_v2r8(iz1,jz2);
+             dx20             = _fjsp_sub_v2r8(ix2,jx0);
+             dy20             = _fjsp_sub_v2r8(iy2,jy0);
+             dz20             = _fjsp_sub_v2r8(iz2,jz0);
+             dx21             = _fjsp_sub_v2r8(ix2,jx1);
+             dy21             = _fjsp_sub_v2r8(iy2,jy1);
+             dz21             = _fjsp_sub_v2r8(iz2,jz1);
+             dx22             = _fjsp_sub_v2r8(ix2,jx2);
+             dy22             = _fjsp_sub_v2r8(iy2,jy2);
+             dz22             = _fjsp_sub_v2r8(iz2,jz2);
+             /* Calculate squared distance and things based on it */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rsq01            = gmx_fjsp_calc_rsq_v2r8(dx01,dy01,dz01);
+             rsq02            = gmx_fjsp_calc_rsq_v2r8(dx02,dy02,dz02);
+             rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+             rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+             rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+             rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+             rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+             rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+             rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+             rinv01           = gmx_fjsp_invsqrt_v2r8(rsq01);
+             rinv02           = gmx_fjsp_invsqrt_v2r8(rsq02);
+             rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+             rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+             rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+             rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+             rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+             rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+             rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+             rinvsq01         = _fjsp_mul_v2r8(rinv01,rinv01);
+             rinvsq02         = _fjsp_mul_v2r8(rinv02,rinv02);
+             rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+             rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+             rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+             rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+             rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+             rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+             fjx0             = _fjsp_setzero_v2r8();
+             fjy0             = _fjsp_setzero_v2r8();
+             fjz0             = _fjsp_setzero_v2r8();
+             fjx1             = _fjsp_setzero_v2r8();
+             fjy1             = _fjsp_setzero_v2r8();
+             fjz1             = _fjsp_setzero_v2r8();
+             fjx2             = _fjsp_setzero_v2r8();
+             fjy2             = _fjsp_setzero_v2r8();
+             fjz2             = _fjsp_setzero_v2r8();
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* REACTION-FIELD ELECTROSTATICS */
+             felec            = _fjsp_mul_v2r8(qq00,_fjsp_msub_v2r8(rinv00,rinvsq00,krf2));
+             /* LENNARD-JONES DISPERSION/REPULSION */
+             rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+             fvdw             = _fjsp_mul_v2r8(_fjsp_msub_v2r8(c12_00,rinvsix,c6_00),_fjsp_mul_v2r8(rinvsix,rinvsq00));
+             fscal            = _fjsp_add_v2r8(felec,fvdw);
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             
+             fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* REACTION-FIELD ELECTROSTATICS */
+             felec            = _fjsp_mul_v2r8(qq01,_fjsp_msub_v2r8(rinv01,rinvsq01,krf2));
+             fscal            = felec;
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx01,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy01,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz01,fscal,fiz0);
+             
+             fjx1             = _fjsp_madd_v2r8(dx01,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy01,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz01,fscal,fjz1);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* REACTION-FIELD ELECTROSTATICS */
+             felec            = _fjsp_mul_v2r8(qq02,_fjsp_msub_v2r8(rinv02,rinvsq02,krf2));
+             fscal            = felec;
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx02,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy02,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz02,fscal,fiz0);
+             
+             fjx2             = _fjsp_madd_v2r8(dx02,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy02,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz02,fscal,fjz2);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* REACTION-FIELD ELECTROSTATICS */
+             felec            = _fjsp_mul_v2r8(qq10,_fjsp_msub_v2r8(rinv10,rinvsq10,krf2));
+             fscal            = felec;
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+             
+             fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* REACTION-FIELD ELECTROSTATICS */
+             felec            = _fjsp_mul_v2r8(qq11,_fjsp_msub_v2r8(rinv11,rinvsq11,krf2));
+             fscal            = felec;
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+             
+             fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* REACTION-FIELD ELECTROSTATICS */
+             felec            = _fjsp_mul_v2r8(qq12,_fjsp_msub_v2r8(rinv12,rinvsq12,krf2));
+             fscal            = felec;
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+             
+             fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* REACTION-FIELD ELECTROSTATICS */
+             felec            = _fjsp_mul_v2r8(qq20,_fjsp_msub_v2r8(rinv20,rinvsq20,krf2));
+             fscal            = felec;
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+             
+             fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* REACTION-FIELD ELECTROSTATICS */
+             felec            = _fjsp_mul_v2r8(qq21,_fjsp_msub_v2r8(rinv21,rinvsq21,krf2));
+             fscal            = felec;
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+             
+             fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* REACTION-FIELD ELECTROSTATICS */
+             felec            = _fjsp_mul_v2r8(qq22,_fjsp_msub_v2r8(rinv22,rinvsq22,krf2));
+             fscal            = felec;
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+             
+             fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+             gmx_fjsp_decrement_3rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+             /* Inner loop uses 277 flops */
+         }
+         if(jidx<j_index_end)
+         {
+             jnrA             = jjnr[jidx];
+             j_coord_offsetA  = DIM*jnrA;
+             /* load j atom coordinates */
+             gmx_fjsp_load_3rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+             /* Calculate displacement vector */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             dx01             = _fjsp_sub_v2r8(ix0,jx1);
+             dy01             = _fjsp_sub_v2r8(iy0,jy1);
+             dz01             = _fjsp_sub_v2r8(iz0,jz1);
+             dx02             = _fjsp_sub_v2r8(ix0,jx2);
+             dy02             = _fjsp_sub_v2r8(iy0,jy2);
+             dz02             = _fjsp_sub_v2r8(iz0,jz2);
+             dx10             = _fjsp_sub_v2r8(ix1,jx0);
+             dy10             = _fjsp_sub_v2r8(iy1,jy0);
+             dz10             = _fjsp_sub_v2r8(iz1,jz0);
+             dx11             = _fjsp_sub_v2r8(ix1,jx1);
+             dy11             = _fjsp_sub_v2r8(iy1,jy1);
+             dz11             = _fjsp_sub_v2r8(iz1,jz1);
+             dx12             = _fjsp_sub_v2r8(ix1,jx2);
+             dy12             = _fjsp_sub_v2r8(iy1,jy2);
+             dz12             = _fjsp_sub_v2r8(iz1,jz2);
+             dx20             = _fjsp_sub_v2r8(ix2,jx0);
+             dy20             = _fjsp_sub_v2r8(iy2,jy0);
+             dz20             = _fjsp_sub_v2r8(iz2,jz0);
+             dx21             = _fjsp_sub_v2r8(ix2,jx1);
+             dy21             = _fjsp_sub_v2r8(iy2,jy1);
+             dz21             = _fjsp_sub_v2r8(iz2,jz1);
+             dx22             = _fjsp_sub_v2r8(ix2,jx2);
+             dy22             = _fjsp_sub_v2r8(iy2,jy2);
+             dz22             = _fjsp_sub_v2r8(iz2,jz2);
+             /* Calculate squared distance and things based on it */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rsq01            = gmx_fjsp_calc_rsq_v2r8(dx01,dy01,dz01);
+             rsq02            = gmx_fjsp_calc_rsq_v2r8(dx02,dy02,dz02);
+             rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+             rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+             rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+             rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+             rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+             rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+             rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+             rinv01           = gmx_fjsp_invsqrt_v2r8(rsq01);
+             rinv02           = gmx_fjsp_invsqrt_v2r8(rsq02);
+             rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+             rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+             rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+             rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+             rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+             rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+             rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+             rinvsq01         = _fjsp_mul_v2r8(rinv01,rinv01);
+             rinvsq02         = _fjsp_mul_v2r8(rinv02,rinv02);
+             rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+             rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+             rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+             rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+             rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+             rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+             fjx0             = _fjsp_setzero_v2r8();
+             fjy0             = _fjsp_setzero_v2r8();
+             fjz0             = _fjsp_setzero_v2r8();
+             fjx1             = _fjsp_setzero_v2r8();
+             fjy1             = _fjsp_setzero_v2r8();
+             fjz1             = _fjsp_setzero_v2r8();
+             fjx2             = _fjsp_setzero_v2r8();
+             fjy2             = _fjsp_setzero_v2r8();
+             fjz2             = _fjsp_setzero_v2r8();
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* REACTION-FIELD ELECTROSTATICS */
+             felec            = _fjsp_mul_v2r8(qq00,_fjsp_msub_v2r8(rinv00,rinvsq00,krf2));
+             /* LENNARD-JONES DISPERSION/REPULSION */
+             rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+             fvdw             = _fjsp_mul_v2r8(_fjsp_msub_v2r8(c12_00,rinvsix,c6_00),_fjsp_mul_v2r8(rinvsix,rinvsq00));
+             fscal            = _fjsp_add_v2r8(felec,fvdw);
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             
+             fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* REACTION-FIELD ELECTROSTATICS */
+             felec            = _fjsp_mul_v2r8(qq01,_fjsp_msub_v2r8(rinv01,rinvsq01,krf2));
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx01,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy01,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz01,fscal,fiz0);
+             
+             fjx1             = _fjsp_madd_v2r8(dx01,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy01,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz01,fscal,fjz1);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* REACTION-FIELD ELECTROSTATICS */
+             felec            = _fjsp_mul_v2r8(qq02,_fjsp_msub_v2r8(rinv02,rinvsq02,krf2));
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx02,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy02,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz02,fscal,fiz0);
+             
+             fjx2             = _fjsp_madd_v2r8(dx02,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy02,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz02,fscal,fjz2);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* REACTION-FIELD ELECTROSTATICS */
+             felec            = _fjsp_mul_v2r8(qq10,_fjsp_msub_v2r8(rinv10,rinvsq10,krf2));
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+             
+             fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* REACTION-FIELD ELECTROSTATICS */
+             felec            = _fjsp_mul_v2r8(qq11,_fjsp_msub_v2r8(rinv11,rinvsq11,krf2));
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+             
+             fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* REACTION-FIELD ELECTROSTATICS */
+             felec            = _fjsp_mul_v2r8(qq12,_fjsp_msub_v2r8(rinv12,rinvsq12,krf2));
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+             
+             fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* REACTION-FIELD ELECTROSTATICS */
+             felec            = _fjsp_mul_v2r8(qq20,_fjsp_msub_v2r8(rinv20,rinvsq20,krf2));
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+             
+             fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* REACTION-FIELD ELECTROSTATICS */
+             felec            = _fjsp_mul_v2r8(qq21,_fjsp_msub_v2r8(rinv21,rinvsq21,krf2));
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+             
+             fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* REACTION-FIELD ELECTROSTATICS */
+             felec            = _fjsp_mul_v2r8(qq22,_fjsp_msub_v2r8(rinv22,rinvsq22,krf2));
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+             
+             fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+             gmx_fjsp_decrement_3rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+             /* Inner loop uses 277 flops */
+         }
+         /* End of innermost loop */
+         gmx_fjsp_update_iforce_3atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                               f+i_coord_offset,fshift+i_shift_offset);
+         /* Increment number of inner iterations */
+         inneriter                  += j_index_end - j_index_start;
+         /* Outer loop uses 18 flops */
+     }
+     /* Increment number of outer iterations */
+     outeriter        += nri;
+     /* Update outer/inner flops */
+     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_F,outeriter*18 + inneriter*277);
+ }
index 0000000000000000000000000000000000000000,45cb1007832cb929bd583418d965835d72071e12..45cb1007832cb929bd583418d965835d72071e12
mode 000000,100644..100644
--- /dev/null
@@@ -1,0 -1,963 +1,963 @@@
+ /*
+  * This file is part of the GROMACS molecular simulation package.
+  *
+  * Copyright (c) 2012, by the GROMACS development team, led by
+  * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+  * others, as listed in the AUTHORS file in the top-level source
+  * directory and at http://www.gromacs.org.
+  *
+  * GROMACS is free software; you can redistribute it and/or
+  * modify it under the terms of the GNU Lesser General Public License
+  * as published by the Free Software Foundation; either version 2.1
+  * of the License, or (at your option) any later version.
+  *
+  * GROMACS is distributed in the hope that it will be useful,
+  * but WITHOUT ANY WARRANTY; without even the implied warranty of
+  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  * Lesser General Public License for more details.
+  *
+  * You should have received a copy of the GNU Lesser General Public
+  * License along with GROMACS; if not, see
+  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+  *
+  * If you want to redistribute modifications to GROMACS, please
+  * consider that scientific software is very special. Version
+  * control is crucial - bugs must be traceable. We will be happy to
+  * consider code for inclusion in the official distribution, but
+  * derived work must not be called official GROMACS. Details are found
+  * in the README & COPYING files - if they are missing, get the
+  * official version at http://www.gromacs.org.
+  *
+  * To help us fund GROMACS development, we humbly ask that you cite
+  * the research papers on the package. Check out http://www.gromacs.org.
+  */
+ /*
+  * Note: this file was generated by the GROMACS sparc64_hpc_ace_double kernel generator.
+  */
+ #ifdef HAVE_CONFIG_H
+ #include <config.h>
+ #endif
+ #include <math.h>
+ #include "../nb_kernel.h"
+ #include "types/simple.h"
+ #include "vec.h"
+ #include "nrnb.h"
+ #include "kernelutil_sparc64_hpc_ace_double.h"
+ /*
+  * Gromacs nonbonded kernel:   nb_kernel_ElecRF_VdwLJ_GeomW4P1_VF_sparc64_hpc_ace_double
+  * Electrostatics interaction: ReactionField
+  * VdW interaction:            LennardJones
+  * Geometry:                   Water4-Particle
+  * Calculate force/pot:        PotentialAndForce
+  */
+ void
+ nb_kernel_ElecRF_VdwLJ_GeomW4P1_VF_sparc64_hpc_ace_double
+                     (t_nblist * gmx_restrict                nlist,
+                      rvec * gmx_restrict                    xx,
+                      rvec * gmx_restrict                    ff,
+                      t_forcerec * gmx_restrict              fr,
+                      t_mdatoms * gmx_restrict               mdatoms,
+                      nb_kernel_data_t * gmx_restrict        kernel_data,
+                      t_nrnb * gmx_restrict                  nrnb)
+ {
+     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+      * just 0 for non-waters.
+      * Suffixes A,B refer to j loop unrolling done with double precision SIMD, i.e. for the two different
+      * jnr indices corresponding to data put in the two positions in the SIMD register.
+      */
+     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+     int              jnrA,jnrB;
+     int              j_coord_offsetA,j_coord_offsetB;
+     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+     real             rcutoff_scalar;
+     real             *shiftvec,*fshift,*x,*f;
+     _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+     int              vdwioffset0;
+     _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+     int              vdwioffset1;
+     _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+     int              vdwioffset2;
+     _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+     int              vdwioffset3;
+     _fjsp_v2r8       ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+     int              vdwjidx0A,vdwjidx0B;
+     _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+     _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+     _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+     _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+     _fjsp_v2r8       dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30;
+     _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+     real             *charge;
+     int              nvdwtype;
+     _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+     int              *vdwtype;
+     real             *vdwparam;
+     _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+     _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+     _fjsp_v2r8       itab_tmp;
+     _fjsp_v2r8       dummy_mask,cutoff_mask;
+     _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+     _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+     union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+     x                = xx[0];
+     f                = ff[0];
+     nri              = nlist->nri;
+     iinr             = nlist->iinr;
+     jindex           = nlist->jindex;
+     jjnr             = nlist->jjnr;
+     shiftidx         = nlist->shift;
+     gid              = nlist->gid;
+     shiftvec         = fr->shift_vec[0];
+     fshift           = fr->fshift[0];
+     facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+     charge           = mdatoms->chargeA;
+     krf              = gmx_fjsp_set1_v2r8(fr->ic->k_rf);
+     krf2             = gmx_fjsp_set1_v2r8(fr->ic->k_rf*2.0);
+     crf              = gmx_fjsp_set1_v2r8(fr->ic->c_rf);
+     nvdwtype         = fr->ntype;
+     vdwparam         = fr->nbfp;
+     vdwtype          = mdatoms->typeA;
+     /* Setup water-specific parameters */
+     inr              = nlist->iinr[0];
+     iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+     iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+     iq3              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+3]));
+     vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+     /* Avoid stupid compiler warnings */
+     jnrA = jnrB = 0;
+     j_coord_offsetA = 0;
+     j_coord_offsetB = 0;
+     outeriter        = 0;
+     inneriter        = 0;
+     /* Start outer loop over neighborlists */
+     for(iidx=0; iidx<nri; iidx++)
+     {
+         /* Load shift vector for this list */
+         i_shift_offset   = DIM*shiftidx[iidx];
+         /* Load limits for loop over neighbors */
+         j_index_start    = jindex[iidx];
+         j_index_end      = jindex[iidx+1];
+         /* Get outer coordinate index */
+         inr              = iinr[iidx];
+         i_coord_offset   = DIM*inr;
+         /* Load i particle coords and add shift vector */
+         gmx_fjsp_load_shift_and_4rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                  &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+         fix0             = _fjsp_setzero_v2r8();
+         fiy0             = _fjsp_setzero_v2r8();
+         fiz0             = _fjsp_setzero_v2r8();
+         fix1             = _fjsp_setzero_v2r8();
+         fiy1             = _fjsp_setzero_v2r8();
+         fiz1             = _fjsp_setzero_v2r8();
+         fix2             = _fjsp_setzero_v2r8();
+         fiy2             = _fjsp_setzero_v2r8();
+         fiz2             = _fjsp_setzero_v2r8();
+         fix3             = _fjsp_setzero_v2r8();
+         fiy3             = _fjsp_setzero_v2r8();
+         fiz3             = _fjsp_setzero_v2r8();
+         /* Reset potential sums */
+         velecsum         = _fjsp_setzero_v2r8();
+         vvdwsum          = _fjsp_setzero_v2r8();
+         /* Start inner kernel loop */
+         for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+         {
+             /* Get j neighbor index, and coordinate index */
+             jnrA             = jjnr[jidx];
+             jnrB             = jjnr[jidx+1];
+             j_coord_offsetA  = DIM*jnrA;
+             j_coord_offsetB  = DIM*jnrB;
+             /* load j atom coordinates */
+             gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                               &jx0,&jy0,&jz0);
+             /* Calculate displacement vector */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             dx10             = _fjsp_sub_v2r8(ix1,jx0);
+             dy10             = _fjsp_sub_v2r8(iy1,jy0);
+             dz10             = _fjsp_sub_v2r8(iz1,jz0);
+             dx20             = _fjsp_sub_v2r8(ix2,jx0);
+             dy20             = _fjsp_sub_v2r8(iy2,jy0);
+             dz20             = _fjsp_sub_v2r8(iz2,jz0);
+             dx30             = _fjsp_sub_v2r8(ix3,jx0);
+             dy30             = _fjsp_sub_v2r8(iy3,jy0);
+             dz30             = _fjsp_sub_v2r8(iz3,jz0);
+             /* Calculate squared distance and things based on it */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+             rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+             rsq30            = gmx_fjsp_calc_rsq_v2r8(dx30,dy30,dz30);
+             rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+             rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+             rinv30           = gmx_fjsp_invsqrt_v2r8(rsq30);
+             rinvsq00         = gmx_fjsp_inv_v2r8(rsq00);
+             rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+             rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+             rinvsq30         = _fjsp_mul_v2r8(rinv30,rinv30);
+             /* Load parameters for j particles */
+             jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+             vdwjidx0A        = 2*vdwtype[jnrA+0];
+             vdwjidx0B        = 2*vdwtype[jnrB+0];
+             fjx0             = _fjsp_setzero_v2r8();
+             fjy0             = _fjsp_setzero_v2r8();
+             fjz0             = _fjsp_setzero_v2r8();
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* Compute parameters for interactions between i and j atoms */
+             gmx_fjsp_load_2pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,
+                                          vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
+             /* LENNARD-JONES DISPERSION/REPULSION */
+             rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+             vvdw6            = _fjsp_mul_v2r8(c6_00,rinvsix);
+             vvdw12           = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
+             vvdw             = _fjsp_msub_v2r8( vvdw12,one_twelfth, _fjsp_mul_v2r8(vvdw6,one_sixth) );
+             fvdw             = _fjsp_mul_v2r8(_fjsp_sub_v2r8(vvdw12,vvdw6),rinvsq00);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+             fscal            = fvdw;
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             
+             fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* Compute parameters for interactions between i and j atoms */
+             qq10             = _fjsp_mul_v2r8(iq1,jq0);
+             /* REACTION-FIELD ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq10,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq10,rinv10),crf));
+             felec            = _fjsp_mul_v2r8(qq10,_fjsp_msub_v2r8(rinv10,rinvsq10,krf2));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+             
+             fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* Compute parameters for interactions between i and j atoms */
+             qq20             = _fjsp_mul_v2r8(iq2,jq0);
+             /* REACTION-FIELD ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq20,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq20,rinv20),crf));
+             felec            = _fjsp_mul_v2r8(qq20,_fjsp_msub_v2r8(rinv20,rinvsq20,krf2));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+             
+             fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* Compute parameters for interactions between i and j atoms */
+             qq30             = _fjsp_mul_v2r8(iq3,jq0);
+             /* REACTION-FIELD ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq30,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq30,rinv30),crf));
+             felec            = _fjsp_mul_v2r8(qq30,_fjsp_msub_v2r8(rinv30,rinvsq30,krf2));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             /* Update vectorial force */
+             fix3             = _fjsp_madd_v2r8(dx30,fscal,fix3);
+             fiy3             = _fjsp_madd_v2r8(dy30,fscal,fiy3);
+             fiz3             = _fjsp_madd_v2r8(dz30,fscal,fiz3);
+             
+             fjx0             = _fjsp_madd_v2r8(dx30,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy30,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz30,fscal,fjz0);
+             gmx_fjsp_decrement_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0);
+             /* Inner loop uses 143 flops */
+         }
+         if(jidx<j_index_end)
+         {
+             jnrA             = jjnr[jidx];
+             j_coord_offsetA  = DIM*jnrA;
+             /* load j atom coordinates */
+             gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                               &jx0,&jy0,&jz0);
+             /* Calculate displacement vector */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             dx10             = _fjsp_sub_v2r8(ix1,jx0);
+             dy10             = _fjsp_sub_v2r8(iy1,jy0);
+             dz10             = _fjsp_sub_v2r8(iz1,jz0);
+             dx20             = _fjsp_sub_v2r8(ix2,jx0);
+             dy20             = _fjsp_sub_v2r8(iy2,jy0);
+             dz20             = _fjsp_sub_v2r8(iz2,jz0);
+             dx30             = _fjsp_sub_v2r8(ix3,jx0);
+             dy30             = _fjsp_sub_v2r8(iy3,jy0);
+             dz30             = _fjsp_sub_v2r8(iz3,jz0);
+             /* Calculate squared distance and things based on it */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+             rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+             rsq30            = gmx_fjsp_calc_rsq_v2r8(dx30,dy30,dz30);
+             rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+             rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+             rinv30           = gmx_fjsp_invsqrt_v2r8(rsq30);
+             rinvsq00         = gmx_fjsp_inv_v2r8(rsq00);
+             rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+             rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+             rinvsq30         = _fjsp_mul_v2r8(rinv30,rinv30);
+             /* Load parameters for j particles */
+             jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
+             vdwjidx0A        = 2*vdwtype[jnrA+0];
+             fjx0             = _fjsp_setzero_v2r8();
+             fjy0             = _fjsp_setzero_v2r8();
+             fjz0             = _fjsp_setzero_v2r8();
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* Compute parameters for interactions between i and j atoms */
+             gmx_fjsp_load_1pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
+             /* LENNARD-JONES DISPERSION/REPULSION */
+             rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+             vvdw6            = _fjsp_mul_v2r8(c6_00,rinvsix);
+             vvdw12           = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
+             vvdw             = _fjsp_msub_v2r8( vvdw12,one_twelfth, _fjsp_mul_v2r8(vvdw6,one_sixth) );
+             fvdw             = _fjsp_mul_v2r8(_fjsp_sub_v2r8(vvdw12,vvdw6),rinvsq00);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             vvdw             = _fjsp_unpacklo_v2r8(vvdw,_fjsp_setzero_v2r8());
+             vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+             fscal            = fvdw;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             
+             fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* Compute parameters for interactions between i and j atoms */
+             qq10             = _fjsp_mul_v2r8(iq1,jq0);
+             /* REACTION-FIELD ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq10,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq10,rinv10),crf));
+             felec            = _fjsp_mul_v2r8(qq10,_fjsp_msub_v2r8(rinv10,rinvsq10,krf2));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+             
+             fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* Compute parameters for interactions between i and j atoms */
+             qq20             = _fjsp_mul_v2r8(iq2,jq0);
+             /* REACTION-FIELD ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq20,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq20,rinv20),crf));
+             felec            = _fjsp_mul_v2r8(qq20,_fjsp_msub_v2r8(rinv20,rinvsq20,krf2));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+             
+             fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* Compute parameters for interactions between i and j atoms */
+             qq30             = _fjsp_mul_v2r8(iq3,jq0);
+             /* REACTION-FIELD ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq30,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq30,rinv30),crf));
+             felec            = _fjsp_mul_v2r8(qq30,_fjsp_msub_v2r8(rinv30,rinvsq30,krf2));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix3             = _fjsp_madd_v2r8(dx30,fscal,fix3);
+             fiy3             = _fjsp_madd_v2r8(dy30,fscal,fiy3);
+             fiz3             = _fjsp_madd_v2r8(dz30,fscal,fiz3);
+             
+             fjx0             = _fjsp_madd_v2r8(dx30,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy30,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz30,fscal,fjz0);
+             gmx_fjsp_decrement_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0);
+             /* Inner loop uses 143 flops */
+         }
+         /* End of innermost loop */
+         gmx_fjsp_update_iforce_4atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                               f+i_coord_offset,fshift+i_shift_offset);
+         ggid                        = gid[iidx];
+         /* Update potential energies */
+         gmx_fjsp_update_1pot_v2r8(velecsum,kernel_data->energygrp_elec+ggid);
+         gmx_fjsp_update_1pot_v2r8(vvdwsum,kernel_data->energygrp_vdw+ggid);
+         /* Increment number of inner iterations */
+         inneriter                  += j_index_end - j_index_start;
+         /* Outer loop uses 26 flops */
+     }
+     /* Increment number of outer iterations */
+     outeriter        += nri;
+     /* Update outer/inner flops */
+     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_VF,outeriter*26 + inneriter*143);
+ }
+ /*
+  * Gromacs nonbonded kernel:   nb_kernel_ElecRF_VdwLJ_GeomW4P1_F_sparc64_hpc_ace_double
+  * Electrostatics interaction: ReactionField
+  * VdW interaction:            LennardJones
+  * Geometry:                   Water4-Particle
+  * Calculate force/pot:        Force
+  *
+  * Overview of what the code below does:
+  *  - For every i entry of the neighbor list, four i sites (suffixes 0-3) are
+  *    loaded together with the shift vector of that entry.
+  *  - i site 0 interacts with each j particle through the Lennard-Jones term
+  *    only; i sites 1-3 interact through the reaction-field term only.
+  *  - j particles are processed two at a time (A,B); a leftover single j
+  *    entry is handled by a separate tail block after the inner loop.
+  *  - Only forces are accumulated: no potential sums are formed, and the
+  *    kernel_data argument is not referenced in this variant.
+  *  - Forces are written through ff, shift forces through fr->fshift, and the
+  *    flop estimate is reported through nrnb.
+  */
+ void
+ nb_kernel_ElecRF_VdwLJ_GeomW4P1_F_sparc64_hpc_ace_double
+                     (t_nblist * gmx_restrict                nlist,
+                      rvec * gmx_restrict                    xx,
+                      rvec * gmx_restrict                    ff,
+                      t_forcerec * gmx_restrict              fr,
+                      t_mdatoms * gmx_restrict               mdatoms,
+                      nb_kernel_data_t * gmx_restrict        kernel_data,
+                      t_nrnb * gmx_restrict                  nrnb)
+ {
+     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+      * just 0 for non-waters.
+      * Suffixes A,B refer to j loop unrolling done with double precision SIMD, i.e. the two
+      * different jnr indices (jnrA,jnrB) whose data is handled together in one inner-loop
+      * iteration.
+      * Note: not every local declared below is referenced by this particular kernel variant
+      * (for example itab_tmp, dummy_mask and ggid are never used in this function).
+      */
+     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+     int              jnrA,jnrB;
+     int              j_coord_offsetA,j_coord_offsetB;
+     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+     real             rcutoff_scalar;
+     real             *shiftvec,*fshift,*x,*f;
+     _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+     int              vdwioffset0;
+     _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+     int              vdwioffset1;
+     _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+     int              vdwioffset2;
+     _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+     int              vdwioffset3;
+     _fjsp_v2r8       ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+     int              vdwjidx0A,vdwjidx0B;
+     _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+     _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+     _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+     _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+     _fjsp_v2r8       dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30;
+     _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+     real             *charge;
+     int              nvdwtype;
+     _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+     int              *vdwtype;
+     real             *vdwparam;
+     _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+     _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+     _fjsp_v2r8       itab_tmp;
+     _fjsp_v2r8       dummy_mask,cutoff_mask;
+     _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+     _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+     union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+     x                = xx[0];
+     f                = ff[0];
+     nri              = nlist->nri;
+     iinr             = nlist->iinr;
+     jindex           = nlist->jindex;
+     jjnr             = nlist->jjnr;
+     shiftidx         = nlist->shift;
+     gid              = nlist->gid;
+     shiftvec         = fr->shift_vec[0];
+     fshift           = fr->fshift[0];
+     facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+     charge           = mdatoms->chargeA;
+     krf              = gmx_fjsp_set1_v2r8(fr->ic->k_rf);
+     krf2             = gmx_fjsp_set1_v2r8(fr->ic->k_rf*2.0);
+     crf              = gmx_fjsp_set1_v2r8(fr->ic->c_rf); /* loaded but not referenced again in this force-only kernel */
+     nvdwtype         = fr->ntype;
+     vdwparam         = fr->nbfp;
+     vdwtype          = mdatoms->typeA;
+     /* Setup water-specific parameters.
+      * These are read once, from the first i entry of the list (iinr[0]), and then
+      * reused unchanged for every i entry in the outer loop; this presumably relies
+      * on all i waters in the list sharing charges and VdW type (confirm with caller).
+      * Only sites 1-3 get a (facel-scaled) charge and only site 0 gets a VdW offset.
+      */
+     inr              = nlist->iinr[0];
+     iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+     iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+     iq3              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+3]));
+     vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+     /* Avoid stupid compiler warnings */
+     jnrA = jnrB = 0;
+     j_coord_offsetA = 0;
+     j_coord_offsetB = 0;
+     outeriter        = 0;
+     inneriter        = 0;
+     /* Start outer loop over neighborlists */
+     for(iidx=0; iidx<nri; iidx++)
+     {
+         /* Load shift vector for this list */
+         i_shift_offset   = DIM*shiftidx[iidx];
+         /* Load limits for loop over neighbors */
+         j_index_start    = jindex[iidx];
+         j_index_end      = jindex[iidx+1];
+         /* Get outer coordinate index */
+         inr              = iinr[iidx];
+         i_coord_offset   = DIM*inr;
+         /* Load i particle coords and add shift vector */
+         gmx_fjsp_load_shift_and_4rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                  &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+         fix0             = _fjsp_setzero_v2r8();
+         fiy0             = _fjsp_setzero_v2r8();
+         fiz0             = _fjsp_setzero_v2r8();
+         fix1             = _fjsp_setzero_v2r8();
+         fiy1             = _fjsp_setzero_v2r8();
+         fiz1             = _fjsp_setzero_v2r8();
+         fix2             = _fjsp_setzero_v2r8();
+         fiy2             = _fjsp_setzero_v2r8();
+         fiz2             = _fjsp_setzero_v2r8();
+         fix3             = _fjsp_setzero_v2r8();
+         fiy3             = _fjsp_setzero_v2r8();
+         fiz3             = _fjsp_setzero_v2r8();
+         /* Start inner kernel loop.
+          * Two j entries (A,B) are consumed per iteration; the loop stops before a
+          * possible unpaired last entry, which is handled after the loop.
+          */
+         for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+         {
+             /* Get j neighbor index, and coordinate index */
+             jnrA             = jjnr[jidx];
+             jnrB             = jjnr[jidx+1];
+             j_coord_offsetA  = DIM*jnrA;
+             j_coord_offsetB  = DIM*jnrB;
+             /* load j atom coordinates */
+             gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                               &jx0,&jy0,&jz0);
+             /* Calculate displacement vector (i site minus j particle, for each of the four i sites) */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             dx10             = _fjsp_sub_v2r8(ix1,jx0);
+             dy10             = _fjsp_sub_v2r8(iy1,jy0);
+             dz10             = _fjsp_sub_v2r8(iz1,jz0);
+             dx20             = _fjsp_sub_v2r8(ix2,jx0);
+             dy20             = _fjsp_sub_v2r8(iy2,jy0);
+             dz20             = _fjsp_sub_v2r8(iz2,jz0);
+             dx30             = _fjsp_sub_v2r8(ix3,jx0);
+             dy30             = _fjsp_sub_v2r8(iy3,jy0);
+             dz30             = _fjsp_sub_v2r8(iz3,jz0);
+             /* Calculate squared distance and things based on it.
+              * Pair 00 (LJ only) needs just 1/r^2; pairs 10,20,30 (electrostatics)
+              * get 1/r from the inverse square root and 1/r^2 as its square.
+              */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+             rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+             rsq30            = gmx_fjsp_calc_rsq_v2r8(dx30,dy30,dz30);
+             rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+             rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+             rinv30           = gmx_fjsp_invsqrt_v2r8(rsq30);
+             rinvsq00         = gmx_fjsp_inv_v2r8(rsq00);
+             rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+             rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+             rinvsq30         = _fjsp_mul_v2r8(rinv30,rinv30);
+             /* Load parameters for j particles */
+             jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+             vdwjidx0A        = 2*vdwtype[jnrA+0];
+             vdwjidx0B        = 2*vdwtype[jnrB+0];
+             fjx0             = _fjsp_setzero_v2r8();
+             fjy0             = _fjsp_setzero_v2r8();
+             fjz0             = _fjsp_setzero_v2r8();
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* Compute parameters for interactions between i and j atoms */
+             gmx_fjsp_load_2pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,
+                                          vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
+             /* LENNARD-JONES DISPERSION/REPULSION
+              * rinvsix is the cube of rinvsq00. Assuming _fjsp_msub_v2r8(a,b,c)
+              * evaluates a*b-c (TODO confirm against the intrinsic reference), the
+              * scalar force is (c12*rinvsix - c6)*rinvsix*rinvsq00.
+              */
+             rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+             fvdw             = _fjsp_mul_v2r8(_fjsp_msub_v2r8(c12_00,rinvsix,c6_00),_fjsp_mul_v2r8(rinvsix,rinvsq00));
+             fscal            = fvdw;
+             /* Update vectorial force.
+              * The same d*fscal contribution goes into the accumulator of this i site
+              * and into the shared j accumulator (fjx0,fjy0,fjz0), which collects the
+              * contributions of all four i sites before being applied to the j forces.
+              */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             
+             fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* Compute parameters for interactions between i and j atoms */
+             qq10             = _fjsp_mul_v2r8(iq1,jq0);
+             /* REACTION-FIELD ELECTROSTATICS
+              * Force only. Assuming _fjsp_msub_v2r8(a,b,c) evaluates a*b-c (TODO
+              * confirm), the scalar force is qq10*(rinv10*rinvsq10 - krf2).
+              */
+             felec            = _fjsp_mul_v2r8(qq10,_fjsp_msub_v2r8(rinv10,rinvsq10,krf2));
+             fscal            = felec;
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+             
+             fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* Compute parameters for interactions between i and j atoms */
+             qq20             = _fjsp_mul_v2r8(iq2,jq0);
+             /* REACTION-FIELD ELECTROSTATICS */
+             felec            = _fjsp_mul_v2r8(qq20,_fjsp_msub_v2r8(rinv20,rinvsq20,krf2));
+             fscal            = felec;
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+             
+             fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* Compute parameters for interactions between i and j atoms */
+             qq30             = _fjsp_mul_v2r8(iq3,jq0);
+             /* REACTION-FIELD ELECTROSTATICS */
+             felec            = _fjsp_mul_v2r8(qq30,_fjsp_msub_v2r8(rinv30,rinvsq30,krf2));
+             fscal            = felec;
+             /* Update vectorial force */
+             fix3             = _fjsp_madd_v2r8(dx30,fscal,fix3);
+             fiy3             = _fjsp_madd_v2r8(dy30,fscal,fiy3);
+             fiz3             = _fjsp_madd_v2r8(dz30,fscal,fiz3);
+             
+             fjx0             = _fjsp_madd_v2r8(dx30,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy30,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz30,fscal,fjz0);
+             gmx_fjsp_decrement_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0);
+             /* Inner loop uses 123 flops */
+         }
+         if(jidx<j_index_end) /* tail: one unpaired j entry is left; only the A slot is used */
+         {
+             jnrA             = jjnr[jidx];
+             j_coord_offsetA  = DIM*jnrA;
+             /* load j atom coordinates */
+             gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                               &jx0,&jy0,&jz0);
+             /* Calculate displacement vector */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             dx10             = _fjsp_sub_v2r8(ix1,jx0);
+             dy10             = _fjsp_sub_v2r8(iy1,jy0);
+             dz10             = _fjsp_sub_v2r8(iz1,jz0);
+             dx20             = _fjsp_sub_v2r8(ix2,jx0);
+             dy20             = _fjsp_sub_v2r8(iy2,jy0);
+             dz20             = _fjsp_sub_v2r8(iz2,jz0);
+             dx30             = _fjsp_sub_v2r8(ix3,jx0);
+             dy30             = _fjsp_sub_v2r8(iy3,jy0);
+             dz30             = _fjsp_sub_v2r8(iz3,jz0);
+             /* Calculate squared distance and things based on it */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+             rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+             rsq30            = gmx_fjsp_calc_rsq_v2r8(dx30,dy30,dz30);
+             rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+             rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+             rinv30           = gmx_fjsp_invsqrt_v2r8(rsq30);
+             rinvsq00         = gmx_fjsp_inv_v2r8(rsq00);
+             rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+             rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+             rinvsq30         = _fjsp_mul_v2r8(rinv30,rinv30);
+             /* Load parameters for j particles.
+              * Only the charge of jnrA is read; the other half of jq0 comes from the
+              * zero vector handed to _fjsp_loadl_v2r8 (presumably left at zero - confirm).
+              */
+             jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
+             vdwjidx0A        = 2*vdwtype[jnrA+0];
+             fjx0             = _fjsp_setzero_v2r8();
+             fjy0             = _fjsp_setzero_v2r8();
+             fjz0             = _fjsp_setzero_v2r8();
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* Compute parameters for interactions between i and j atoms */
+             gmx_fjsp_load_1pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
+             /* LENNARD-JONES DISPERSION/REPULSION */
+             rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+             fvdw             = _fjsp_mul_v2r8(_fjsp_msub_v2r8(c12_00,rinvsix,c6_00),_fjsp_mul_v2r8(rinvsix,rinvsq00));
+             fscal            = fvdw;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8()); /* presumably keeps the A element and zeroes the unused one so it adds no force (confirm intrinsic semantics) */
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             
+             fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* Compute parameters for interactions between i and j atoms */
+             qq10             = _fjsp_mul_v2r8(iq1,jq0);
+             /* REACTION-FIELD ELECTROSTATICS */
+             felec            = _fjsp_mul_v2r8(qq10,_fjsp_msub_v2r8(rinv10,rinvsq10,krf2));
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+             
+             fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* Compute parameters for interactions between i and j atoms */
+             qq20             = _fjsp_mul_v2r8(iq2,jq0);
+             /* REACTION-FIELD ELECTROSTATICS */
+             felec            = _fjsp_mul_v2r8(qq20,_fjsp_msub_v2r8(rinv20,rinvsq20,krf2));
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+             
+             fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* Compute parameters for interactions between i and j atoms */
+             qq30             = _fjsp_mul_v2r8(iq3,jq0);
+             /* REACTION-FIELD ELECTROSTATICS */
+             felec            = _fjsp_mul_v2r8(qq30,_fjsp_msub_v2r8(rinv30,rinvsq30,krf2));
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix3             = _fjsp_madd_v2r8(dx30,fscal,fix3);
+             fiy3             = _fjsp_madd_v2r8(dy30,fscal,fiy3);
+             fiz3             = _fjsp_madd_v2r8(dz30,fscal,fiz3);
+             
+             fjx0             = _fjsp_madd_v2r8(dx30,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy30,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz30,fscal,fjz0);
+             gmx_fjsp_decrement_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0);
+             /* Inner loop uses 123 flops */
+         }
+         /* End of innermost loop.
+          * Hand the twelve per-site accumulators to the i-force update helper, which
+          * receives both the i force location and the shift-force location.
+          */
+         gmx_fjsp_update_iforce_4atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                               f+i_coord_offset,fshift+i_shift_offset);
+         /* Increment number of inner iterations (counted per j entry, not per SIMD iteration) */
+         inneriter                  += j_index_end - j_index_start;
+         /* Outer loop uses 24 flops */
+     }
+     /* Increment number of outer iterations */
+     outeriter        += nri;
+     /* Update outer/inner flops */
+     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_F,outeriter*24 + inneriter*123);
+ }
index 0000000000000000000000000000000000000000,ae69a1d793e3c72c5a6ebbb31ed4a8eb84e45f8d..ae69a1d793e3c72c5a6ebbb31ed4a8eb84e45f8d
mode 000000,100644..100644
--- /dev/null
@@@ -1,0 -1,1645 +1,1645 @@@
+ /*
+  * This file is part of the GROMACS molecular simulation package.
+  *
+  * Copyright (c) 2012, by the GROMACS development team, led by
+  * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+  * others, as listed in the AUTHORS file in the top-level source
+  * directory and at http://www.gromacs.org.
+  *
+  * GROMACS is free software; you can redistribute it and/or
+  * modify it under the terms of the GNU Lesser General Public License
+  * as published by the Free Software Foundation; either version 2.1
+  * of the License, or (at your option) any later version.
+  *
+  * GROMACS is distributed in the hope that it will be useful,
+  * but WITHOUT ANY WARRANTY; without even the implied warranty of
+  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  * Lesser General Public License for more details.
+  *
+  * You should have received a copy of the GNU Lesser General Public
+  * License along with GROMACS; if not, see
+  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+  *
+  * If you want to redistribute modifications to GROMACS, please
+  * consider that scientific software is very special. Version
+  * control is crucial - bugs must be traceable. We will be happy to
+  * consider code for inclusion in the official distribution, but
+  * derived work must not be called official GROMACS. Details are found
+  * in the README & COPYING files - if they are missing, get the
+  * official version at http://www.gromacs.org.
+  *
+  * To help us fund GROMACS development, we humbly ask that you cite
+  * the research papers on the package. Check out http://www.gromacs.org.
+  */
+ /*
+  * Note: this file was generated by the GROMACS sparc64_hpc_ace_double kernel generator.
+  */
+ #ifdef HAVE_CONFIG_H
+ #include <config.h>
+ #endif
+ #include <math.h>
+ #include "../nb_kernel.h"
+ #include "types/simple.h"
+ #include "vec.h"
+ #include "nrnb.h"
+ #include "kernelutil_sparc64_hpc_ace_double.h"
+ /*
+  * Gromacs nonbonded kernel:   nb_kernel_ElecRF_VdwLJ_GeomW4W4_VF_sparc64_hpc_ace_double
+  * Electrostatics interaction: ReactionField
+  * VdW interaction:            LennardJones
+  * Geometry:                   Water4-Water4
+  * Calculate force/pot:        PotentialAndForce
+  */
+ void
+ nb_kernel_ElecRF_VdwLJ_GeomW4W4_VF_sparc64_hpc_ace_double
+                     (t_nblist * gmx_restrict                nlist,
+                      rvec * gmx_restrict                    xx,
+                      rvec * gmx_restrict                    ff,
+                      t_forcerec * gmx_restrict              fr,
+                      t_mdatoms * gmx_restrict               mdatoms,
+                      nb_kernel_data_t * gmx_restrict        kernel_data,
+                      t_nrnb * gmx_restrict                  nrnb)
+ {
+     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+      * just 0 for non-waters.
+      * Suffixes A,B refer to j loop unrolling done with double precision SIMD, i.e. the two different
+      * jnr indices (jnrA,jnrB) corresponding to data put in the two positions in the SIMD register.
+      */
+     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+     int              jnrA,jnrB;
+     int              j_coord_offsetA,j_coord_offsetB;
+     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+     real             rcutoff_scalar;
+     real             *shiftvec,*fshift,*x,*f;
+     _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+     int              vdwioffset0;
+     _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+     int              vdwioffset1;
+     _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+     int              vdwioffset2;
+     _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+     int              vdwioffset3;
+     _fjsp_v2r8       ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+     int              vdwjidx0A,vdwjidx0B;
+     _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+     int              vdwjidx1A,vdwjidx1B;
+     _fjsp_v2r8       jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+     int              vdwjidx2A,vdwjidx2B;
+     _fjsp_v2r8       jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+     int              vdwjidx3A,vdwjidx3B;
+     _fjsp_v2r8       jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
+     _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+     _fjsp_v2r8       dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+     _fjsp_v2r8       dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+     _fjsp_v2r8       dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
+     _fjsp_v2r8       dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+     _fjsp_v2r8       dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+     _fjsp_v2r8       dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
+     _fjsp_v2r8       dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
+     _fjsp_v2r8       dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
+     _fjsp_v2r8       dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
+     _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+     real             *charge;
+     int              nvdwtype;
+     _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+     int              *vdwtype;
+     real             *vdwparam;
+     _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+     _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+     _fjsp_v2r8       itab_tmp;
+     _fjsp_v2r8       dummy_mask,cutoff_mask;
+     _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+     _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+     union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+     x                = xx[0];
+     f                = ff[0];
+     nri              = nlist->nri;
+     iinr             = nlist->iinr;
+     jindex           = nlist->jindex;
+     jjnr             = nlist->jjnr;
+     shiftidx         = nlist->shift;
+     gid              = nlist->gid;
+     shiftvec         = fr->shift_vec[0];
+     fshift           = fr->fshift[0];
+     facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+     charge           = mdatoms->chargeA;
+     krf              = gmx_fjsp_set1_v2r8(fr->ic->k_rf);
+     krf2             = gmx_fjsp_set1_v2r8(fr->ic->k_rf*2.0);
+     crf              = gmx_fjsp_set1_v2r8(fr->ic->c_rf);
+     nvdwtype         = fr->ntype;
+     vdwparam         = fr->nbfp;
+     vdwtype          = mdatoms->typeA;
+     /* Setup water-specific parameters */
+     inr              = nlist->iinr[0];
+     iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+     iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+     iq3              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+3]));
+     vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+     jq1              = gmx_fjsp_set1_v2r8(charge[inr+1]);
+     jq2              = gmx_fjsp_set1_v2r8(charge[inr+2]);
+     jq3              = gmx_fjsp_set1_v2r8(charge[inr+3]);
+     vdwjidx0A        = 2*vdwtype[inr+0];
+     c6_00            = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset0+vdwjidx0A]);
+     c12_00           = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset0+vdwjidx0A+1]);
+     qq11             = _fjsp_mul_v2r8(iq1,jq1);
+     qq12             = _fjsp_mul_v2r8(iq1,jq2);
+     qq13             = _fjsp_mul_v2r8(iq1,jq3);
+     qq21             = _fjsp_mul_v2r8(iq2,jq1);
+     qq22             = _fjsp_mul_v2r8(iq2,jq2);
+     qq23             = _fjsp_mul_v2r8(iq2,jq3);
+     qq31             = _fjsp_mul_v2r8(iq3,jq1);
+     qq32             = _fjsp_mul_v2r8(iq3,jq2);
+     qq33             = _fjsp_mul_v2r8(iq3,jq3);
+     /* Avoid stupid compiler warnings */
+     jnrA = jnrB = 0;
+     j_coord_offsetA = 0;
+     j_coord_offsetB = 0;
+     outeriter        = 0;
+     inneriter        = 0;
+     /* Start outer loop over neighborlists */
+     for(iidx=0; iidx<nri; iidx++)
+     {
+         /* Load shift vector for this list */
+         i_shift_offset   = DIM*shiftidx[iidx];
+         /* Load limits for loop over neighbors */
+         j_index_start    = jindex[iidx];
+         j_index_end      = jindex[iidx+1];
+         /* Get outer coordinate index */
+         inr              = iinr[iidx];
+         i_coord_offset   = DIM*inr;
+         /* Load i particle coords and add shift vector */
+         gmx_fjsp_load_shift_and_4rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                  &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+         fix0             = _fjsp_setzero_v2r8();
+         fiy0             = _fjsp_setzero_v2r8();
+         fiz0             = _fjsp_setzero_v2r8();
+         fix1             = _fjsp_setzero_v2r8();
+         fiy1             = _fjsp_setzero_v2r8();
+         fiz1             = _fjsp_setzero_v2r8();
+         fix2             = _fjsp_setzero_v2r8();
+         fiy2             = _fjsp_setzero_v2r8();
+         fiz2             = _fjsp_setzero_v2r8();
+         fix3             = _fjsp_setzero_v2r8();
+         fiy3             = _fjsp_setzero_v2r8();
+         fiz3             = _fjsp_setzero_v2r8();
+         /* Reset potential sums */
+         velecsum         = _fjsp_setzero_v2r8();
+         vvdwsum          = _fjsp_setzero_v2r8();
+         /* Start inner kernel loop */
+         for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+         {
+             /* Get j neighbor index, and coordinate index */
+             jnrA             = jjnr[jidx];
+             jnrB             = jjnr[jidx+1];
+             j_coord_offsetA  = DIM*jnrA;
+             j_coord_offsetB  = DIM*jnrB;
+             /* load j atom coordinates */
+             gmx_fjsp_load_4rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
+                                               &jy2,&jz2,&jx3,&jy3,&jz3);
+             /* Calculate displacement vector */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             dx11             = _fjsp_sub_v2r8(ix1,jx1);
+             dy11             = _fjsp_sub_v2r8(iy1,jy1);
+             dz11             = _fjsp_sub_v2r8(iz1,jz1);
+             dx12             = _fjsp_sub_v2r8(ix1,jx2);
+             dy12             = _fjsp_sub_v2r8(iy1,jy2);
+             dz12             = _fjsp_sub_v2r8(iz1,jz2);
+             dx13             = _fjsp_sub_v2r8(ix1,jx3);
+             dy13             = _fjsp_sub_v2r8(iy1,jy3);
+             dz13             = _fjsp_sub_v2r8(iz1,jz3);
+             dx21             = _fjsp_sub_v2r8(ix2,jx1);
+             dy21             = _fjsp_sub_v2r8(iy2,jy1);
+             dz21             = _fjsp_sub_v2r8(iz2,jz1);
+             dx22             = _fjsp_sub_v2r8(ix2,jx2);
+             dy22             = _fjsp_sub_v2r8(iy2,jy2);
+             dz22             = _fjsp_sub_v2r8(iz2,jz2);
+             dx23             = _fjsp_sub_v2r8(ix2,jx3);
+             dy23             = _fjsp_sub_v2r8(iy2,jy3);
+             dz23             = _fjsp_sub_v2r8(iz2,jz3);
+             dx31             = _fjsp_sub_v2r8(ix3,jx1);
+             dy31             = _fjsp_sub_v2r8(iy3,jy1);
+             dz31             = _fjsp_sub_v2r8(iz3,jz1);
+             dx32             = _fjsp_sub_v2r8(ix3,jx2);
+             dy32             = _fjsp_sub_v2r8(iy3,jy2);
+             dz32             = _fjsp_sub_v2r8(iz3,jz2);
+             dx33             = _fjsp_sub_v2r8(ix3,jx3);
+             dy33             = _fjsp_sub_v2r8(iy3,jy3);
+             dz33             = _fjsp_sub_v2r8(iz3,jz3);
+             /* Calculate squared distance and things based on it */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+             rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+             rsq13            = gmx_fjsp_calc_rsq_v2r8(dx13,dy13,dz13);
+             rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+             rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+             rsq23            = gmx_fjsp_calc_rsq_v2r8(dx23,dy23,dz23);
+             rsq31            = gmx_fjsp_calc_rsq_v2r8(dx31,dy31,dz31);
+             rsq32            = gmx_fjsp_calc_rsq_v2r8(dx32,dy32,dz32);
+             rsq33            = gmx_fjsp_calc_rsq_v2r8(dx33,dy33,dz33);
+             rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+             rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+             rinv13           = gmx_fjsp_invsqrt_v2r8(rsq13);
+             rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+             rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+             rinv23           = gmx_fjsp_invsqrt_v2r8(rsq23);
+             rinv31           = gmx_fjsp_invsqrt_v2r8(rsq31);
+             rinv32           = gmx_fjsp_invsqrt_v2r8(rsq32);
+             rinv33           = gmx_fjsp_invsqrt_v2r8(rsq33);
+             rinvsq00         = gmx_fjsp_inv_v2r8(rsq00);
+             rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+             rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+             rinvsq13         = _fjsp_mul_v2r8(rinv13,rinv13);
+             rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+             rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+             rinvsq23         = _fjsp_mul_v2r8(rinv23,rinv23);
+             rinvsq31         = _fjsp_mul_v2r8(rinv31,rinv31);
+             rinvsq32         = _fjsp_mul_v2r8(rinv32,rinv32);
+             rinvsq33         = _fjsp_mul_v2r8(rinv33,rinv33);
+             fjx0             = _fjsp_setzero_v2r8();
+             fjy0             = _fjsp_setzero_v2r8();
+             fjz0             = _fjsp_setzero_v2r8();
+             fjx1             = _fjsp_setzero_v2r8();
+             fjy1             = _fjsp_setzero_v2r8();
+             fjz1             = _fjsp_setzero_v2r8();
+             fjx2             = _fjsp_setzero_v2r8();
+             fjy2             = _fjsp_setzero_v2r8();
+             fjz2             = _fjsp_setzero_v2r8();
+             fjx3             = _fjsp_setzero_v2r8();
+             fjy3             = _fjsp_setzero_v2r8();
+             fjz3             = _fjsp_setzero_v2r8();
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* LENNARD-JONES DISPERSION/REPULSION */
+             rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+             vvdw6            = _fjsp_mul_v2r8(c6_00,rinvsix);
+             vvdw12           = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
+             vvdw             = _fjsp_msub_v2r8( vvdw12,one_twelfth, _fjsp_mul_v2r8(vvdw6,one_sixth) );
+             fvdw             = _fjsp_mul_v2r8(_fjsp_sub_v2r8(vvdw12,vvdw6),rinvsq00);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+             fscal            = fvdw;
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             
+             fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* REACTION-FIELD ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq11,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq11,rinv11),crf));
+             felec            = _fjsp_mul_v2r8(qq11,_fjsp_msub_v2r8(rinv11,rinvsq11,krf2));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+             
+             fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* REACTION-FIELD ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq12,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq12,rinv12),crf));
+             felec            = _fjsp_mul_v2r8(qq12,_fjsp_msub_v2r8(rinv12,rinvsq12,krf2));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+             
+             fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* REACTION-FIELD ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq13,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq13,rinv13),crf));
+             felec            = _fjsp_mul_v2r8(qq13,_fjsp_msub_v2r8(rinv13,rinvsq13,krf2));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx13,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy13,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz13,fscal,fiz1);
+             
+             fjx3             = _fjsp_madd_v2r8(dx13,fscal,fjx3);
+             fjy3             = _fjsp_madd_v2r8(dy13,fscal,fjy3);
+             fjz3             = _fjsp_madd_v2r8(dz13,fscal,fjz3);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* REACTION-FIELD ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq21,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq21,rinv21),crf));
+             felec            = _fjsp_mul_v2r8(qq21,_fjsp_msub_v2r8(rinv21,rinvsq21,krf2));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+             
+             fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* REACTION-FIELD ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq22,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq22,rinv22),crf));
+             felec            = _fjsp_mul_v2r8(qq22,_fjsp_msub_v2r8(rinv22,rinvsq22,krf2));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+             
+             fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* REACTION-FIELD ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq23,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq23,rinv23),crf));
+             felec            = _fjsp_mul_v2r8(qq23,_fjsp_msub_v2r8(rinv23,rinvsq23,krf2));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx23,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy23,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz23,fscal,fiz2);
+             
+             fjx3             = _fjsp_madd_v2r8(dx23,fscal,fjx3);
+             fjy3             = _fjsp_madd_v2r8(dy23,fscal,fjy3);
+             fjz3             = _fjsp_madd_v2r8(dz23,fscal,fjz3);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* REACTION-FIELD ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq31,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq31,rinv31),crf));
+             felec            = _fjsp_mul_v2r8(qq31,_fjsp_msub_v2r8(rinv31,rinvsq31,krf2));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             /* Update vectorial force */
+             fix3             = _fjsp_madd_v2r8(dx31,fscal,fix3);
+             fiy3             = _fjsp_madd_v2r8(dy31,fscal,fiy3);
+             fiz3             = _fjsp_madd_v2r8(dz31,fscal,fiz3);
+             
+             fjx1             = _fjsp_madd_v2r8(dx31,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy31,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz31,fscal,fjz1);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* REACTION-FIELD ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq32,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq32,rinv32),crf));
+             felec            = _fjsp_mul_v2r8(qq32,_fjsp_msub_v2r8(rinv32,rinvsq32,krf2));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             /* Update vectorial force */
+             fix3             = _fjsp_madd_v2r8(dx32,fscal,fix3);
+             fiy3             = _fjsp_madd_v2r8(dy32,fscal,fiy3);
+             fiz3             = _fjsp_madd_v2r8(dz32,fscal,fiz3);
+             
+             fjx2             = _fjsp_madd_v2r8(dx32,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy32,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz32,fscal,fjz2);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* REACTION-FIELD ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq33,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq33,rinv33),crf));
+             felec            = _fjsp_mul_v2r8(qq33,_fjsp_msub_v2r8(rinv33,rinvsq33,krf2));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             /* Update vectorial force */
+             fix3             = _fjsp_madd_v2r8(dx33,fscal,fix3);
+             fiy3             = _fjsp_madd_v2r8(dy33,fscal,fiy3);
+             fiz3             = _fjsp_madd_v2r8(dz33,fscal,fiz3);
+             
+             fjx3             = _fjsp_madd_v2r8(dx33,fscal,fjx3);
+             fjy3             = _fjsp_madd_v2r8(dy33,fscal,fjy3);
+             fjz3             = _fjsp_madd_v2r8(dz33,fscal,fjz3);
+             gmx_fjsp_decrement_4rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+             /* Inner loop uses 353 flops */
+         }
+         if(jidx<j_index_end)
+         {
+             jnrA             = jjnr[jidx];
+             j_coord_offsetA  = DIM*jnrA;
+             /* load j atom coordinates */
+             gmx_fjsp_load_4rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
+                                               &jy2,&jz2,&jx3,&jy3,&jz3);
+             /* Calculate displacement vector */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             dx11             = _fjsp_sub_v2r8(ix1,jx1);
+             dy11             = _fjsp_sub_v2r8(iy1,jy1);
+             dz11             = _fjsp_sub_v2r8(iz1,jz1);
+             dx12             = _fjsp_sub_v2r8(ix1,jx2);
+             dy12             = _fjsp_sub_v2r8(iy1,jy2);
+             dz12             = _fjsp_sub_v2r8(iz1,jz2);
+             dx13             = _fjsp_sub_v2r8(ix1,jx3);
+             dy13             = _fjsp_sub_v2r8(iy1,jy3);
+             dz13             = _fjsp_sub_v2r8(iz1,jz3);
+             dx21             = _fjsp_sub_v2r8(ix2,jx1);
+             dy21             = _fjsp_sub_v2r8(iy2,jy1);
+             dz21             = _fjsp_sub_v2r8(iz2,jz1);
+             dx22             = _fjsp_sub_v2r8(ix2,jx2);
+             dy22             = _fjsp_sub_v2r8(iy2,jy2);
+             dz22             = _fjsp_sub_v2r8(iz2,jz2);
+             dx23             = _fjsp_sub_v2r8(ix2,jx3);
+             dy23             = _fjsp_sub_v2r8(iy2,jy3);
+             dz23             = _fjsp_sub_v2r8(iz2,jz3);
+             dx31             = _fjsp_sub_v2r8(ix3,jx1);
+             dy31             = _fjsp_sub_v2r8(iy3,jy1);
+             dz31             = _fjsp_sub_v2r8(iz3,jz1);
+             dx32             = _fjsp_sub_v2r8(ix3,jx2);
+             dy32             = _fjsp_sub_v2r8(iy3,jy2);
+             dz32             = _fjsp_sub_v2r8(iz3,jz2);
+             dx33             = _fjsp_sub_v2r8(ix3,jx3);
+             dy33             = _fjsp_sub_v2r8(iy3,jy3);
+             dz33             = _fjsp_sub_v2r8(iz3,jz3);
+             /* Calculate squared distance and things based on it */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+             rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+             rsq13            = gmx_fjsp_calc_rsq_v2r8(dx13,dy13,dz13);
+             rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+             rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+             rsq23            = gmx_fjsp_calc_rsq_v2r8(dx23,dy23,dz23);
+             rsq31            = gmx_fjsp_calc_rsq_v2r8(dx31,dy31,dz31);
+             rsq32            = gmx_fjsp_calc_rsq_v2r8(dx32,dy32,dz32);
+             rsq33            = gmx_fjsp_calc_rsq_v2r8(dx33,dy33,dz33);
+             rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+             rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+             rinv13           = gmx_fjsp_invsqrt_v2r8(rsq13);
+             rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+             rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+             rinv23           = gmx_fjsp_invsqrt_v2r8(rsq23);
+             rinv31           = gmx_fjsp_invsqrt_v2r8(rsq31);
+             rinv32           = gmx_fjsp_invsqrt_v2r8(rsq32);
+             rinv33           = gmx_fjsp_invsqrt_v2r8(rsq33);
+             rinvsq00         = gmx_fjsp_inv_v2r8(rsq00);
+             rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+             rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+             rinvsq13         = _fjsp_mul_v2r8(rinv13,rinv13);
+             rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+             rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+             rinvsq23         = _fjsp_mul_v2r8(rinv23,rinv23);
+             rinvsq31         = _fjsp_mul_v2r8(rinv31,rinv31);
+             rinvsq32         = _fjsp_mul_v2r8(rinv32,rinv32);
+             rinvsq33         = _fjsp_mul_v2r8(rinv33,rinv33);
+             fjx0             = _fjsp_setzero_v2r8();
+             fjy0             = _fjsp_setzero_v2r8();
+             fjz0             = _fjsp_setzero_v2r8();
+             fjx1             = _fjsp_setzero_v2r8();
+             fjy1             = _fjsp_setzero_v2r8();
+             fjz1             = _fjsp_setzero_v2r8();
+             fjx2             = _fjsp_setzero_v2r8();
+             fjy2             = _fjsp_setzero_v2r8();
+             fjz2             = _fjsp_setzero_v2r8();
+             fjx3             = _fjsp_setzero_v2r8();
+             fjy3             = _fjsp_setzero_v2r8();
+             fjz3             = _fjsp_setzero_v2r8();
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* LENNARD-JONES DISPERSION/REPULSION */
+             rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+             vvdw6            = _fjsp_mul_v2r8(c6_00,rinvsix);
+             vvdw12           = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
+             vvdw             = _fjsp_msub_v2r8( vvdw12,one_twelfth, _fjsp_mul_v2r8(vvdw6,one_sixth) );
+             fvdw             = _fjsp_mul_v2r8(_fjsp_sub_v2r8(vvdw12,vvdw6),rinvsq00);
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             vvdw             = _fjsp_unpacklo_v2r8(vvdw,_fjsp_setzero_v2r8());
+             vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+             fscal            = fvdw;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             
+             fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* REACTION-FIELD ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq11,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq11,rinv11),crf));
+             felec            = _fjsp_mul_v2r8(qq11,_fjsp_msub_v2r8(rinv11,rinvsq11,krf2));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+             
+             fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* REACTION-FIELD ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq12,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq12,rinv12),crf));
+             felec            = _fjsp_mul_v2r8(qq12,_fjsp_msub_v2r8(rinv12,rinvsq12,krf2));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+             
+             fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* REACTION-FIELD ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq13,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq13,rinv13),crf));
+             felec            = _fjsp_mul_v2r8(qq13,_fjsp_msub_v2r8(rinv13,rinvsq13,krf2));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx13,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy13,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz13,fscal,fiz1);
+             
+             fjx3             = _fjsp_madd_v2r8(dx13,fscal,fjx3);
+             fjy3             = _fjsp_madd_v2r8(dy13,fscal,fjy3);
+             fjz3             = _fjsp_madd_v2r8(dz13,fscal,fjz3);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* REACTION-FIELD ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq21,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq21,rinv21),crf));
+             felec            = _fjsp_mul_v2r8(qq21,_fjsp_msub_v2r8(rinv21,rinvsq21,krf2));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+             
+             fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* REACTION-FIELD ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq22,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq22,rinv22),crf));
+             felec            = _fjsp_mul_v2r8(qq22,_fjsp_msub_v2r8(rinv22,rinvsq22,krf2));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+             
+             fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* REACTION-FIELD ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq23,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq23,rinv23),crf));
+             felec            = _fjsp_mul_v2r8(qq23,_fjsp_msub_v2r8(rinv23,rinvsq23,krf2));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx23,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy23,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz23,fscal,fiz2);
+             
+             fjx3             = _fjsp_madd_v2r8(dx23,fscal,fjx3);
+             fjy3             = _fjsp_madd_v2r8(dy23,fscal,fjy3);
+             fjz3             = _fjsp_madd_v2r8(dz23,fscal,fjz3);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* REACTION-FIELD ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq31,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq31,rinv31),crf));
+             felec            = _fjsp_mul_v2r8(qq31,_fjsp_msub_v2r8(rinv31,rinvsq31,krf2));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix3             = _fjsp_madd_v2r8(dx31,fscal,fix3);
+             fiy3             = _fjsp_madd_v2r8(dy31,fscal,fiy3);
+             fiz3             = _fjsp_madd_v2r8(dz31,fscal,fiz3);
+             
+             fjx1             = _fjsp_madd_v2r8(dx31,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy31,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz31,fscal,fjz1);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* REACTION-FIELD ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq32,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq32,rinv32),crf));
+             felec            = _fjsp_mul_v2r8(qq32,_fjsp_msub_v2r8(rinv32,rinvsq32,krf2));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix3             = _fjsp_madd_v2r8(dx32,fscal,fix3);
+             fiy3             = _fjsp_madd_v2r8(dy32,fscal,fiy3);
+             fiz3             = _fjsp_madd_v2r8(dz32,fscal,fiz3);
+             
+             fjx2             = _fjsp_madd_v2r8(dx32,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy32,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz32,fscal,fjz2);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* REACTION-FIELD ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq33,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq33,rinv33),crf));
+             felec            = _fjsp_mul_v2r8(qq33,_fjsp_msub_v2r8(rinv33,rinvsq33,krf2));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix3             = _fjsp_madd_v2r8(dx33,fscal,fix3);
+             fiy3             = _fjsp_madd_v2r8(dy33,fscal,fiy3);
+             fiz3             = _fjsp_madd_v2r8(dz33,fscal,fiz3);
+             
+             fjx3             = _fjsp_madd_v2r8(dx33,fscal,fjx3);
+             fjy3             = _fjsp_madd_v2r8(dy33,fscal,fjy3);
+             fjz3             = _fjsp_madd_v2r8(dz33,fscal,fjz3);
+             gmx_fjsp_decrement_4rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+             /* Inner loop uses 353 flops */
+         }
+         /* End of innermost loop */
+         gmx_fjsp_update_iforce_4atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                               f+i_coord_offset,fshift+i_shift_offset);
+         ggid                        = gid[iidx];
+         /* Update potential energies */
+         gmx_fjsp_update_1pot_v2r8(velecsum,kernel_data->energygrp_elec+ggid);
+         gmx_fjsp_update_1pot_v2r8(vvdwsum,kernel_data->energygrp_vdw+ggid);
+         /* Increment number of inner iterations */
+         inneriter                  += j_index_end - j_index_start;
+         /* Outer loop uses 26 flops */
+     }
+     /* Increment number of outer iterations */
+     outeriter        += nri;
+     /* Update outer/inner flops */
+     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_VF,outeriter*26 + inneriter*353);
+ }
+ /*
+  * Gromacs nonbonded kernel:   nb_kernel_ElecRF_VdwLJ_GeomW4W4_F_sparc64_hpc_ace_double
+  * Electrostatics interaction: ReactionField
+  * VdW interaction:            LennardJones
+  * Geometry:                   Water4-Water4
+  * Calculate force/pot:        Force
+  */
+ void
+ nb_kernel_ElecRF_VdwLJ_GeomW4W4_F_sparc64_hpc_ace_double
+                     (t_nblist * gmx_restrict                nlist,
+                      rvec * gmx_restrict                    xx,
+                      rvec * gmx_restrict                    ff,
+                      t_forcerec * gmx_restrict              fr,
+                      t_mdatoms * gmx_restrict               mdatoms,
+                      nb_kernel_data_t * gmx_restrict        kernel_data,
+                      t_nrnb * gmx_restrict                  nrnb)
+ {
+     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+      * just 0 for non-waters.
+      * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+      * jnr indices corresponding to data put in the two positions in the SIMD register.
+      */
+     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+     int              jnrA,jnrB;
+     int              j_coord_offsetA,j_coord_offsetB;
+     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+     real             rcutoff_scalar;
+     real             *shiftvec,*fshift,*x,*f;
+     _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+     int              vdwioffset0;
+     _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+     int              vdwioffset1;
+     _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+     int              vdwioffset2;
+     _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+     int              vdwioffset3;
+     _fjsp_v2r8       ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+     int              vdwjidx0A,vdwjidx0B;
+     _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+     int              vdwjidx1A,vdwjidx1B;
+     _fjsp_v2r8       jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+     int              vdwjidx2A,vdwjidx2B;
+     _fjsp_v2r8       jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+     int              vdwjidx3A,vdwjidx3B;
+     _fjsp_v2r8       jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
+     _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+     _fjsp_v2r8       dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+     _fjsp_v2r8       dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+     _fjsp_v2r8       dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
+     _fjsp_v2r8       dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+     _fjsp_v2r8       dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+     _fjsp_v2r8       dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
+     _fjsp_v2r8       dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
+     _fjsp_v2r8       dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
+     _fjsp_v2r8       dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
+     _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+     real             *charge;
+     int              nvdwtype;
+     _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+     int              *vdwtype;
+     real             *vdwparam;
+     _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+     _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+     _fjsp_v2r8       itab_tmp;
+     _fjsp_v2r8       dummy_mask,cutoff_mask;
+     _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+     _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+     union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+     x                = xx[0];
+     f                = ff[0];
+     nri              = nlist->nri;
+     iinr             = nlist->iinr;
+     jindex           = nlist->jindex;
+     jjnr             = nlist->jjnr;
+     shiftidx         = nlist->shift;
+     gid              = nlist->gid;
+     shiftvec         = fr->shift_vec[0];
+     fshift           = fr->fshift[0];
+     facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+     charge           = mdatoms->chargeA;
+     krf              = gmx_fjsp_set1_v2r8(fr->ic->k_rf);
+     krf2             = gmx_fjsp_set1_v2r8(fr->ic->k_rf*2.0);
+     crf              = gmx_fjsp_set1_v2r8(fr->ic->c_rf);
+     nvdwtype         = fr->ntype;
+     vdwparam         = fr->nbfp;
+     vdwtype          = mdatoms->typeA;
+     /* Setup water-specific parameters */
+     inr              = nlist->iinr[0];
+     iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+     iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+     iq3              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+3]));
+     vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+     jq1              = gmx_fjsp_set1_v2r8(charge[inr+1]);
+     jq2              = gmx_fjsp_set1_v2r8(charge[inr+2]);
+     jq3              = gmx_fjsp_set1_v2r8(charge[inr+3]);
+     vdwjidx0A        = 2*vdwtype[inr+0];
+     c6_00            = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset0+vdwjidx0A]);
+     c12_00           = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset0+vdwjidx0A+1]);
+     qq11             = _fjsp_mul_v2r8(iq1,jq1);
+     qq12             = _fjsp_mul_v2r8(iq1,jq2);
+     qq13             = _fjsp_mul_v2r8(iq1,jq3);
+     qq21             = _fjsp_mul_v2r8(iq2,jq1);
+     qq22             = _fjsp_mul_v2r8(iq2,jq2);
+     qq23             = _fjsp_mul_v2r8(iq2,jq3);
+     qq31             = _fjsp_mul_v2r8(iq3,jq1);
+     qq32             = _fjsp_mul_v2r8(iq3,jq2);
+     qq33             = _fjsp_mul_v2r8(iq3,jq3);
+     /* Avoid stupid compiler warnings */
+     jnrA = jnrB = 0;
+     j_coord_offsetA = 0;
+     j_coord_offsetB = 0;
+     outeriter        = 0;
+     inneriter        = 0;
+     /* Start outer loop over neighborlists */
+     for(iidx=0; iidx<nri; iidx++)
+     {
+         /* Load shift vector for this list */
+         i_shift_offset   = DIM*shiftidx[iidx];
+         /* Load limits for loop over neighbors */
+         j_index_start    = jindex[iidx];
+         j_index_end      = jindex[iidx+1];
+         /* Get outer coordinate index */
+         inr              = iinr[iidx];
+         i_coord_offset   = DIM*inr;
+         /* Load i particle coords and add shift vector */
+         gmx_fjsp_load_shift_and_4rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                  &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+         fix0             = _fjsp_setzero_v2r8();
+         fiy0             = _fjsp_setzero_v2r8();
+         fiz0             = _fjsp_setzero_v2r8();
+         fix1             = _fjsp_setzero_v2r8();
+         fiy1             = _fjsp_setzero_v2r8();
+         fiz1             = _fjsp_setzero_v2r8();
+         fix2             = _fjsp_setzero_v2r8();
+         fiy2             = _fjsp_setzero_v2r8();
+         fiz2             = _fjsp_setzero_v2r8();
+         fix3             = _fjsp_setzero_v2r8();
+         fiy3             = _fjsp_setzero_v2r8();
+         fiz3             = _fjsp_setzero_v2r8();
+         /* Start inner kernel loop */
+         for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+         {
+             /* Get j neighbor index, and coordinate index */
+             jnrA             = jjnr[jidx];
+             jnrB             = jjnr[jidx+1];
+             j_coord_offsetA  = DIM*jnrA;
+             j_coord_offsetB  = DIM*jnrB;
+             /* load j atom coordinates */
+             gmx_fjsp_load_4rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
+                                               &jy2,&jz2,&jx3,&jy3,&jz3);
+             /* Calculate displacement vector */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             dx11             = _fjsp_sub_v2r8(ix1,jx1);
+             dy11             = _fjsp_sub_v2r8(iy1,jy1);
+             dz11             = _fjsp_sub_v2r8(iz1,jz1);
+             dx12             = _fjsp_sub_v2r8(ix1,jx2);
+             dy12             = _fjsp_sub_v2r8(iy1,jy2);
+             dz12             = _fjsp_sub_v2r8(iz1,jz2);
+             dx13             = _fjsp_sub_v2r8(ix1,jx3);
+             dy13             = _fjsp_sub_v2r8(iy1,jy3);
+             dz13             = _fjsp_sub_v2r8(iz1,jz3);
+             dx21             = _fjsp_sub_v2r8(ix2,jx1);
+             dy21             = _fjsp_sub_v2r8(iy2,jy1);
+             dz21             = _fjsp_sub_v2r8(iz2,jz1);
+             dx22             = _fjsp_sub_v2r8(ix2,jx2);
+             dy22             = _fjsp_sub_v2r8(iy2,jy2);
+             dz22             = _fjsp_sub_v2r8(iz2,jz2);
+             dx23             = _fjsp_sub_v2r8(ix2,jx3);
+             dy23             = _fjsp_sub_v2r8(iy2,jy3);
+             dz23             = _fjsp_sub_v2r8(iz2,jz3);
+             dx31             = _fjsp_sub_v2r8(ix3,jx1);
+             dy31             = _fjsp_sub_v2r8(iy3,jy1);
+             dz31             = _fjsp_sub_v2r8(iz3,jz1);
+             dx32             = _fjsp_sub_v2r8(ix3,jx2);
+             dy32             = _fjsp_sub_v2r8(iy3,jy2);
+             dz32             = _fjsp_sub_v2r8(iz3,jz2);
+             dx33             = _fjsp_sub_v2r8(ix3,jx3);
+             dy33             = _fjsp_sub_v2r8(iy3,jy3);
+             dz33             = _fjsp_sub_v2r8(iz3,jz3);
+             /* Calculate squared distance and things based on it */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+             rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+             rsq13            = gmx_fjsp_calc_rsq_v2r8(dx13,dy13,dz13);
+             rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+             rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+             rsq23            = gmx_fjsp_calc_rsq_v2r8(dx23,dy23,dz23);
+             rsq31            = gmx_fjsp_calc_rsq_v2r8(dx31,dy31,dz31);
+             rsq32            = gmx_fjsp_calc_rsq_v2r8(dx32,dy32,dz32);
+             rsq33            = gmx_fjsp_calc_rsq_v2r8(dx33,dy33,dz33);
+             rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+             rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+             rinv13           = gmx_fjsp_invsqrt_v2r8(rsq13);
+             rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+             rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+             rinv23           = gmx_fjsp_invsqrt_v2r8(rsq23);
+             rinv31           = gmx_fjsp_invsqrt_v2r8(rsq31);
+             rinv32           = gmx_fjsp_invsqrt_v2r8(rsq32);
+             rinv33           = gmx_fjsp_invsqrt_v2r8(rsq33);
+             rinvsq00         = gmx_fjsp_inv_v2r8(rsq00);
+             rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+             rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+             rinvsq13         = _fjsp_mul_v2r8(rinv13,rinv13);
+             rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+             rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+             rinvsq23         = _fjsp_mul_v2r8(rinv23,rinv23);
+             rinvsq31         = _fjsp_mul_v2r8(rinv31,rinv31);
+             rinvsq32         = _fjsp_mul_v2r8(rinv32,rinv32);
+             rinvsq33         = _fjsp_mul_v2r8(rinv33,rinv33);
+             fjx0             = _fjsp_setzero_v2r8();
+             fjy0             = _fjsp_setzero_v2r8();
+             fjz0             = _fjsp_setzero_v2r8();
+             fjx1             = _fjsp_setzero_v2r8();
+             fjy1             = _fjsp_setzero_v2r8();
+             fjz1             = _fjsp_setzero_v2r8();
+             fjx2             = _fjsp_setzero_v2r8();
+             fjy2             = _fjsp_setzero_v2r8();
+             fjz2             = _fjsp_setzero_v2r8();
+             fjx3             = _fjsp_setzero_v2r8();
+             fjy3             = _fjsp_setzero_v2r8();
+             fjz3             = _fjsp_setzero_v2r8();
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* LENNARD-JONES DISPERSION/REPULSION */
+             rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+             fvdw             = _fjsp_mul_v2r8(_fjsp_msub_v2r8(c12_00,rinvsix,c6_00),_fjsp_mul_v2r8(rinvsix,rinvsq00));
+             fscal            = fvdw;
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             
+             fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* REACTION-FIELD ELECTROSTATICS */
+             felec            = _fjsp_mul_v2r8(qq11,_fjsp_msub_v2r8(rinv11,rinvsq11,krf2));
+             fscal            = felec;
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+             
+             fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* REACTION-FIELD ELECTROSTATICS */
+             felec            = _fjsp_mul_v2r8(qq12,_fjsp_msub_v2r8(rinv12,rinvsq12,krf2));
+             fscal            = felec;
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+             
+             fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* REACTION-FIELD ELECTROSTATICS */
+             felec            = _fjsp_mul_v2r8(qq13,_fjsp_msub_v2r8(rinv13,rinvsq13,krf2));
+             fscal            = felec;
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx13,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy13,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz13,fscal,fiz1);
+             
+             fjx3             = _fjsp_madd_v2r8(dx13,fscal,fjx3);
+             fjy3             = _fjsp_madd_v2r8(dy13,fscal,fjy3);
+             fjz3             = _fjsp_madd_v2r8(dz13,fscal,fjz3);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* REACTION-FIELD ELECTROSTATICS */
+             felec            = _fjsp_mul_v2r8(qq21,_fjsp_msub_v2r8(rinv21,rinvsq21,krf2));
+             fscal            = felec;
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+             
+             fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* REACTION-FIELD ELECTROSTATICS */
+             felec            = _fjsp_mul_v2r8(qq22,_fjsp_msub_v2r8(rinv22,rinvsq22,krf2));
+             fscal            = felec;
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+             
+             fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* REACTION-FIELD ELECTROSTATICS */
+             felec            = _fjsp_mul_v2r8(qq23,_fjsp_msub_v2r8(rinv23,rinvsq23,krf2));
+             fscal            = felec;
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx23,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy23,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz23,fscal,fiz2);
+             
+             fjx3             = _fjsp_madd_v2r8(dx23,fscal,fjx3);
+             fjy3             = _fjsp_madd_v2r8(dy23,fscal,fjy3);
+             fjz3             = _fjsp_madd_v2r8(dz23,fscal,fjz3);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* REACTION-FIELD ELECTROSTATICS */
+             felec            = _fjsp_mul_v2r8(qq31,_fjsp_msub_v2r8(rinv31,rinvsq31,krf2));
+             fscal            = felec;
+             /* Update vectorial force */
+             fix3             = _fjsp_madd_v2r8(dx31,fscal,fix3);
+             fiy3             = _fjsp_madd_v2r8(dy31,fscal,fiy3);
+             fiz3             = _fjsp_madd_v2r8(dz31,fscal,fiz3);
+             
+             fjx1             = _fjsp_madd_v2r8(dx31,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy31,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz31,fscal,fjz1);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* REACTION-FIELD ELECTROSTATICS */
+             felec            = _fjsp_mul_v2r8(qq32,_fjsp_msub_v2r8(rinv32,rinvsq32,krf2));
+             fscal            = felec;
+             /* Update vectorial force */
+             fix3             = _fjsp_madd_v2r8(dx32,fscal,fix3);
+             fiy3             = _fjsp_madd_v2r8(dy32,fscal,fiy3);
+             fiz3             = _fjsp_madd_v2r8(dz32,fscal,fiz3);
+             
+             fjx2             = _fjsp_madd_v2r8(dx32,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy32,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz32,fscal,fjz2);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* REACTION-FIELD ELECTROSTATICS */
+             felec            = _fjsp_mul_v2r8(qq33,_fjsp_msub_v2r8(rinv33,rinvsq33,krf2));
+             fscal            = felec;
+             /* Update vectorial force */
+             fix3             = _fjsp_madd_v2r8(dx33,fscal,fix3);
+             fiy3             = _fjsp_madd_v2r8(dy33,fscal,fiy3);
+             fiz3             = _fjsp_madd_v2r8(dz33,fscal,fiz3);
+             
+             fjx3             = _fjsp_madd_v2r8(dx33,fscal,fjx3);
+             fjy3             = _fjsp_madd_v2r8(dy33,fscal,fjy3);
+             fjz3             = _fjsp_madd_v2r8(dz33,fscal,fjz3);
+             gmx_fjsp_decrement_4rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+             /* Inner loop uses 303 flops */
+         }
+         if(jidx<j_index_end)
+         {
+             jnrA             = jjnr[jidx];
+             j_coord_offsetA  = DIM*jnrA;
+             /* load j atom coordinates */
+             gmx_fjsp_load_4rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
+                                               &jy2,&jz2,&jx3,&jy3,&jz3);
+             /* Calculate displacement vector */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             dx11             = _fjsp_sub_v2r8(ix1,jx1);
+             dy11             = _fjsp_sub_v2r8(iy1,jy1);
+             dz11             = _fjsp_sub_v2r8(iz1,jz1);
+             dx12             = _fjsp_sub_v2r8(ix1,jx2);
+             dy12             = _fjsp_sub_v2r8(iy1,jy2);
+             dz12             = _fjsp_sub_v2r8(iz1,jz2);
+             dx13             = _fjsp_sub_v2r8(ix1,jx3);
+             dy13             = _fjsp_sub_v2r8(iy1,jy3);
+             dz13             = _fjsp_sub_v2r8(iz1,jz3);
+             dx21             = _fjsp_sub_v2r8(ix2,jx1);
+             dy21             = _fjsp_sub_v2r8(iy2,jy1);
+             dz21             = _fjsp_sub_v2r8(iz2,jz1);
+             dx22             = _fjsp_sub_v2r8(ix2,jx2);
+             dy22             = _fjsp_sub_v2r8(iy2,jy2);
+             dz22             = _fjsp_sub_v2r8(iz2,jz2);
+             dx23             = _fjsp_sub_v2r8(ix2,jx3);
+             dy23             = _fjsp_sub_v2r8(iy2,jy3);
+             dz23             = _fjsp_sub_v2r8(iz2,jz3);
+             dx31             = _fjsp_sub_v2r8(ix3,jx1);
+             dy31             = _fjsp_sub_v2r8(iy3,jy1);
+             dz31             = _fjsp_sub_v2r8(iz3,jz1);
+             dx32             = _fjsp_sub_v2r8(ix3,jx2);
+             dy32             = _fjsp_sub_v2r8(iy3,jy2);
+             dz32             = _fjsp_sub_v2r8(iz3,jz2);
+             dx33             = _fjsp_sub_v2r8(ix3,jx3);
+             dy33             = _fjsp_sub_v2r8(iy3,jy3);
+             dz33             = _fjsp_sub_v2r8(iz3,jz3);
+             /* Calculate squared distance and things based on it */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+             rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+             rsq13            = gmx_fjsp_calc_rsq_v2r8(dx13,dy13,dz13);
+             rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+             rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+             rsq23            = gmx_fjsp_calc_rsq_v2r8(dx23,dy23,dz23);
+             rsq31            = gmx_fjsp_calc_rsq_v2r8(dx31,dy31,dz31);
+             rsq32            = gmx_fjsp_calc_rsq_v2r8(dx32,dy32,dz32);
+             rsq33            = gmx_fjsp_calc_rsq_v2r8(dx33,dy33,dz33);
+             rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+             rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+             rinv13           = gmx_fjsp_invsqrt_v2r8(rsq13);
+             rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+             rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+             rinv23           = gmx_fjsp_invsqrt_v2r8(rsq23);
+             rinv31           = gmx_fjsp_invsqrt_v2r8(rsq31);
+             rinv32           = gmx_fjsp_invsqrt_v2r8(rsq32);
+             rinv33           = gmx_fjsp_invsqrt_v2r8(rsq33);
+             rinvsq00         = gmx_fjsp_inv_v2r8(rsq00);
+             rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+             rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+             rinvsq13         = _fjsp_mul_v2r8(rinv13,rinv13);
+             rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+             rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+             rinvsq23         = _fjsp_mul_v2r8(rinv23,rinv23);
+             rinvsq31         = _fjsp_mul_v2r8(rinv31,rinv31);
+             rinvsq32         = _fjsp_mul_v2r8(rinv32,rinv32);
+             rinvsq33         = _fjsp_mul_v2r8(rinv33,rinv33);
+             fjx0             = _fjsp_setzero_v2r8();
+             fjy0             = _fjsp_setzero_v2r8();
+             fjz0             = _fjsp_setzero_v2r8();
+             fjx1             = _fjsp_setzero_v2r8();
+             fjy1             = _fjsp_setzero_v2r8();
+             fjz1             = _fjsp_setzero_v2r8();
+             fjx2             = _fjsp_setzero_v2r8();
+             fjy2             = _fjsp_setzero_v2r8();
+             fjz2             = _fjsp_setzero_v2r8();
+             fjx3             = _fjsp_setzero_v2r8();
+             fjy3             = _fjsp_setzero_v2r8();
+             fjz3             = _fjsp_setzero_v2r8();
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* LENNARD-JONES DISPERSION/REPULSION */
+             rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+             fvdw             = _fjsp_mul_v2r8(_fjsp_msub_v2r8(c12_00,rinvsix,c6_00),_fjsp_mul_v2r8(rinvsix,rinvsq00));
+             fscal            = fvdw;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             
+             fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* REACTION-FIELD ELECTROSTATICS */
+             felec            = _fjsp_mul_v2r8(qq11,_fjsp_msub_v2r8(rinv11,rinvsq11,krf2));
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+             
+             fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* REACTION-FIELD ELECTROSTATICS */
+             felec            = _fjsp_mul_v2r8(qq12,_fjsp_msub_v2r8(rinv12,rinvsq12,krf2));
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+             
+             fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* REACTION-FIELD ELECTROSTATICS */
+             felec            = _fjsp_mul_v2r8(qq13,_fjsp_msub_v2r8(rinv13,rinvsq13,krf2));
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx13,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy13,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz13,fscal,fiz1);
+             
+             fjx3             = _fjsp_madd_v2r8(dx13,fscal,fjx3);
+             fjy3             = _fjsp_madd_v2r8(dy13,fscal,fjy3);
+             fjz3             = _fjsp_madd_v2r8(dz13,fscal,fjz3);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* REACTION-FIELD ELECTROSTATICS */
+             felec            = _fjsp_mul_v2r8(qq21,_fjsp_msub_v2r8(rinv21,rinvsq21,krf2));
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+             
+             fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* REACTION-FIELD ELECTROSTATICS */
+             felec            = _fjsp_mul_v2r8(qq22,_fjsp_msub_v2r8(rinv22,rinvsq22,krf2));
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+             
+             fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* REACTION-FIELD ELECTROSTATICS */
+             felec            = _fjsp_mul_v2r8(qq23,_fjsp_msub_v2r8(rinv23,rinvsq23,krf2));
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx23,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy23,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz23,fscal,fiz2);
+             
+             fjx3             = _fjsp_madd_v2r8(dx23,fscal,fjx3);
+             fjy3             = _fjsp_madd_v2r8(dy23,fscal,fjy3);
+             fjz3             = _fjsp_madd_v2r8(dz23,fscal,fjz3);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* REACTION-FIELD ELECTROSTATICS */
+             felec            = _fjsp_mul_v2r8(qq31,_fjsp_msub_v2r8(rinv31,rinvsq31,krf2));
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix3             = _fjsp_madd_v2r8(dx31,fscal,fix3);
+             fiy3             = _fjsp_madd_v2r8(dy31,fscal,fiy3);
+             fiz3             = _fjsp_madd_v2r8(dz31,fscal,fiz3);
+             
+             fjx1             = _fjsp_madd_v2r8(dx31,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy31,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz31,fscal,fjz1);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* REACTION-FIELD ELECTROSTATICS */
+             felec            = _fjsp_mul_v2r8(qq32,_fjsp_msub_v2r8(rinv32,rinvsq32,krf2));
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix3             = _fjsp_madd_v2r8(dx32,fscal,fix3);
+             fiy3             = _fjsp_madd_v2r8(dy32,fscal,fiy3);
+             fiz3             = _fjsp_madd_v2r8(dz32,fscal,fiz3);
+             
+             fjx2             = _fjsp_madd_v2r8(dx32,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy32,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz32,fscal,fjz2);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* REACTION-FIELD ELECTROSTATICS */
+             felec            = _fjsp_mul_v2r8(qq33,_fjsp_msub_v2r8(rinv33,rinvsq33,krf2));
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix3             = _fjsp_madd_v2r8(dx33,fscal,fix3);
+             fiy3             = _fjsp_madd_v2r8(dy33,fscal,fiy3);
+             fiz3             = _fjsp_madd_v2r8(dz33,fscal,fiz3);
+             
+             fjx3             = _fjsp_madd_v2r8(dx33,fscal,fjx3);
+             fjy3             = _fjsp_madd_v2r8(dy33,fscal,fjy3);
+             fjz3             = _fjsp_madd_v2r8(dz33,fscal,fjz3);
+             gmx_fjsp_decrement_4rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+             /* Inner loop uses 303 flops */
+         }
+         /* End of innermost loop */
+         gmx_fjsp_update_iforce_4atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                               f+i_coord_offset,fshift+i_shift_offset);
+         /* Increment number of inner iterations */
+         inneriter                  += j_index_end - j_index_start;
+         /* Outer loop uses 24 flops */
+     }
+     /* Increment number of outer iterations */
+     outeriter        += nri;
+     /* Update outer/inner flops */
+     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_F,outeriter*24 + inneriter*303);
+ }
index 0000000000000000000000000000000000000000,7155768e77da8e958776f24c8ab43eb7b570798e..7155768e77da8e958776f24c8ab43eb7b570798e
mode 000000,100644..100644
--- /dev/null
@@@ -1,0 -1,486 +1,486 @@@
+ /*
+  * This file is part of the GROMACS molecular simulation package.
+  *
+  * Copyright (c) 2012, by the GROMACS development team, led by
+  * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+  * others, as listed in the AUTHORS file in the top-level source
+  * directory and at http://www.gromacs.org.
+  *
+  * GROMACS is free software; you can redistribute it and/or
+  * modify it under the terms of the GNU Lesser General Public License
+  * as published by the Free Software Foundation; either version 2.1
+  * of the License, or (at your option) any later version.
+  *
+  * GROMACS is distributed in the hope that it will be useful,
+  * but WITHOUT ANY WARRANTY; without even the implied warranty of
+  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  * Lesser General Public License for more details.
+  *
+  * You should have received a copy of the GNU Lesser General Public
+  * License along with GROMACS; if not, see
+  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+  *
+  * If you want to redistribute modifications to GROMACS, please
+  * consider that scientific software is very special. Version
+  * control is crucial - bugs must be traceable. We will be happy to
+  * consider code for inclusion in the official distribution, but
+  * derived work must not be called official GROMACS. Details are found
+  * in the README & COPYING files - if they are missing, get the
+  * official version at http://www.gromacs.org.
+  *
+  * To help us fund GROMACS development, we humbly ask that you cite
+  * the research papers on the package. Check out http://www.gromacs.org.
+  */
+ /*
+  * Note: this file was generated by the GROMACS sparc64_hpc_ace_double kernel generator.
+  */
+ #ifdef HAVE_CONFIG_H
+ #include <config.h>
+ #endif
+ #include <math.h>
+ #include "../nb_kernel.h"
+ #include "types/simple.h"
+ #include "vec.h"
+ #include "nrnb.h"
+ #include "kernelutil_sparc64_hpc_ace_double.h"
+ /*
+  * Gromacs nonbonded kernel:   nb_kernel_ElecRF_VdwNone_GeomP1P1_VF_sparc64_hpc_ace_double
+  * Electrostatics interaction: ReactionField (constants read from fr->ic->k_rf and fr->ic->c_rf)
+  * VdW interaction:            None (no Lennard-Jones term is evaluated here)
+  * Geometry:                   Particle-Particle (one i particle per list, j particles taken two at a time)
+  * Calculate force/pot:        PotentialAndForce (forces into ff, energies into kernel_data->energygrp_elec)
+  */
+ void
+ nb_kernel_ElecRF_VdwNone_GeomP1P1_VF_sparc64_hpc_ace_double
+                     (t_nblist * gmx_restrict                nlist,
+                      rvec * gmx_restrict                    xx,
+                      rvec * gmx_restrict                    ff,
+                      t_forcerec * gmx_restrict              fr,
+                      t_mdatoms * gmx_restrict               mdatoms,
+                      nb_kernel_data_t * gmx_restrict        kernel_data,
+                      t_nrnb * gmx_restrict                  nrnb)
+ {
+     /* Suffix 0 marks the single i/j particle of this particle-particle kernel (1,2,3 are only
+      * used by the water kernels). Suffixes A,B mark the two j entries (jnrA, jnrB) that the
+      * unrolled inner loop places in the two positions of a double precision SIMD register.
+      * Several locals declared below are never referenced in this kernel (e.g. tx, rcutoff,
+      * c6_00, itab_tmp, one, two, vfconv); presumably leftovers of the generator template. */
+     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+     int              jnrA,jnrB;
+     int              j_coord_offsetA,j_coord_offsetB;
+     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+     real             rcutoff_scalar;
+     real             *shiftvec,*fshift,*x,*f;
+     _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+     int              vdwioffset0;
+     _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+     int              vdwjidx0A,vdwjidx0B;
+     _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+     _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+     _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+     real             *charge;
+     _fjsp_v2r8       itab_tmp;
+     _fjsp_v2r8       dummy_mask,cutoff_mask;
+     _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+     _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+     union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+     x                = xx[0];
+     f                = ff[0];
+     nri              = nlist->nri;
+     iinr             = nlist->iinr;
+     jindex           = nlist->jindex;
+     jjnr             = nlist->jjnr;
+     shiftidx         = nlist->shift;
+     gid              = nlist->gid;
+     shiftvec         = fr->shift_vec[0];
+     fshift           = fr->fshift[0];
+     facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+     charge           = mdatoms->chargeA;
+     krf              = gmx_fjsp_set1_v2r8(fr->ic->k_rf);
+     krf2             = gmx_fjsp_set1_v2r8(fr->ic->k_rf*2.0);
+     crf              = gmx_fjsp_set1_v2r8(fr->ic->c_rf);
+     /* Pre-set the j indices/offsets so the compiler does not warn about uninitialized use */
+     jnrA = jnrB = 0;
+     j_coord_offsetA = 0;
+     j_coord_offsetB = 0;
+     outeriter        = 0;
+     inneriter        = 0;
+     /* Outer loop: one iteration per neighborlist entry (nri of them), each with its own i particle */
+     for(iidx=0; iidx<nri; iidx++)
+     {
+         /* Offset of this list's shift vector in the flat shiftvec and fshift arrays */
+         i_shift_offset   = DIM*shiftidx[iidx];
+         /* jjnr[j_index_start] .. jjnr[j_index_end-1] are the j neighbors of this i particle */
+         j_index_start    = jindex[iidx];
+         j_index_end      = jindex[iidx+1];
+         /* i particle index and its offset into the flat coordinate/force arrays */
+         inr              = iinr[iidx];
+         i_coord_offset   = DIM*inr;
+         /* Load i particle coords and add shift vector; the i force accumulators start at zero */
+         gmx_fjsp_load_shift_and_1rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
+         fix0             = _fjsp_setzero_v2r8();
+         fiy0             = _fjsp_setzero_v2r8();
+         fiz0             = _fjsp_setzero_v2r8();
+         /* i charge, pre-multiplied by facel (fr->epsfac) */
+         iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_load1_v2r8(charge+inr+0));
+         /* Reset the electrostatic energy accumulator for this i particle */
+         velecsum         = _fjsp_setzero_v2r8();
+         /* Unrolled inner loop: two j particles (A and B) per pass; an odd last one is handled after the loop */
+         for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+         {
+             /* Get j neighbor index, and coordinate index */
+             jnrA             = jjnr[jidx];
+             jnrB             = jjnr[jidx+1];
+             j_coord_offsetA  = DIM*jnrA;
+             j_coord_offsetB  = DIM*jnrB;
+             /* Load coordinates of j particles A and B */
+             gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                               &jx0,&jy0,&jz0);
+             /* Displacement vector d = x_i - x_j */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             /* Squared distance, its inverse square root (rinv00) and rinv00 squared */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+             rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+             /* Charges of j particles A and B */
+             jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* Charge product; iq0 already carries the facel factor */
+             qq00             = _fjsp_mul_v2r8(iq0,jq0);
+             /* REACTION-FIELD ELECTROSTATICS. NOTE(review): if madd(a,b,c)=a*b+c and msub(a,b,c)=a*b-c, velec=qq*(krf*rsq+rinv-crf), felec=qq*(rinv*rinvsq-krf2) -- confirm */
+             velec            = _fjsp_mul_v2r8(qq00,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq00,rinv00),crf));
+             felec            = _fjsp_mul_v2r8(qq00,_fjsp_msub_v2r8(rinv00,rinvsq00,krf2));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             /* Accumulate into the i force (fix0/fiy0/fiz0); the helper below is handed the j entries of f */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             
+             gmx_fjsp_decrement_fma_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fscal,dx00,dy00,dz00);
+             /* Inner loop uses 35 flops (the per-pair figure passed to inc_nrnb at the end) */
+         }
+         if(jidx<j_index_end)
+         {
+             jnrA             = jjnr[jidx];
+             j_coord_offsetA  = DIM*jnrA;
+             /* Remainder: only j particle A is left; load its coordinates through the 1-pointer helper */
+             gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                               &jx0,&jy0,&jz0);
+             /* Displacement vector d = x_i - x_j */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             /* Squared distance, its inverse square root (rinv00) and rinv00 squared */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+             rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+             /* Charge of j particle A loaded on top of a zero vector (presumably low position only -- TODO confirm) */
+             jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* Charge product; iq0 already carries the facel factor */
+             qq00             = _fjsp_mul_v2r8(iq0,jq0);
+             /* REACTION-FIELD ELECTROSTATICS, same expressions as in the unrolled loop above */
+             velec            = _fjsp_mul_v2r8(qq00,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq00,rinv00),crf));
+             felec            = _fjsp_mul_v2r8(qq00,_fjsp_msub_v2r8(rinv00,rinvsq00,krf2));
+             /* Update potential sum; unpacklo with a zero vector presumably blanks the unused second position first -- TODO confirm */
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Accumulate into the i force using the masked fscal; the helper below is handed the j entry of f */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             
+             gmx_fjsp_decrement_fma_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fscal,dx00,dy00,dz00);
+             /* Inner loop uses 35 flops (the per-pair figure passed to inc_nrnb at the end) */
+         }
+         /* End of innermost loop; hand the accumulated i force to the helper together with the f and fshift slots */
+         gmx_fjsp_update_iforce_1atom_swizzle_v2r8(fix0,fiy0,fiz0,
+                                               f+i_coord_offset,fshift+i_shift_offset);
+         ggid                        = gid[iidx];
+         /* Pass this list's energy sum to the energy-group slot selected by ggid */
+         gmx_fjsp_update_1pot_v2r8(velecsum,kernel_data->energygrp_elec+ggid);
+         /* Count the j entries processed for this list */
+         inneriter                  += j_index_end - j_index_start;
+         /* Outer loop uses 8 flops */
+     }
+     /* Increment number of outer iterations */
+     outeriter        += nri;
+     /* Report the flop estimate: 8 per outer iteration plus 35 per j entry */
+     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VF,outeriter*8 + inneriter*35);
+ }
+ /*
+  * Gromacs nonbonded kernel:   nb_kernel_ElecRF_VdwNone_GeomP1P1_F_sparc64_hpc_ace_double
+  * Electrostatics interaction: ReactionField (only fr->ic->k_rf*2.0 enters the force expression)
+  * VdW interaction:            None (no Lennard-Jones term is evaluated here)
+  * Geometry:                   Particle-Particle (one i particle per list, j particles taken two at a time)
+  * Calculate force/pot:        Force (no energy is accumulated; kernel_data is never referenced)
+  */
+ void
+ nb_kernel_ElecRF_VdwNone_GeomP1P1_F_sparc64_hpc_ace_double
+                     (t_nblist * gmx_restrict                nlist,
+                      rvec * gmx_restrict                    xx,
+                      rvec * gmx_restrict                    ff,
+                      t_forcerec * gmx_restrict              fr,
+                      t_mdatoms * gmx_restrict               mdatoms,
+                      nb_kernel_data_t * gmx_restrict        kernel_data,
+                      t_nrnb * gmx_restrict                  nrnb)
+ {
+     /* Suffix 0 marks the single i/j particle of this particle-particle kernel (1,2,3 are only
+      * used by the water kernels). Suffixes A,B mark the two j entries (jnrA, jnrB) that the
+      * unrolled inner loop places in the two positions of a double precision SIMD register.
+      * Several locals declared below are never referenced in this kernel (e.g. tx, rcutoff,
+      * velec, velecsum, ggid, one, two, vfconv); krf, crf and gid are assigned but never read. */
+     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+     int              jnrA,jnrB;
+     int              j_coord_offsetA,j_coord_offsetB;
+     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+     real             rcutoff_scalar;
+     real             *shiftvec,*fshift,*x,*f;
+     _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+     int              vdwioffset0;
+     _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+     int              vdwjidx0A,vdwjidx0B;
+     _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+     _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+     _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+     real             *charge;
+     _fjsp_v2r8       itab_tmp;
+     _fjsp_v2r8       dummy_mask,cutoff_mask;
+     _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+     _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+     union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+     x                = xx[0];
+     f                = ff[0];
+     nri              = nlist->nri;
+     iinr             = nlist->iinr;
+     jindex           = nlist->jindex;
+     jjnr             = nlist->jjnr;
+     shiftidx         = nlist->shift;
+     gid              = nlist->gid;
+     shiftvec         = fr->shift_vec[0];
+     fshift           = fr->fshift[0];
+     facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+     charge           = mdatoms->chargeA;
+     krf              = gmx_fjsp_set1_v2r8(fr->ic->k_rf);
+     krf2             = gmx_fjsp_set1_v2r8(fr->ic->k_rf*2.0);
+     crf              = gmx_fjsp_set1_v2r8(fr->ic->c_rf);
+     /* Pre-set the j indices/offsets so the compiler does not warn about uninitialized use */
+     jnrA = jnrB = 0;
+     j_coord_offsetA = 0;
+     j_coord_offsetB = 0;
+     outeriter        = 0;
+     inneriter        = 0;
+     /* Outer loop: one iteration per neighborlist entry (nri of them), each with its own i particle */
+     for(iidx=0; iidx<nri; iidx++)
+     {
+         /* Offset of this list's shift vector in the flat shiftvec and fshift arrays */
+         i_shift_offset   = DIM*shiftidx[iidx];
+         /* jjnr[j_index_start] .. jjnr[j_index_end-1] are the j neighbors of this i particle */
+         j_index_start    = jindex[iidx];
+         j_index_end      = jindex[iidx+1];
+         /* i particle index and its offset into the flat coordinate/force arrays */
+         inr              = iinr[iidx];
+         i_coord_offset   = DIM*inr;
+         /* Load i particle coords and add shift vector; the i force accumulators start at zero */
+         gmx_fjsp_load_shift_and_1rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
+         fix0             = _fjsp_setzero_v2r8();
+         fiy0             = _fjsp_setzero_v2r8();
+         fiz0             = _fjsp_setzero_v2r8();
+         /* i charge, pre-multiplied by facel (fr->epsfac) */
+         iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_load1_v2r8(charge+inr+0));
+         /* Unrolled inner loop: two j particles (A and B) per pass; an odd last one is handled after the loop */
+         for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+         {
+             /* Get j neighbor index, and coordinate index */
+             jnrA             = jjnr[jidx];
+             jnrB             = jjnr[jidx+1];
+             j_coord_offsetA  = DIM*jnrA;
+             j_coord_offsetB  = DIM*jnrB;
+             /* Load coordinates of j particles A and B */
+             gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                               &jx0,&jy0,&jz0);
+             /* Displacement vector d = x_i - x_j */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             /* Squared distance, its inverse square root (rinv00) and rinv00 squared */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+             rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+             /* Charges of j particles A and B */
+             jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* Charge product; iq0 already carries the facel factor */
+             qq00             = _fjsp_mul_v2r8(iq0,jq0);
+             /* REACTION-FIELD ELECTROSTATICS. NOTE(review): if msub(a,b,c)=a*b-c, felec=qq*(rinv*rinvsq-krf2) -- confirm */
+             felec            = _fjsp_mul_v2r8(qq00,_fjsp_msub_v2r8(rinv00,rinvsq00,krf2));
+             fscal            = felec;
+             /* Accumulate into the i force (fix0/fiy0/fiz0); the helper below is handed the j entries of f */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             
+             gmx_fjsp_decrement_fma_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fscal,dx00,dy00,dz00);
+             /* Inner loop uses 30 flops (the per-pair figure passed to inc_nrnb at the end) */
+         }
+         if(jidx<j_index_end)
+         {
+             jnrA             = jjnr[jidx];
+             j_coord_offsetA  = DIM*jnrA;
+             /* Remainder: only j particle A is left; load its coordinates through the 1-pointer helper */
+             gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                               &jx0,&jy0,&jz0);
+             /* Displacement vector d = x_i - x_j */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             /* Squared distance, its inverse square root (rinv00) and rinv00 squared */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+             rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+             /* Charge of j particle A loaded on top of a zero vector (presumably low position only -- TODO confirm) */
+             jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* Charge product; iq0 already carries the facel factor */
+             qq00             = _fjsp_mul_v2r8(iq0,jq0);
+             /* REACTION-FIELD ELECTROSTATICS; unpacklo with a zero vector presumably blanks the unused second position -- TODO confirm */
+             felec            = _fjsp_mul_v2r8(qq00,_fjsp_msub_v2r8(rinv00,rinvsq00,krf2));
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Accumulate into the i force using the masked fscal; the helper below is handed the j entry of f */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             
+             gmx_fjsp_decrement_fma_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fscal,dx00,dy00,dz00);
+             /* Inner loop uses 30 flops (the per-pair figure passed to inc_nrnb at the end) */
+         }
+         /* End of innermost loop; hand the accumulated i force to the helper together with the f and fshift slots */
+         gmx_fjsp_update_iforce_1atom_swizzle_v2r8(fix0,fiy0,fiz0,
+                                               f+i_coord_offset,fshift+i_shift_offset);
+         /* Count the j entries processed for this list */
+         inneriter                  += j_index_end - j_index_start;
+         /* Outer loop uses 7 flops */
+     }
+     /* Increment number of outer iterations */
+     outeriter        += nri;
+     /* Report the flop estimate: 7 per outer iteration plus 30 per j entry */
+     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_F,outeriter*7 + inneriter*30);
+ }
index 0000000000000000000000000000000000000000,e57c2e181b6d56e5dd1ce11a61081ed6775092ce..e57c2e181b6d56e5dd1ce11a61081ed6775092ce
mode 000000,100644..100644
--- /dev/null
@@@ -1,0 -1,792 +1,792 @@@
+ /*
+  * This file is part of the GROMACS molecular simulation package.
+  *
+  * Copyright (c) 2012, by the GROMACS development team, led by
+  * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+  * others, as listed in the AUTHORS file in the top-level source
+  * directory and at http://www.gromacs.org.
+  *
+  * GROMACS is free software; you can redistribute it and/or
+  * modify it under the terms of the GNU Lesser General Public License
+  * as published by the Free Software Foundation; either version 2.1
+  * of the License, or (at your option) any later version.
+  *
+  * GROMACS is distributed in the hope that it will be useful,
+  * but WITHOUT ANY WARRANTY; without even the implied warranty of
+  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  * Lesser General Public License for more details.
+  *
+  * You should have received a copy of the GNU Lesser General Public
+  * License along with GROMACS; if not, see
+  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+  *
+  * If you want to redistribute modifications to GROMACS, please
+  * consider that scientific software is very special. Version
+  * control is crucial - bugs must be traceable. We will be happy to
+  * consider code for inclusion in the official distribution, but
+  * derived work must not be called official GROMACS. Details are found
+  * in the README & COPYING files - if they are missing, get the
+  * official version at http://www.gromacs.org.
+  *
+  * To help us fund GROMACS development, we humbly ask that you cite
+  * the research papers on the package. Check out http://www.gromacs.org.
+  */
+ /*
+  * Note: this file was generated by the GROMACS sparc64_hpc_ace_double kernel generator.
+  */
+ #ifdef HAVE_CONFIG_H
+ #include <config.h>
+ #endif
+ #include <math.h>
+ #include "../nb_kernel.h"
+ #include "types/simple.h"
+ #include "vec.h"
+ #include "nrnb.h"
+ #include "kernelutil_sparc64_hpc_ace_double.h"
+ /*
+  * Gromacs nonbonded kernel:   nb_kernel_ElecRF_VdwNone_GeomW3P1_VF_sparc64_hpc_ace_double
+  * Electrostatics interaction: ReactionField
+  * VdW interaction:            None
+  * Geometry:                   Water3-Particle
+  * Calculate force/pot:        PotentialAndForce
+  *
+  * For every neighborlist entry, the three sites of the i water are paired with each
+  * j particle in that entry's j range (jindex[iidx] up to jindex[iidx+1]).
+  * j forces are handed to the gmx_fjsp_decrement_1rvec helpers, i forces to
+  * gmx_fjsp_update_iforce_3atom_swizzle_v2r8 (which also receives the shift-force
+  * pointer), and the summed electrostatic potential to gmx_fjsp_update_1pot_v2r8
+  * for the energy-group slot selected by nlist->gid.
+  *
+  * Parameters:
+  *   nlist       - neighborlist; nri, iinr, jindex, jjnr, shift and gid are read.
+  *   xx          - coordinates, accessed as a flat real array through xx[0].
+  *   ff          - forces, accessed as a flat real array through ff[0].
+  *   fr          - supplies shift_vec, fshift, epsfac, ic->k_rf and ic->c_rf.
+  *   mdatoms     - supplies chargeA.
+  *   kernel_data - energygrp_elec is the target of the potential update.
+  *   nrnb        - flop accounting, updated through inc_nrnb().
+  */
+ void
+ nb_kernel_ElecRF_VdwNone_GeomW3P1_VF_sparc64_hpc_ace_double
+                     (t_nblist * gmx_restrict                nlist,
+                      rvec * gmx_restrict                    xx,
+                      rvec * gmx_restrict                    ff,
+                      t_forcerec * gmx_restrict              fr,
+                      t_mdatoms * gmx_restrict               mdatoms,
+                      nb_kernel_data_t * gmx_restrict        kernel_data,
+                      t_nrnb * gmx_restrict                  nrnb)
+ {
+     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+      * just 0 for non-waters.
+      * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+      * jnr indices corresponding to data put in the two positions in the SIMD register.
+      * Several of the locals declared below (e.g. tx, rcutoff, vdwioffset0, isai0, c6_00, itab_tmp,
+      * dummy_mask, vfconv) are never referenced in this kernel variant.
+      */
+     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+     int              jnrA,jnrB;
+     int              j_coord_offsetA,j_coord_offsetB;
+     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+     real             rcutoff_scalar;
+     real             *shiftvec,*fshift,*x,*f;
+     _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+     int              vdwioffset0;
+     _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+     int              vdwioffset1;
+     _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+     int              vdwioffset2;
+     _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+     int              vdwjidx0A,vdwjidx0B;
+     _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+     _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+     _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+     _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+     _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+     real             *charge;
+     _fjsp_v2r8       itab_tmp;
+     _fjsp_v2r8       dummy_mask,cutoff_mask;
+     _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+     _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+     union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+     x                = xx[0];
+     f                = ff[0];
+     nri              = nlist->nri;
+     iinr             = nlist->iinr;
+     jindex           = nlist->jindex;
+     jjnr             = nlist->jjnr;
+     shiftidx         = nlist->shift;
+     gid              = nlist->gid;
+     shiftvec         = fr->shift_vec[0];
+     fshift           = fr->fshift[0];
+     facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+     charge           = mdatoms->chargeA;
+     krf              = gmx_fjsp_set1_v2r8(fr->ic->k_rf);
+     krf2             = gmx_fjsp_set1_v2r8(fr->ic->k_rf*2.0);
+     crf              = gmx_fjsp_set1_v2r8(fr->ic->c_rf);
+     /* Setup water-specific parameters.
+      * The three i-site charges are read once, from the first list entry (nlist->iinr[0]),
+      * combined with facel, and then reused unchanged for every i entry in the outer loop.
+      * NOTE(review): iinr[0] is read here even when nri is 0 - confirm callers never pass
+      * an empty list.
+      */
+     inr              = nlist->iinr[0];
+     iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+0]));
+     iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+     iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+     /* Give the j indices and offsets defined values up front so the compiler does not
+      * warn about possibly uninitialised use.
+      */
+     jnrA = jnrB = 0;
+     j_coord_offsetA = 0;
+     j_coord_offsetB = 0;
+     outeriter        = 0;
+     inneriter        = 0;
+     /* Start outer loop over neighborlists */
+     for(iidx=0; iidx<nri; iidx++)
+     {
+         /* Load shift vector for this list */
+         i_shift_offset   = DIM*shiftidx[iidx];
+         /* Load limits for loop over neighbors */
+         j_index_start    = jindex[iidx];
+         j_index_end      = jindex[iidx+1];
+         /* Get outer coordinate index */
+         inr              = iinr[iidx];
+         i_coord_offset   = DIM*inr;
+         /* Load i particle coords and add shift vector */
+         gmx_fjsp_load_shift_and_3rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                  &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+         fix0             = _fjsp_setzero_v2r8();
+         fiy0             = _fjsp_setzero_v2r8();
+         fiz0             = _fjsp_setzero_v2r8();
+         fix1             = _fjsp_setzero_v2r8();
+         fiy1             = _fjsp_setzero_v2r8();
+         fiz1             = _fjsp_setzero_v2r8();
+         fix2             = _fjsp_setzero_v2r8();
+         fiy2             = _fjsp_setzero_v2r8();
+         fiz2             = _fjsp_setzero_v2r8();
+         /* Reset potential sums */
+         velecsum         = _fjsp_setzero_v2r8();
+         /* Start inner kernel loop.
+          * j particles are taken two per iteration (jnrA and jnrB); when the j range has an
+          * odd length the last j particle is left for the branch that follows this loop.
+          */
+         for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+         {
+             /* Get j neighbor index, and coordinate index */
+             jnrA             = jjnr[jidx];
+             jnrB             = jjnr[jidx+1];
+             j_coord_offsetA  = DIM*jnrA;
+             j_coord_offsetB  = DIM*jnrB;
+             /* load j atom coordinates */
+             gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                               &jx0,&jy0,&jz0);
+             /* Calculate displacement vector */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             dx10             = _fjsp_sub_v2r8(ix1,jx0);
+             dy10             = _fjsp_sub_v2r8(iy1,jy0);
+             dz10             = _fjsp_sub_v2r8(iz1,jz0);
+             dx20             = _fjsp_sub_v2r8(ix2,jx0);
+             dy20             = _fjsp_sub_v2r8(iy2,jy0);
+             dz20             = _fjsp_sub_v2r8(iz2,jz0);
+             /* Calculate squared distance and things based on it */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+             rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+             rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+             rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+             rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+             rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+             rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+             rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+             /* Load parameters for j particles */
+             jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+             fjx0             = _fjsp_setzero_v2r8();
+             fjy0             = _fjsp_setzero_v2r8();
+             fjz0             = _fjsp_setzero_v2r8();
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* Compute parameters for interactions between i and j atoms */
+             qq00             = _fjsp_mul_v2r8(iq0,jq0);
+             /* REACTION-FIELD ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq00,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq00,rinv00),crf));
+             felec            = _fjsp_mul_v2r8(qq00,_fjsp_msub_v2r8(rinv00,rinvsq00,krf2));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             
+             fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* Compute parameters for interactions between i and j atoms */
+             qq10             = _fjsp_mul_v2r8(iq1,jq0);
+             /* REACTION-FIELD ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq10,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq10,rinv10),crf));
+             felec            = _fjsp_mul_v2r8(qq10,_fjsp_msub_v2r8(rinv10,rinvsq10,krf2));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+             
+             fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* Compute parameters for interactions between i and j atoms */
+             qq20             = _fjsp_mul_v2r8(iq2,jq0);
+             /* REACTION-FIELD ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq20,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq20,rinv20),crf));
+             felec            = _fjsp_mul_v2r8(qq20,_fjsp_msub_v2r8(rinv20,rinvsq20,krf2));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+             
+             fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+             gmx_fjsp_decrement_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0);
+             /* Inner loop uses 108 flops */
+         }
+         if(jidx<j_index_end)
+         {
+             jnrA             = jjnr[jidx];
+             j_coord_offsetA  = DIM*jnrA;
+             /* load j atom coordinates.
+              * Remainder branch: only one j particle (jnrA) is left, so the single-pointer
+              * load and decrement helpers are used. Further down, velec and fscal are each
+              * combined with a zero vector through _fjsp_unpacklo_v2r8 before being
+              * accumulated - presumably to clear the lane that has no j particle; confirm
+              * against the intrinsic's definition.
+              */
+             gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                               &jx0,&jy0,&jz0);
+             /* Calculate displacement vector */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             dx10             = _fjsp_sub_v2r8(ix1,jx0);
+             dy10             = _fjsp_sub_v2r8(iy1,jy0);
+             dz10             = _fjsp_sub_v2r8(iz1,jz0);
+             dx20             = _fjsp_sub_v2r8(ix2,jx0);
+             dy20             = _fjsp_sub_v2r8(iy2,jy0);
+             dz20             = _fjsp_sub_v2r8(iz2,jz0);
+             /* Calculate squared distance and things based on it */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+             rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+             rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+             rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+             rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+             rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+             rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+             rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+             /* Load parameters for j particles */
+             jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
+             fjx0             = _fjsp_setzero_v2r8();
+             fjy0             = _fjsp_setzero_v2r8();
+             fjz0             = _fjsp_setzero_v2r8();
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* Compute parameters for interactions between i and j atoms */
+             qq00             = _fjsp_mul_v2r8(iq0,jq0);
+             /* REACTION-FIELD ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq00,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq00,rinv00),crf));
+             felec            = _fjsp_mul_v2r8(qq00,_fjsp_msub_v2r8(rinv00,rinvsq00,krf2));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             
+             fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* Compute parameters for interactions between i and j atoms */
+             qq10             = _fjsp_mul_v2r8(iq1,jq0);
+             /* REACTION-FIELD ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq10,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq10,rinv10),crf));
+             felec            = _fjsp_mul_v2r8(qq10,_fjsp_msub_v2r8(rinv10,rinvsq10,krf2));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+             
+             fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* Compute parameters for interactions between i and j atoms */
+             qq20             = _fjsp_mul_v2r8(iq2,jq0);
+             /* REACTION-FIELD ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq20,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq20,rinv20),crf));
+             felec            = _fjsp_mul_v2r8(qq20,_fjsp_msub_v2r8(rinv20,rinvsq20,krf2));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+             
+             fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+             gmx_fjsp_decrement_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0);
+             /* Inner loop uses 108 flops */
+         }
+         /* End of innermost loop */
+         gmx_fjsp_update_iforce_3atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                               f+i_coord_offset,fshift+i_shift_offset);
+         ggid                        = gid[iidx];
+         /* Update potential energies */
+         gmx_fjsp_update_1pot_v2r8(velecsum,kernel_data->energygrp_elec+ggid);
+         /* Increment number of inner iterations (counts j particles, not loop passes) */
+         inneriter                  += j_index_end - j_index_start;
+         /* Outer loop uses 19 flops */
+     }
+     /* Increment number of outer iterations */
+     outeriter        += nri;
+     /* Update outer/inner flops: 19 per i entry plus 108 per j particle, matching the
+      * per-loop flop counts noted above.
+      */
+     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3_VF,outeriter*19 + inneriter*108);
+ }
+ /*
+  * Gromacs nonbonded kernel:   nb_kernel_ElecRF_VdwNone_GeomW3P1_F_sparc64_hpc_ace_double
+  * Electrostatics interaction: ReactionField
+  * VdW interaction:            None
+  * Geometry:                   Water3-Particle
+  * Calculate force/pot:        Force
+  *
+  * For every neighborlist entry, the three sites of the i water are paired with each
+  * j particle in that entry's j range (jindex[iidx] up to jindex[iidx+1]).
+  * j forces are handed to the gmx_fjsp_decrement_1rvec helpers and i forces to
+  * gmx_fjsp_update_iforce_3atom_swizzle_v2r8 (which also receives the shift-force
+  * pointer). No potential is computed and kernel_data is not accessed.
+  *
+  * Parameters:
+  *   nlist       - neighborlist; nri, iinr, jindex, jjnr, shift and gid are read.
+  *   xx          - coordinates, accessed as a flat real array through xx[0].
+  *   ff          - forces, accessed as a flat real array through ff[0].
+  *   fr          - supplies shift_vec, fshift, epsfac, ic->k_rf and ic->c_rf.
+  *   mdatoms     - supplies chargeA.
+  *   kernel_data - not referenced in this force-only variant.
+  *   nrnb        - flop accounting, updated through inc_nrnb().
+  */
+ void
+ nb_kernel_ElecRF_VdwNone_GeomW3P1_F_sparc64_hpc_ace_double
+                     (t_nblist * gmx_restrict                nlist,
+                      rvec * gmx_restrict                    xx,
+                      rvec * gmx_restrict                    ff,
+                      t_forcerec * gmx_restrict              fr,
+                      t_mdatoms * gmx_restrict               mdatoms,
+                      nb_kernel_data_t * gmx_restrict        kernel_data,
+                      t_nrnb * gmx_restrict                  nrnb)
+ {
+     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+      * just 0 for non-waters.
+      * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+      * jnr indices corresponding to data put in the two positions in the SIMD register.
+      * Several of the locals declared below (e.g. tx, rcutoff, vdwioffset0, isai0, c6_00, itab_tmp,
+      * dummy_mask, vfconv, velec, velecsum, ggid) are never referenced in this kernel variant;
+      * krf, crf and gid are assigned during setup but not read afterwards.
+      */
+     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+     int              jnrA,jnrB;
+     int              j_coord_offsetA,j_coord_offsetB;
+     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+     real             rcutoff_scalar;
+     real             *shiftvec,*fshift,*x,*f;
+     _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+     int              vdwioffset0;
+     _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+     int              vdwioffset1;
+     _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+     int              vdwioffset2;
+     _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+     int              vdwjidx0A,vdwjidx0B;
+     _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+     _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+     _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+     _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+     _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+     real             *charge;
+     _fjsp_v2r8       itab_tmp;
+     _fjsp_v2r8       dummy_mask,cutoff_mask;
+     _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+     _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+     union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+     x                = xx[0];
+     f                = ff[0];
+     nri              = nlist->nri;
+     iinr             = nlist->iinr;
+     jindex           = nlist->jindex;
+     jjnr             = nlist->jjnr;
+     shiftidx         = nlist->shift;
+     gid              = nlist->gid;
+     shiftvec         = fr->shift_vec[0];
+     fshift           = fr->fshift[0];
+     facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+     charge           = mdatoms->chargeA;
+     krf              = gmx_fjsp_set1_v2r8(fr->ic->k_rf);
+     krf2             = gmx_fjsp_set1_v2r8(fr->ic->k_rf*2.0);
+     crf              = gmx_fjsp_set1_v2r8(fr->ic->c_rf);
+     /* Setup water-specific parameters.
+      * The three i-site charges are read once, from the first list entry (nlist->iinr[0]),
+      * combined with facel, and then reused unchanged for every i entry in the outer loop.
+      * NOTE(review): iinr[0] is read here even when nri is 0 - confirm callers never pass
+      * an empty list.
+      */
+     inr              = nlist->iinr[0];
+     iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+0]));
+     iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+     iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+     /* Give the j indices and offsets defined values up front so the compiler does not
+      * warn about possibly uninitialised use.
+      */
+     jnrA = jnrB = 0;
+     j_coord_offsetA = 0;
+     j_coord_offsetB = 0;
+     outeriter        = 0;
+     inneriter        = 0;
+     /* Start outer loop over neighborlists */
+     for(iidx=0; iidx<nri; iidx++)
+     {
+         /* Load shift vector for this list */
+         i_shift_offset   = DIM*shiftidx[iidx];
+         /* Load limits for loop over neighbors */
+         j_index_start    = jindex[iidx];
+         j_index_end      = jindex[iidx+1];
+         /* Get outer coordinate index */
+         inr              = iinr[iidx];
+         i_coord_offset   = DIM*inr;
+         /* Load i particle coords and add shift vector */
+         gmx_fjsp_load_shift_and_3rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                  &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+         fix0             = _fjsp_setzero_v2r8();
+         fiy0             = _fjsp_setzero_v2r8();
+         fiz0             = _fjsp_setzero_v2r8();
+         fix1             = _fjsp_setzero_v2r8();
+         fiy1             = _fjsp_setzero_v2r8();
+         fiz1             = _fjsp_setzero_v2r8();
+         fix2             = _fjsp_setzero_v2r8();
+         fiy2             = _fjsp_setzero_v2r8();
+         fiz2             = _fjsp_setzero_v2r8();
+         /* Start inner kernel loop.
+          * j particles are taken two per iteration (jnrA and jnrB); when the j range has an
+          * odd length the last j particle is left for the branch that follows this loop.
+          */
+         for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+         {
+             /* Get j neighbor index, and coordinate index */
+             jnrA             = jjnr[jidx];
+             jnrB             = jjnr[jidx+1];
+             j_coord_offsetA  = DIM*jnrA;
+             j_coord_offsetB  = DIM*jnrB;
+             /* load j atom coordinates */
+             gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                               &jx0,&jy0,&jz0);
+             /* Calculate displacement vector */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             dx10             = _fjsp_sub_v2r8(ix1,jx0);
+             dy10             = _fjsp_sub_v2r8(iy1,jy0);
+             dz10             = _fjsp_sub_v2r8(iz1,jz0);
+             dx20             = _fjsp_sub_v2r8(ix2,jx0);
+             dy20             = _fjsp_sub_v2r8(iy2,jy0);
+             dz20             = _fjsp_sub_v2r8(iz2,jz0);
+             /* Calculate squared distance and things based on it */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+             rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+             rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+             rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+             rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+             rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+             rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+             rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+             /* Load parameters for j particles */
+             jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+             fjx0             = _fjsp_setzero_v2r8();
+             fjy0             = _fjsp_setzero_v2r8();
+             fjz0             = _fjsp_setzero_v2r8();
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* Compute parameters for interactions between i and j atoms */
+             qq00             = _fjsp_mul_v2r8(iq0,jq0);
+             /* REACTION-FIELD ELECTROSTATICS */
+             felec            = _fjsp_mul_v2r8(qq00,_fjsp_msub_v2r8(rinv00,rinvsq00,krf2));
+             fscal            = felec;
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             
+             fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* Compute parameters for interactions between i and j atoms */
+             qq10             = _fjsp_mul_v2r8(iq1,jq0);
+             /* REACTION-FIELD ELECTROSTATICS */
+             felec            = _fjsp_mul_v2r8(qq10,_fjsp_msub_v2r8(rinv10,rinvsq10,krf2));
+             fscal            = felec;
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+             
+             fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* Compute parameters for interactions between i and j atoms */
+             qq20             = _fjsp_mul_v2r8(iq2,jq0);
+             /* REACTION-FIELD ELECTROSTATICS */
+             felec            = _fjsp_mul_v2r8(qq20,_fjsp_msub_v2r8(rinv20,rinvsq20,krf2));
+             fscal            = felec;
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+             
+             fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+             gmx_fjsp_decrement_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0);
+             /* Inner loop uses 93 flops */
+         }
+         if(jidx<j_index_end)
+         {
+             jnrA             = jjnr[jidx];
+             j_coord_offsetA  = DIM*jnrA;
+             /* load j atom coordinates.
+              * Remainder branch: only one j particle (jnrA) is left, so the single-pointer
+              * load and decrement helpers are used. Further down, fscal is combined with a
+              * zero vector through _fjsp_unpacklo_v2r8 before being accumulated - presumably
+              * to clear the lane that has no j particle; confirm against the intrinsic's
+              * definition.
+              */
+             gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                               &jx0,&jy0,&jz0);
+             /* Calculate displacement vector */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             dx10             = _fjsp_sub_v2r8(ix1,jx0);
+             dy10             = _fjsp_sub_v2r8(iy1,jy0);
+             dz10             = _fjsp_sub_v2r8(iz1,jz0);
+             dx20             = _fjsp_sub_v2r8(ix2,jx0);
+             dy20             = _fjsp_sub_v2r8(iy2,jy0);
+             dz20             = _fjsp_sub_v2r8(iz2,jz0);
+             /* Calculate squared distance and things based on it */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+             rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+             rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+             rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+             rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+             rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+             rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+             rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+             /* Load parameters for j particles */
+             jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
+             fjx0             = _fjsp_setzero_v2r8();
+             fjy0             = _fjsp_setzero_v2r8();
+             fjz0             = _fjsp_setzero_v2r8();
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* Compute parameters for interactions between i and j atoms */
+             qq00             = _fjsp_mul_v2r8(iq0,jq0);
+             /* REACTION-FIELD ELECTROSTATICS */
+             felec            = _fjsp_mul_v2r8(qq00,_fjsp_msub_v2r8(rinv00,rinvsq00,krf2));
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             
+             fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* Compute parameters for interactions between i and j atoms */
+             qq10             = _fjsp_mul_v2r8(iq1,jq0);
+             /* REACTION-FIELD ELECTROSTATICS */
+             felec            = _fjsp_mul_v2r8(qq10,_fjsp_msub_v2r8(rinv10,rinvsq10,krf2));
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+             
+             fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* Compute parameters for interactions between i and j atoms */
+             qq20             = _fjsp_mul_v2r8(iq2,jq0);
+             /* REACTION-FIELD ELECTROSTATICS */
+             felec            = _fjsp_mul_v2r8(qq20,_fjsp_msub_v2r8(rinv20,rinvsq20,krf2));
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+             
+             fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+             gmx_fjsp_decrement_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0);
+             /* Inner loop uses 93 flops */
+         }
+         /* End of innermost loop */
+         gmx_fjsp_update_iforce_3atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                               f+i_coord_offset,fshift+i_shift_offset);
+         /* Increment number of inner iterations (counts j particles, not loop passes) */
+         inneriter                  += j_index_end - j_index_start;
+         /* Outer loop uses 18 flops */
+     }
+     /* Increment number of outer iterations */
+     outeriter        += nri;
+     /* Update outer/inner flops: 18 per i entry plus 93 per j particle, matching the
+      * per-loop flop counts noted above.
+      */
+     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3_F,outeriter*18 + inneriter*93);
+ }
index 0000000000000000000000000000000000000000,7b31dd4f0687f8512330ad268d844922e1af093a..7b31dd4f0687f8512330ad268d844922e1af093a
mode 000000,100644..100644
--- /dev/null
@@@ -1,0 -1,1468 +1,1468 @@@
+ /*
+  * This file is part of the GROMACS molecular simulation package.
+  *
+  * Copyright (c) 2012, by the GROMACS development team, led by
+  * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+  * others, as listed in the AUTHORS file in the top-level source
+  * directory and at http://www.gromacs.org.
+  *
+  * GROMACS is free software; you can redistribute it and/or
+  * modify it under the terms of the GNU Lesser General Public License
+  * as published by the Free Software Foundation; either version 2.1
+  * of the License, or (at your option) any later version.
+  *
+  * GROMACS is distributed in the hope that it will be useful,
+  * but WITHOUT ANY WARRANTY; without even the implied warranty of
+  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  * Lesser General Public License for more details.
+  *
+  * You should have received a copy of the GNU Lesser General Public
+  * License along with GROMACS; if not, see
+  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+  *
+  * If you want to redistribute modifications to GROMACS, please
+  * consider that scientific software is very special. Version
+  * control is crucial - bugs must be traceable. We will be happy to
+  * consider code for inclusion in the official distribution, but
+  * derived work must not be called official GROMACS. Details are found
+  * in the README & COPYING files - if they are missing, get the
+  * official version at http://www.gromacs.org.
+  *
+  * To help us fund GROMACS development, we humbly ask that you cite
+  * the research papers on the package. Check out http://www.gromacs.org.
+  */
+ /*
+  * Note: this file was generated by the GROMACS sparc64_hpc_ace_double kernel generator.
+  */
+ #ifdef HAVE_CONFIG_H
+ #include <config.h>
+ #endif
+ #include <math.h>
+ #include "../nb_kernel.h"
+ #include "types/simple.h"
+ #include "vec.h"
+ #include "nrnb.h"
+ #include "kernelutil_sparc64_hpc_ace_double.h"
+ /*
+  * Gromacs nonbonded kernel:   nb_kernel_ElecRF_VdwNone_GeomW3W3_VF_sparc64_hpc_ace_double
+  * Electrostatics interaction: ReactionField
+  * VdW interaction:            None
+  * Geometry:                   Water3-Water3
+  * Calculate force/pot:        PotentialAndForce
+  */
+ void
+ nb_kernel_ElecRF_VdwNone_GeomW3W3_VF_sparc64_hpc_ace_double
+                     (t_nblist * gmx_restrict                nlist,
+                      rvec * gmx_restrict                    xx,
+                      rvec * gmx_restrict                    ff,
+                      t_forcerec * gmx_restrict              fr,
+                      t_mdatoms * gmx_restrict               mdatoms,
+                      nb_kernel_data_t * gmx_restrict        kernel_data,
+                      t_nrnb * gmx_restrict                  nrnb)
+ {
+     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+      * just 0 for non-waters.
+      * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+      * jnr indices corresponding to data put in the two positions in the SIMD register.
+      */
+     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+     int              jnrA,jnrB;
+     int              j_coord_offsetA,j_coord_offsetB;
+     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+     real             rcutoff_scalar;
+     real             *shiftvec,*fshift,*x,*f;
+     _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+     int              vdwioffset0;
+     _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+     int              vdwioffset1;
+     _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+     int              vdwioffset2;
+     _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+     int              vdwjidx0A,vdwjidx0B;
+     _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+     int              vdwjidx1A,vdwjidx1B;
+     _fjsp_v2r8       jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+     int              vdwjidx2A,vdwjidx2B;
+     _fjsp_v2r8       jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+     _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+     _fjsp_v2r8       dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
+     _fjsp_v2r8       dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
+     _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+     _fjsp_v2r8       dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+     _fjsp_v2r8       dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+     _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+     _fjsp_v2r8       dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+     _fjsp_v2r8       dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+     _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+     real             *charge;
+     _fjsp_v2r8       itab_tmp;
+     _fjsp_v2r8       dummy_mask,cutoff_mask;
+     _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+     _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+     union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+     x                = xx[0];
+     f                = ff[0];
+     nri              = nlist->nri;
+     iinr             = nlist->iinr;
+     jindex           = nlist->jindex;
+     jjnr             = nlist->jjnr;
+     shiftidx         = nlist->shift;
+     gid              = nlist->gid;
+     shiftvec         = fr->shift_vec[0];
+     fshift           = fr->fshift[0];
+     facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+     charge           = mdatoms->chargeA;
+     krf              = gmx_fjsp_set1_v2r8(fr->ic->k_rf);
+     krf2             = gmx_fjsp_set1_v2r8(fr->ic->k_rf*2.0);
+     crf              = gmx_fjsp_set1_v2r8(fr->ic->c_rf);
+     /* Setup water-specific parameters */
+     inr              = nlist->iinr[0];
+     iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+0]));
+     iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+     iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+     jq0              = gmx_fjsp_set1_v2r8(charge[inr+0]);
+     jq1              = gmx_fjsp_set1_v2r8(charge[inr+1]);
+     jq2              = gmx_fjsp_set1_v2r8(charge[inr+2]);
+     qq00             = _fjsp_mul_v2r8(iq0,jq0);
+     qq01             = _fjsp_mul_v2r8(iq0,jq1);
+     qq02             = _fjsp_mul_v2r8(iq0,jq2);
+     qq10             = _fjsp_mul_v2r8(iq1,jq0);
+     qq11             = _fjsp_mul_v2r8(iq1,jq1);
+     qq12             = _fjsp_mul_v2r8(iq1,jq2);
+     qq20             = _fjsp_mul_v2r8(iq2,jq0);
+     qq21             = _fjsp_mul_v2r8(iq2,jq1);
+     qq22             = _fjsp_mul_v2r8(iq2,jq2);
+     /* Avoid stupid compiler warnings */
+     jnrA = jnrB = 0;
+     j_coord_offsetA = 0;
+     j_coord_offsetB = 0;
+     outeriter        = 0;
+     inneriter        = 0;
+     /* Start outer loop over neighborlists */
+     for(iidx=0; iidx<nri; iidx++)
+     {
+         /* Load shift vector for this list */
+         i_shift_offset   = DIM*shiftidx[iidx];
+         /* Load limits for loop over neighbors */
+         j_index_start    = jindex[iidx];
+         j_index_end      = jindex[iidx+1];
+         /* Get outer coordinate index */
+         inr              = iinr[iidx];
+         i_coord_offset   = DIM*inr;
+         /* Load i particle coords and add shift vector */
+         gmx_fjsp_load_shift_and_3rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                  &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+         fix0             = _fjsp_setzero_v2r8();
+         fiy0             = _fjsp_setzero_v2r8();
+         fiz0             = _fjsp_setzero_v2r8();
+         fix1             = _fjsp_setzero_v2r8();
+         fiy1             = _fjsp_setzero_v2r8();
+         fiz1             = _fjsp_setzero_v2r8();
+         fix2             = _fjsp_setzero_v2r8();
+         fiy2             = _fjsp_setzero_v2r8();
+         fiz2             = _fjsp_setzero_v2r8();
+         /* Reset potential sums */
+         velecsum         = _fjsp_setzero_v2r8();
+         /* Start inner kernel loop */
+         for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+         {
+             /* Get j neighbor index, and coordinate index */
+             jnrA             = jjnr[jidx];
+             jnrB             = jjnr[jidx+1];
+             j_coord_offsetA  = DIM*jnrA;
+             j_coord_offsetB  = DIM*jnrB;
+             /* load j atom coordinates */
+             gmx_fjsp_load_3rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+             /* Calculate displacement vector */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             dx01             = _fjsp_sub_v2r8(ix0,jx1);
+             dy01             = _fjsp_sub_v2r8(iy0,jy1);
+             dz01             = _fjsp_sub_v2r8(iz0,jz1);
+             dx02             = _fjsp_sub_v2r8(ix0,jx2);
+             dy02             = _fjsp_sub_v2r8(iy0,jy2);
+             dz02             = _fjsp_sub_v2r8(iz0,jz2);
+             dx10             = _fjsp_sub_v2r8(ix1,jx0);
+             dy10             = _fjsp_sub_v2r8(iy1,jy0);
+             dz10             = _fjsp_sub_v2r8(iz1,jz0);
+             dx11             = _fjsp_sub_v2r8(ix1,jx1);
+             dy11             = _fjsp_sub_v2r8(iy1,jy1);
+             dz11             = _fjsp_sub_v2r8(iz1,jz1);
+             dx12             = _fjsp_sub_v2r8(ix1,jx2);
+             dy12             = _fjsp_sub_v2r8(iy1,jy2);
+             dz12             = _fjsp_sub_v2r8(iz1,jz2);
+             dx20             = _fjsp_sub_v2r8(ix2,jx0);
+             dy20             = _fjsp_sub_v2r8(iy2,jy0);
+             dz20             = _fjsp_sub_v2r8(iz2,jz0);
+             dx21             = _fjsp_sub_v2r8(ix2,jx1);
+             dy21             = _fjsp_sub_v2r8(iy2,jy1);
+             dz21             = _fjsp_sub_v2r8(iz2,jz1);
+             dx22             = _fjsp_sub_v2r8(ix2,jx2);
+             dy22             = _fjsp_sub_v2r8(iy2,jy2);
+             dz22             = _fjsp_sub_v2r8(iz2,jz2);
+             /* Calculate squared distance and things based on it */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rsq01            = gmx_fjsp_calc_rsq_v2r8(dx01,dy01,dz01);
+             rsq02            = gmx_fjsp_calc_rsq_v2r8(dx02,dy02,dz02);
+             rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+             rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+             rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+             rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+             rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+             rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+             rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+             rinv01           = gmx_fjsp_invsqrt_v2r8(rsq01);
+             rinv02           = gmx_fjsp_invsqrt_v2r8(rsq02);
+             rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+             rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+             rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+             rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+             rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+             rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+             rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+             rinvsq01         = _fjsp_mul_v2r8(rinv01,rinv01);
+             rinvsq02         = _fjsp_mul_v2r8(rinv02,rinv02);
+             rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+             rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+             rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+             rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+             rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+             rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+             fjx0             = _fjsp_setzero_v2r8();
+             fjy0             = _fjsp_setzero_v2r8();
+             fjz0             = _fjsp_setzero_v2r8();
+             fjx1             = _fjsp_setzero_v2r8();
+             fjy1             = _fjsp_setzero_v2r8();
+             fjz1             = _fjsp_setzero_v2r8();
+             fjx2             = _fjsp_setzero_v2r8();
+             fjy2             = _fjsp_setzero_v2r8();
+             fjz2             = _fjsp_setzero_v2r8();
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* REACTION-FIELD ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq00,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq00,rinv00),crf));
+             felec            = _fjsp_mul_v2r8(qq00,_fjsp_msub_v2r8(rinv00,rinvsq00,krf2));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             
+             fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* REACTION-FIELD ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq01,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq01,rinv01),crf));
+             felec            = _fjsp_mul_v2r8(qq01,_fjsp_msub_v2r8(rinv01,rinvsq01,krf2));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx01,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy01,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz01,fscal,fiz0);
+             
+             fjx1             = _fjsp_madd_v2r8(dx01,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy01,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz01,fscal,fjz1);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* REACTION-FIELD ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq02,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq02,rinv02),crf));
+             felec            = _fjsp_mul_v2r8(qq02,_fjsp_msub_v2r8(rinv02,rinvsq02,krf2));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx02,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy02,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz02,fscal,fiz0);
+             
+             fjx2             = _fjsp_madd_v2r8(dx02,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy02,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz02,fscal,fjz2);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* REACTION-FIELD ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq10,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq10,rinv10),crf));
+             felec            = _fjsp_mul_v2r8(qq10,_fjsp_msub_v2r8(rinv10,rinvsq10,krf2));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+             
+             fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* REACTION-FIELD ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq11,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq11,rinv11),crf));
+             felec            = _fjsp_mul_v2r8(qq11,_fjsp_msub_v2r8(rinv11,rinvsq11,krf2));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+             
+             fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* REACTION-FIELD ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq12,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq12,rinv12),crf));
+             felec            = _fjsp_mul_v2r8(qq12,_fjsp_msub_v2r8(rinv12,rinvsq12,krf2));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+             
+             fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* REACTION-FIELD ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq20,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq20,rinv20),crf));
+             felec            = _fjsp_mul_v2r8(qq20,_fjsp_msub_v2r8(rinv20,rinvsq20,krf2));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+             
+             fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* REACTION-FIELD ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq21,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq21,rinv21),crf));
+             felec            = _fjsp_mul_v2r8(qq21,_fjsp_msub_v2r8(rinv21,rinvsq21,krf2));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+             
+             fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* REACTION-FIELD ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq22,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq22,rinv22),crf));
+             felec            = _fjsp_mul_v2r8(qq22,_fjsp_msub_v2r8(rinv22,rinvsq22,krf2));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+             
+             fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+             gmx_fjsp_decrement_3rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+             /* Inner loop uses 315 flops */
+         }
+         if(jidx<j_index_end)
+         {
+             jnrA             = jjnr[jidx];
+             j_coord_offsetA  = DIM*jnrA;
+             /* load j atom coordinates */
+             gmx_fjsp_load_3rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+             /* Calculate displacement vector */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             dx01             = _fjsp_sub_v2r8(ix0,jx1);
+             dy01             = _fjsp_sub_v2r8(iy0,jy1);
+             dz01             = _fjsp_sub_v2r8(iz0,jz1);
+             dx02             = _fjsp_sub_v2r8(ix0,jx2);
+             dy02             = _fjsp_sub_v2r8(iy0,jy2);
+             dz02             = _fjsp_sub_v2r8(iz0,jz2);
+             dx10             = _fjsp_sub_v2r8(ix1,jx0);
+             dy10             = _fjsp_sub_v2r8(iy1,jy0);
+             dz10             = _fjsp_sub_v2r8(iz1,jz0);
+             dx11             = _fjsp_sub_v2r8(ix1,jx1);
+             dy11             = _fjsp_sub_v2r8(iy1,jy1);
+             dz11             = _fjsp_sub_v2r8(iz1,jz1);
+             dx12             = _fjsp_sub_v2r8(ix1,jx2);
+             dy12             = _fjsp_sub_v2r8(iy1,jy2);
+             dz12             = _fjsp_sub_v2r8(iz1,jz2);
+             dx20             = _fjsp_sub_v2r8(ix2,jx0);
+             dy20             = _fjsp_sub_v2r8(iy2,jy0);
+             dz20             = _fjsp_sub_v2r8(iz2,jz0);
+             dx21             = _fjsp_sub_v2r8(ix2,jx1);
+             dy21             = _fjsp_sub_v2r8(iy2,jy1);
+             dz21             = _fjsp_sub_v2r8(iz2,jz1);
+             dx22             = _fjsp_sub_v2r8(ix2,jx2);
+             dy22             = _fjsp_sub_v2r8(iy2,jy2);
+             dz22             = _fjsp_sub_v2r8(iz2,jz2);
+             /* Calculate squared distance and things based on it */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rsq01            = gmx_fjsp_calc_rsq_v2r8(dx01,dy01,dz01);
+             rsq02            = gmx_fjsp_calc_rsq_v2r8(dx02,dy02,dz02);
+             rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+             rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+             rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+             rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+             rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+             rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+             rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+             rinv01           = gmx_fjsp_invsqrt_v2r8(rsq01);
+             rinv02           = gmx_fjsp_invsqrt_v2r8(rsq02);
+             rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+             rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+             rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+             rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+             rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+             rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+             rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+             rinvsq01         = _fjsp_mul_v2r8(rinv01,rinv01);
+             rinvsq02         = _fjsp_mul_v2r8(rinv02,rinv02);
+             rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+             rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+             rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+             rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+             rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+             rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+             fjx0             = _fjsp_setzero_v2r8();
+             fjy0             = _fjsp_setzero_v2r8();
+             fjz0             = _fjsp_setzero_v2r8();
+             fjx1             = _fjsp_setzero_v2r8();
+             fjy1             = _fjsp_setzero_v2r8();
+             fjz1             = _fjsp_setzero_v2r8();
+             fjx2             = _fjsp_setzero_v2r8();
+             fjy2             = _fjsp_setzero_v2r8();
+             fjz2             = _fjsp_setzero_v2r8();
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* REACTION-FIELD ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq00,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq00,rinv00),crf));
+             felec            = _fjsp_mul_v2r8(qq00,_fjsp_msub_v2r8(rinv00,rinvsq00,krf2));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             
+             fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* REACTION-FIELD ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq01,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq01,rinv01),crf));
+             felec            = _fjsp_mul_v2r8(qq01,_fjsp_msub_v2r8(rinv01,rinvsq01,krf2));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx01,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy01,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz01,fscal,fiz0);
+             
+             fjx1             = _fjsp_madd_v2r8(dx01,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy01,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz01,fscal,fjz1);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* REACTION-FIELD ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq02,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq02,rinv02),crf));
+             felec            = _fjsp_mul_v2r8(qq02,_fjsp_msub_v2r8(rinv02,rinvsq02,krf2));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx02,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy02,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz02,fscal,fiz0);
+             
+             fjx2             = _fjsp_madd_v2r8(dx02,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy02,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz02,fscal,fjz2);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* REACTION-FIELD ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq10,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq10,rinv10),crf));
+             felec            = _fjsp_mul_v2r8(qq10,_fjsp_msub_v2r8(rinv10,rinvsq10,krf2));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+             
+             fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* REACTION-FIELD ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq11,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq11,rinv11),crf));
+             felec            = _fjsp_mul_v2r8(qq11,_fjsp_msub_v2r8(rinv11,rinvsq11,krf2));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+             
+             fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* REACTION-FIELD ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq12,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq12,rinv12),crf));
+             felec            = _fjsp_mul_v2r8(qq12,_fjsp_msub_v2r8(rinv12,rinvsq12,krf2));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+             
+             fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* REACTION-FIELD ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq20,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq20,rinv20),crf));
+             felec            = _fjsp_mul_v2r8(qq20,_fjsp_msub_v2r8(rinv20,rinvsq20,krf2));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+             
+             fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* REACTION-FIELD ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq21,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq21,rinv21),crf));
+             felec            = _fjsp_mul_v2r8(qq21,_fjsp_msub_v2r8(rinv21,rinvsq21,krf2));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+             
+             fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* REACTION-FIELD ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq22,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq22,rinv22),crf));
+             felec            = _fjsp_mul_v2r8(qq22,_fjsp_msub_v2r8(rinv22,rinvsq22,krf2));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+             
+             fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+             gmx_fjsp_decrement_3rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+             /* Inner loop uses 315 flops */
+         }
+         /* End of innermost loop */
+         gmx_fjsp_update_iforce_3atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                               f+i_coord_offset,fshift+i_shift_offset);
+         ggid                        = gid[iidx];
+         /* Update potential energies */
+         gmx_fjsp_update_1pot_v2r8(velecsum,kernel_data->energygrp_elec+ggid);
+         /* Increment number of inner iterations */
+         inneriter                  += j_index_end - j_index_start;
+         /* Outer loop uses 19 flops */
+     }
+     /* Increment number of outer iterations */
+     outeriter        += nri;
+     /* Update outer/inner flops */
+     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3W3_VF,outeriter*19 + inneriter*315);
+ }
+ /*
+  * Gromacs nonbonded kernel:   nb_kernel_ElecRF_VdwNone_GeomW3W3_F_sparc64_hpc_ace_double
+  * Electrostatics interaction: ReactionField
+  * VdW interaction:            None
+  * Geometry:                   Water3-Water3
+  * Calculate force/pot:        Force
+  */
+ void
+ nb_kernel_ElecRF_VdwNone_GeomW3W3_F_sparc64_hpc_ace_double
+                     (t_nblist * gmx_restrict                nlist,
+                      rvec * gmx_restrict                    xx,
+                      rvec * gmx_restrict                    ff,
+                      t_forcerec * gmx_restrict              fr,
+                      t_mdatoms * gmx_restrict               mdatoms,
+                      nb_kernel_data_t * gmx_restrict        kernel_data,
+                      t_nrnb * gmx_restrict                  nrnb)
+ {
+     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+      * just 0 for non-waters.
+      * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+      * jnr indices corresponding to data put in the two positions in the SIMD register.
+      */
+     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+     int              jnrA,jnrB;
+     int              j_coord_offsetA,j_coord_offsetB;
+     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+     real             rcutoff_scalar;
+     real             *shiftvec,*fshift,*x,*f;
+     _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+     int              vdwioffset0;
+     _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+     int              vdwioffset1;
+     _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+     int              vdwioffset2;
+     _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+     int              vdwjidx0A,vdwjidx0B;
+     _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+     int              vdwjidx1A,vdwjidx1B;
+     _fjsp_v2r8       jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+     int              vdwjidx2A,vdwjidx2B;
+     _fjsp_v2r8       jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+     _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+     _fjsp_v2r8       dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
+     _fjsp_v2r8       dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
+     _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+     _fjsp_v2r8       dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+     _fjsp_v2r8       dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+     _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+     _fjsp_v2r8       dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+     _fjsp_v2r8       dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+     _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+     real             *charge;
+     _fjsp_v2r8       itab_tmp;
+     _fjsp_v2r8       dummy_mask,cutoff_mask;
+     _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+     _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+     union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+     x                = xx[0];
+     f                = ff[0];
+     nri              = nlist->nri;
+     iinr             = nlist->iinr;
+     jindex           = nlist->jindex;
+     jjnr             = nlist->jjnr;
+     shiftidx         = nlist->shift;
+     gid              = nlist->gid;
+     shiftvec         = fr->shift_vec[0];
+     fshift           = fr->fshift[0];
+     facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+     charge           = mdatoms->chargeA;
+     krf              = gmx_fjsp_set1_v2r8(fr->ic->k_rf);
+     krf2             = gmx_fjsp_set1_v2r8(fr->ic->k_rf*2.0);
+     crf              = gmx_fjsp_set1_v2r8(fr->ic->c_rf);
+     /* Setup water-specific parameters */
+     inr              = nlist->iinr[0];
+     iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+0]));
+     iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+     iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+     jq0              = gmx_fjsp_set1_v2r8(charge[inr+0]);
+     jq1              = gmx_fjsp_set1_v2r8(charge[inr+1]);
+     jq2              = gmx_fjsp_set1_v2r8(charge[inr+2]);
+     qq00             = _fjsp_mul_v2r8(iq0,jq0);
+     qq01             = _fjsp_mul_v2r8(iq0,jq1);
+     qq02             = _fjsp_mul_v2r8(iq0,jq2);
+     qq10             = _fjsp_mul_v2r8(iq1,jq0);
+     qq11             = _fjsp_mul_v2r8(iq1,jq1);
+     qq12             = _fjsp_mul_v2r8(iq1,jq2);
+     qq20             = _fjsp_mul_v2r8(iq2,jq0);
+     qq21             = _fjsp_mul_v2r8(iq2,jq1);
+     qq22             = _fjsp_mul_v2r8(iq2,jq2);
+     /* Avoid stupid compiler warnings */
+     jnrA = jnrB = 0;
+     j_coord_offsetA = 0;
+     j_coord_offsetB = 0;
+     outeriter        = 0;
+     inneriter        = 0;
+     /* Start outer loop over neighborlists */
+     for(iidx=0; iidx<nri; iidx++)
+     {
+         /* Load shift vector for this list */
+         i_shift_offset   = DIM*shiftidx[iidx];
+         /* Load limits for loop over neighbors */
+         j_index_start    = jindex[iidx];
+         j_index_end      = jindex[iidx+1];
+         /* Get outer coordinate index */
+         inr              = iinr[iidx];
+         i_coord_offset   = DIM*inr;
+         /* Load i particle coords and add shift vector */
+         gmx_fjsp_load_shift_and_3rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                  &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+         fix0             = _fjsp_setzero_v2r8();
+         fiy0             = _fjsp_setzero_v2r8();
+         fiz0             = _fjsp_setzero_v2r8();
+         fix1             = _fjsp_setzero_v2r8();
+         fiy1             = _fjsp_setzero_v2r8();
+         fiz1             = _fjsp_setzero_v2r8();
+         fix2             = _fjsp_setzero_v2r8();
+         fiy2             = _fjsp_setzero_v2r8();
+         fiz2             = _fjsp_setzero_v2r8();
+         /* Start inner kernel loop */
+         for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+         {
+             /* Get j neighbor index, and coordinate index */
+             jnrA             = jjnr[jidx];
+             jnrB             = jjnr[jidx+1];
+             j_coord_offsetA  = DIM*jnrA;
+             j_coord_offsetB  = DIM*jnrB;
+             /* load j atom coordinates */
+             gmx_fjsp_load_3rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+             /* Calculate displacement vector */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             dx01             = _fjsp_sub_v2r8(ix0,jx1);
+             dy01             = _fjsp_sub_v2r8(iy0,jy1);
+             dz01             = _fjsp_sub_v2r8(iz0,jz1);
+             dx02             = _fjsp_sub_v2r8(ix0,jx2);
+             dy02             = _fjsp_sub_v2r8(iy0,jy2);
+             dz02             = _fjsp_sub_v2r8(iz0,jz2);
+             dx10             = _fjsp_sub_v2r8(ix1,jx0);
+             dy10             = _fjsp_sub_v2r8(iy1,jy0);
+             dz10             = _fjsp_sub_v2r8(iz1,jz0);
+             dx11             = _fjsp_sub_v2r8(ix1,jx1);
+             dy11             = _fjsp_sub_v2r8(iy1,jy1);
+             dz11             = _fjsp_sub_v2r8(iz1,jz1);
+             dx12             = _fjsp_sub_v2r8(ix1,jx2);
+             dy12             = _fjsp_sub_v2r8(iy1,jy2);
+             dz12             = _fjsp_sub_v2r8(iz1,jz2);
+             dx20             = _fjsp_sub_v2r8(ix2,jx0);
+             dy20             = _fjsp_sub_v2r8(iy2,jy0);
+             dz20             = _fjsp_sub_v2r8(iz2,jz0);
+             dx21             = _fjsp_sub_v2r8(ix2,jx1);
+             dy21             = _fjsp_sub_v2r8(iy2,jy1);
+             dz21             = _fjsp_sub_v2r8(iz2,jz1);
+             dx22             = _fjsp_sub_v2r8(ix2,jx2);
+             dy22             = _fjsp_sub_v2r8(iy2,jy2);
+             dz22             = _fjsp_sub_v2r8(iz2,jz2);
+             /* Calculate squared distance and things based on it */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rsq01            = gmx_fjsp_calc_rsq_v2r8(dx01,dy01,dz01);
+             rsq02            = gmx_fjsp_calc_rsq_v2r8(dx02,dy02,dz02);
+             rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+             rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+             rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+             rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+             rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+             rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+             rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+             rinv01           = gmx_fjsp_invsqrt_v2r8(rsq01);
+             rinv02           = gmx_fjsp_invsqrt_v2r8(rsq02);
+             rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+             rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+             rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+             rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+             rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+             rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+             rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+             rinvsq01         = _fjsp_mul_v2r8(rinv01,rinv01);
+             rinvsq02         = _fjsp_mul_v2r8(rinv02,rinv02);
+             rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+             rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+             rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+             rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+             rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+             rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+             fjx0             = _fjsp_setzero_v2r8();
+             fjy0             = _fjsp_setzero_v2r8();
+             fjz0             = _fjsp_setzero_v2r8();
+             fjx1             = _fjsp_setzero_v2r8();
+             fjy1             = _fjsp_setzero_v2r8();
+             fjz1             = _fjsp_setzero_v2r8();
+             fjx2             = _fjsp_setzero_v2r8();
+             fjy2             = _fjsp_setzero_v2r8();
+             fjz2             = _fjsp_setzero_v2r8();
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* REACTION-FIELD ELECTROSTATICS */
+             felec            = _fjsp_mul_v2r8(qq00,_fjsp_msub_v2r8(rinv00,rinvsq00,krf2));
+             fscal            = felec;
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             
+             fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* REACTION-FIELD ELECTROSTATICS */
+             felec            = _fjsp_mul_v2r8(qq01,_fjsp_msub_v2r8(rinv01,rinvsq01,krf2));
+             fscal            = felec;
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx01,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy01,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz01,fscal,fiz0);
+             
+             fjx1             = _fjsp_madd_v2r8(dx01,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy01,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz01,fscal,fjz1);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* REACTION-FIELD ELECTROSTATICS */
+             felec            = _fjsp_mul_v2r8(qq02,_fjsp_msub_v2r8(rinv02,rinvsq02,krf2));
+             fscal            = felec;
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx02,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy02,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz02,fscal,fiz0);
+             
+             fjx2             = _fjsp_madd_v2r8(dx02,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy02,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz02,fscal,fjz2);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* REACTION-FIELD ELECTROSTATICS */
+             felec            = _fjsp_mul_v2r8(qq10,_fjsp_msub_v2r8(rinv10,rinvsq10,krf2));
+             fscal            = felec;
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+             
+             fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* REACTION-FIELD ELECTROSTATICS */
+             felec            = _fjsp_mul_v2r8(qq11,_fjsp_msub_v2r8(rinv11,rinvsq11,krf2));
+             fscal            = felec;
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+             
+             fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* REACTION-FIELD ELECTROSTATICS */
+             felec            = _fjsp_mul_v2r8(qq12,_fjsp_msub_v2r8(rinv12,rinvsq12,krf2));
+             fscal            = felec;
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+             
+             fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* REACTION-FIELD ELECTROSTATICS */
+             felec            = _fjsp_mul_v2r8(qq20,_fjsp_msub_v2r8(rinv20,rinvsq20,krf2));
+             fscal            = felec;
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+             
+             fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* REACTION-FIELD ELECTROSTATICS */
+             felec            = _fjsp_mul_v2r8(qq21,_fjsp_msub_v2r8(rinv21,rinvsq21,krf2));
+             fscal            = felec;
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+             
+             fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* REACTION-FIELD ELECTROSTATICS */
+             felec            = _fjsp_mul_v2r8(qq22,_fjsp_msub_v2r8(rinv22,rinvsq22,krf2));
+             fscal            = felec;
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+             
+             fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+             gmx_fjsp_decrement_3rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+             /* Inner loop uses 270 flops */
+         }
+         if(jidx<j_index_end)
+         {
+             jnrA             = jjnr[jidx];
+             j_coord_offsetA  = DIM*jnrA;
+             /* load j atom coordinates */
+             gmx_fjsp_load_3rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+             /* Calculate displacement vector */
+             dx00             = _fjsp_sub_v2r8(ix0,jx0);
+             dy00             = _fjsp_sub_v2r8(iy0,jy0);
+             dz00             = _fjsp_sub_v2r8(iz0,jz0);
+             dx01             = _fjsp_sub_v2r8(ix0,jx1);
+             dy01             = _fjsp_sub_v2r8(iy0,jy1);
+             dz01             = _fjsp_sub_v2r8(iz0,jz1);
+             dx02             = _fjsp_sub_v2r8(ix0,jx2);
+             dy02             = _fjsp_sub_v2r8(iy0,jy2);
+             dz02             = _fjsp_sub_v2r8(iz0,jz2);
+             dx10             = _fjsp_sub_v2r8(ix1,jx0);
+             dy10             = _fjsp_sub_v2r8(iy1,jy0);
+             dz10             = _fjsp_sub_v2r8(iz1,jz0);
+             dx11             = _fjsp_sub_v2r8(ix1,jx1);
+             dy11             = _fjsp_sub_v2r8(iy1,jy1);
+             dz11             = _fjsp_sub_v2r8(iz1,jz1);
+             dx12             = _fjsp_sub_v2r8(ix1,jx2);
+             dy12             = _fjsp_sub_v2r8(iy1,jy2);
+             dz12             = _fjsp_sub_v2r8(iz1,jz2);
+             dx20             = _fjsp_sub_v2r8(ix2,jx0);
+             dy20             = _fjsp_sub_v2r8(iy2,jy0);
+             dz20             = _fjsp_sub_v2r8(iz2,jz0);
+             dx21             = _fjsp_sub_v2r8(ix2,jx1);
+             dy21             = _fjsp_sub_v2r8(iy2,jy1);
+             dz21             = _fjsp_sub_v2r8(iz2,jz1);
+             dx22             = _fjsp_sub_v2r8(ix2,jx2);
+             dy22             = _fjsp_sub_v2r8(iy2,jy2);
+             dz22             = _fjsp_sub_v2r8(iz2,jz2);
+             /* Calculate squared distance and things based on it */
+             rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+             rsq01            = gmx_fjsp_calc_rsq_v2r8(dx01,dy01,dz01);
+             rsq02            = gmx_fjsp_calc_rsq_v2r8(dx02,dy02,dz02);
+             rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+             rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+             rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+             rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+             rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+             rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+             rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+             rinv01           = gmx_fjsp_invsqrt_v2r8(rsq01);
+             rinv02           = gmx_fjsp_invsqrt_v2r8(rsq02);
+             rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+             rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+             rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+             rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+             rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+             rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+             rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+             rinvsq01         = _fjsp_mul_v2r8(rinv01,rinv01);
+             rinvsq02         = _fjsp_mul_v2r8(rinv02,rinv02);
+             rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+             rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+             rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+             rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+             rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+             rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+             fjx0             = _fjsp_setzero_v2r8();
+             fjy0             = _fjsp_setzero_v2r8();
+             fjz0             = _fjsp_setzero_v2r8();
+             fjx1             = _fjsp_setzero_v2r8();
+             fjy1             = _fjsp_setzero_v2r8();
+             fjz1             = _fjsp_setzero_v2r8();
+             fjx2             = _fjsp_setzero_v2r8();
+             fjy2             = _fjsp_setzero_v2r8();
+             fjz2             = _fjsp_setzero_v2r8();
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* REACTION-FIELD ELECTROSTATICS */
+             felec            = _fjsp_mul_v2r8(qq00,_fjsp_msub_v2r8(rinv00,rinvsq00,krf2));
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+             
+             fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* REACTION-FIELD ELECTROSTATICS */
+             felec            = _fjsp_mul_v2r8(qq01,_fjsp_msub_v2r8(rinv01,rinvsq01,krf2));
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx01,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy01,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz01,fscal,fiz0);
+             
+             fjx1             = _fjsp_madd_v2r8(dx01,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy01,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz01,fscal,fjz1);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* REACTION-FIELD ELECTROSTATICS */
+             felec            = _fjsp_mul_v2r8(qq02,_fjsp_msub_v2r8(rinv02,rinvsq02,krf2));
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix0             = _fjsp_madd_v2r8(dx02,fscal,fix0);
+             fiy0             = _fjsp_madd_v2r8(dy02,fscal,fiy0);
+             fiz0             = _fjsp_madd_v2r8(dz02,fscal,fiz0);
+             
+             fjx2             = _fjsp_madd_v2r8(dx02,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy02,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz02,fscal,fjz2);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* REACTION-FIELD ELECTROSTATICS */
+             felec            = _fjsp_mul_v2r8(qq10,_fjsp_msub_v2r8(rinv10,rinvsq10,krf2));
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+             
+             fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* REACTION-FIELD ELECTROSTATICS */
+             felec            = _fjsp_mul_v2r8(qq11,_fjsp_msub_v2r8(rinv11,rinvsq11,krf2));
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+             
+             fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* REACTION-FIELD ELECTROSTATICS */
+             felec            = _fjsp_mul_v2r8(qq12,_fjsp_msub_v2r8(rinv12,rinvsq12,krf2));
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+             
+             fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* REACTION-FIELD ELECTROSTATICS */
+             felec            = _fjsp_mul_v2r8(qq20,_fjsp_msub_v2r8(rinv20,rinvsq20,krf2));
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+             
+             fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* REACTION-FIELD ELECTROSTATICS */
+             felec            = _fjsp_mul_v2r8(qq21,_fjsp_msub_v2r8(rinv21,rinvsq21,krf2));
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+             
+             fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* REACTION-FIELD ELECTROSTATICS */
+             felec            = _fjsp_mul_v2r8(qq22,_fjsp_msub_v2r8(rinv22,rinvsq22,krf2));
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+             
+             fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+             gmx_fjsp_decrement_3rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+             /* Inner loop uses 270 flops */
+         }
+         /* End of innermost loop */
+         gmx_fjsp_update_iforce_3atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                               f+i_coord_offset,fshift+i_shift_offset);
+         /* Increment number of inner iterations */
+         inneriter                  += j_index_end - j_index_start;
+         /* Outer loop uses 18 flops */
+     }
+     /* Increment number of outer iterations */
+     outeriter        += nri;
+     /* Update outer/inner flops */
+     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3W3_F,outeriter*18 + inneriter*270);
+ }
index 0000000000000000000000000000000000000000,89c9e8c984b783d9f1fa91f93cd5f44472cbb9e1..89c9e8c984b783d9f1fa91f93cd5f44472cbb9e1
mode 000000,100644..100644
--- /dev/null
@@@ -1,0 -1,792 +1,792 @@@
+ /*
+  * This file is part of the GROMACS molecular simulation package.
+  *
+  * Copyright (c) 2012, by the GROMACS development team, led by
+  * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+  * others, as listed in the AUTHORS file in the top-level source
+  * directory and at http://www.gromacs.org.
+  *
+  * GROMACS is free software; you can redistribute it and/or
+  * modify it under the terms of the GNU Lesser General Public License
+  * as published by the Free Software Foundation; either version 2.1
+  * of the License, or (at your option) any later version.
+  *
+  * GROMACS is distributed in the hope that it will be useful,
+  * but WITHOUT ANY WARRANTY; without even the implied warranty of
+  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  * Lesser General Public License for more details.
+  *
+  * You should have received a copy of the GNU Lesser General Public
+  * License along with GROMACS; if not, see
+  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+  *
+  * If you want to redistribute modifications to GROMACS, please
+  * consider that scientific software is very special. Version
+  * control is crucial - bugs must be traceable. We will be happy to
+  * consider code for inclusion in the official distribution, but
+  * derived work must not be called official GROMACS. Details are found
+  * in the README & COPYING files - if they are missing, get the
+  * official version at http://www.gromacs.org.
+  *
+  * To help us fund GROMACS development, we humbly ask that you cite
+  * the research papers on the package. Check out http://www.gromacs.org.
+  */
+ /*
+  * Note: this file was generated by the GROMACS sparc64_hpc_ace_double kernel generator.
+  */
+ #ifdef HAVE_CONFIG_H
+ #include <config.h>
+ #endif
+ #include <math.h>
+ #include "../nb_kernel.h"
+ #include "types/simple.h"
+ #include "vec.h"
+ #include "nrnb.h"
+ #include "kernelutil_sparc64_hpc_ace_double.h"
+ /*
+  * Gromacs nonbonded kernel:   nb_kernel_ElecRF_VdwNone_GeomW4P1_VF_sparc64_hpc_ace_double
+  * Electrostatics interaction: ReactionField
+  * VdW interaction:            None
+  * Geometry:                   Water4-Particle
+  * Calculate force/pot:        PotentialAndForce
+  *
+  * For each i entry of the neighbor list, sites 1, 2 and 3 of the i molecule
+  * interact with every single j particle listed for that entry. Site 0 is
+  * skipped: i coordinates and i forces are addressed at i_coord_offset+DIM.
+  * The j list is processed two particles (A and B) per iteration, followed by
+  * a separate pass for a leftover odd particle.
+  *
+  * Parameters (only the members named here are read by this kernel):
+  *   nlist       - neighbor list: nri, iinr, jindex, jjnr, shift, gid
+  *   xx          - coordinates, accessed as a flat real array at DIM*atom
+  *   ff          - forces, same layout; written only through the
+  *                 gmx_fjsp force helpers called below
+  *   fr          - shift_vec, fshift, epsfac, ic->k_rf, ic->c_rf
+  *   mdatoms     - chargeA
+  *   kernel_data - energygrp_elec, indexed by gid[iidx]
+  *   nrnb        - flop counters, updated once at the end via inc_nrnb()
+  */
+ void
+ nb_kernel_ElecRF_VdwNone_GeomW4P1_VF_sparc64_hpc_ace_double
+                     (t_nblist * gmx_restrict                nlist,
+                      rvec * gmx_restrict                    xx,
+                      rvec * gmx_restrict                    ff,
+                      t_forcerec * gmx_restrict              fr,
+                      t_mdatoms * gmx_restrict               mdatoms,
+                      nb_kernel_data_t * gmx_restrict        kernel_data,
+                      t_nrnb * gmx_restrict                  nrnb)
+ {
+     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+      * just 0 for non-waters.
+      * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+      * jnr indices corresponding to data put in the two positions in the SIMD register.
+      *
+      * Note: this is generated code and a number of the locals declared below are never
+      * referenced in this particular kernel (rcutoff_scalar, tx/ty/tz, rcutoff, rcutoff2,
+      * jidxall, the vdwioffset and vdwjidx integers, the isai/isaj, r, c6 and c12 vectors,
+      * itab_tmp, the two masks, one, two and the three conversion unions).
+      */
+     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+     int              jnrA,jnrB;
+     int              j_coord_offsetA,j_coord_offsetB;
+     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+     real             rcutoff_scalar;
+     real             *shiftvec,*fshift,*x,*f;
+     _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+     int              vdwioffset1;
+     _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+     int              vdwioffset2;
+     _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+     int              vdwioffset3;
+     _fjsp_v2r8       ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+     int              vdwjidx0A,vdwjidx0B;
+     _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+     _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+     _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+     _fjsp_v2r8       dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30;
+     _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+     real             *charge;
+     _fjsp_v2r8       itab_tmp;
+     _fjsp_v2r8       dummy_mask,cutoff_mask;
+     _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+     _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+     union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+     x                = xx[0];
+     f                = ff[0];
+     nri              = nlist->nri;
+     iinr             = nlist->iinr;
+     jindex           = nlist->jindex;
+     jjnr             = nlist->jjnr;
+     shiftidx         = nlist->shift;
+     gid              = nlist->gid;
+     shiftvec         = fr->shift_vec[0];
+     fshift           = fr->fshift[0];
+     facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+     charge           = mdatoms->chargeA;
+     krf              = gmx_fjsp_set1_v2r8(fr->ic->k_rf);
+     krf2             = gmx_fjsp_set1_v2r8(fr->ic->k_rf*2.0);
+     crf              = gmx_fjsp_set1_v2r8(fr->ic->c_rf);
+     /* Setup water-specific parameters.
+      * The charges of i sites 1..3 are read once, from the first i entry of the list,
+      * multiplied by facel, and then reused unchanged for every i entry in the outer loop.
+      * NOTE(review): iinr[0] is read here even when nri is 0; presumably the array is
+      * always allocated by the caller - confirm.
+      */
+     inr              = nlist->iinr[0];
+     iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+     iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+     iq3              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+3]));
+     /* Give the j indices and offsets a defined value up front to avoid compiler warnings
+      * (presumably about possibly-uninitialized use).
+      */
+     jnrA = jnrB = 0;
+     j_coord_offsetA = 0;
+     j_coord_offsetB = 0;
+     outeriter        = 0;
+     inneriter        = 0;
+     /* Start outer loop over neighborlists: one iteration per i entry */
+     for(iidx=0; iidx<nri; iidx++)
+     {
+         /* Offset of this entry's shift vector (used for both shiftvec and fshift) */
+         i_shift_offset   = DIM*shiftidx[iidx];
+         /* Load limits for loop over neighbors: j entries [j_index_start, j_index_end) */
+         j_index_start    = jindex[iidx];
+         j_index_end      = jindex[iidx+1];
+         /* Get outer coordinate index */
+         inr              = iinr[iidx];
+         i_coord_offset   = DIM*inr;
+         /* Load i particle coords and add shift vector.
+          * The +DIM skips site 0, so the three rvecs loaded are sites 1, 2 and 3.
+          */
+         gmx_fjsp_load_shift_and_3rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset+DIM,
+                                                  &ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+         fix1             = _fjsp_setzero_v2r8();
+         fiy1             = _fjsp_setzero_v2r8();
+         fiz1             = _fjsp_setzero_v2r8();
+         fix2             = _fjsp_setzero_v2r8();
+         fiy2             = _fjsp_setzero_v2r8();
+         fiz2             = _fjsp_setzero_v2r8();
+         fix3             = _fjsp_setzero_v2r8();
+         fiy3             = _fjsp_setzero_v2r8();
+         fiz3             = _fjsp_setzero_v2r8();
+         /* Reset potential sums */
+         velecsum         = _fjsp_setzero_v2r8();
+         /* Start inner kernel loop: two j particles (A and B) per iteration.
+          * The loop stops before a possible last unpaired particle, which is handled
+          * by the if-block that follows the loop.
+          */
+         for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+         {
+             /* Get j neighbor index, and coordinate index */
+             jnrA             = jjnr[jidx];
+             jnrB             = jjnr[jidx+1];
+             j_coord_offsetA  = DIM*jnrA;
+             j_coord_offsetB  = DIM*jnrB;
+             /* load j atom coordinates */
+             gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                               &jx0,&jy0,&jz0);
+             /* Calculate displacement vector (i site minus j particle) for i sites 1..3 */
+             dx10             = _fjsp_sub_v2r8(ix1,jx0);
+             dy10             = _fjsp_sub_v2r8(iy1,jy0);
+             dz10             = _fjsp_sub_v2r8(iz1,jz0);
+             dx20             = _fjsp_sub_v2r8(ix2,jx0);
+             dy20             = _fjsp_sub_v2r8(iy2,jy0);
+             dz20             = _fjsp_sub_v2r8(iz2,jz0);
+             dx30             = _fjsp_sub_v2r8(ix3,jx0);
+             dy30             = _fjsp_sub_v2r8(iy3,jy0);
+             dz30             = _fjsp_sub_v2r8(iz3,jz0);
+             /* Calculate squared distance and things based on it */
+             rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+             rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+             rsq30            = gmx_fjsp_calc_rsq_v2r8(dx30,dy30,dz30);
+             rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+             rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+             rinv30           = gmx_fjsp_invsqrt_v2r8(rsq30);
+             rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+             rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+             rinvsq30         = _fjsp_mul_v2r8(rinv30,rinv30);
+             /* Load parameters for j particles (charges of A and B) and clear the j force sum */
+             jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+             fjx0             = _fjsp_setzero_v2r8();
+             fjy0             = _fjsp_setzero_v2r8();
+             fjz0             = _fjsp_setzero_v2r8();
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* Compute parameters for interactions between i and j atoms */
+             qq10             = _fjsp_mul_v2r8(iq1,jq0);
+             /* REACTION-FIELD ELECTROSTATICS
+              * Assuming _fjsp_madd_v2r8(a,b,c) is a*b+c and _fjsp_msub_v2r8(a,b,c) is a*b-c
+              * (TODO confirm against the intrinsics reference), this evaluates
+              *   velec = qq*(krf*rsq + rinv - crf)
+              *   felec = qq*(rinv*rinvsq - krf2)
+              */
+             velec            = _fjsp_mul_v2r8(qq10,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq10,rinv10),crf));
+             felec            = _fjsp_mul_v2r8(qq10,_fjsp_msub_v2r8(rinv10,rinvsq10,krf2));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             /* Update vectorial force: the same d*fscal term is accumulated into the i-site sum
+              * and into the j sum; the j sum is handed to a "decrement" helper after the last
+              * interaction.
+              */
+             fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+             
+             fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* Compute parameters for interactions between i and j atoms */
+             qq20             = _fjsp_mul_v2r8(iq2,jq0);
+             /* REACTION-FIELD ELECTROSTATICS (same expressions as for site 1, now for site 2) */
+             velec            = _fjsp_mul_v2r8(qq20,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq20,rinv20),crf));
+             felec            = _fjsp_mul_v2r8(qq20,_fjsp_msub_v2r8(rinv20,rinvsq20,krf2));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+             
+             fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* Compute parameters for interactions between i and j atoms */
+             qq30             = _fjsp_mul_v2r8(iq3,jq0);
+             /* REACTION-FIELD ELECTROSTATICS (same expressions as for site 1, now for site 3) */
+             velec            = _fjsp_mul_v2r8(qq30,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq30,rinv30),crf));
+             felec            = _fjsp_mul_v2r8(qq30,_fjsp_msub_v2r8(rinv30,rinvsq30,krf2));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             /* Update vectorial force */
+             fix3             = _fjsp_madd_v2r8(dx30,fscal,fix3);
+             fiy3             = _fjsp_madd_v2r8(dy30,fscal,fiy3);
+             fiz3             = _fjsp_madd_v2r8(dz30,fscal,fiz3);
+             
+             fjx0             = _fjsp_madd_v2r8(dx30,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy30,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz30,fscal,fjz0);
+             gmx_fjsp_decrement_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0);
+             /* Inner loop uses 108 flops */
+         }
+         if(jidx<j_index_end) /* one unpaired j particle is left over */
+         {
+             jnrA             = jjnr[jidx];
+             j_coord_offsetA  = DIM*jnrA;
+             /* load j atom coordinates (single particle, 1ptr variant of the loader) */
+             gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                               &jx0,&jy0,&jz0);
+             /* Calculate displacement vector */
+             dx10             = _fjsp_sub_v2r8(ix1,jx0);
+             dy10             = _fjsp_sub_v2r8(iy1,jy0);
+             dz10             = _fjsp_sub_v2r8(iz1,jz0);
+             dx20             = _fjsp_sub_v2r8(ix2,jx0);
+             dy20             = _fjsp_sub_v2r8(iy2,jy0);
+             dz20             = _fjsp_sub_v2r8(iz2,jz0);
+             dx30             = _fjsp_sub_v2r8(ix3,jx0);
+             dy30             = _fjsp_sub_v2r8(iy3,jy0);
+             dz30             = _fjsp_sub_v2r8(iz3,jz0);
+             /* Calculate squared distance and things based on it */
+             rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+             rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+             rsq30            = gmx_fjsp_calc_rsq_v2r8(dx30,dy30,dz30);
+             rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+             rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+             rinv30           = gmx_fjsp_invsqrt_v2r8(rsq30);
+             rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+             rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+             rinvsq30         = _fjsp_mul_v2r8(rinv30,rinv30);
+             /* Load parameters for j particles: only the charge of particle A is loaded,
+              * on top of a zero vector.
+              */
+             jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
+             fjx0             = _fjsp_setzero_v2r8();
+             fjy0             = _fjsp_setzero_v2r8();
+             fjz0             = _fjsp_setzero_v2r8();
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* Compute parameters for interactions between i and j atoms */
+             qq10             = _fjsp_mul_v2r8(iq1,jq0);
+             /* REACTION-FIELD ELECTROSTATICS (same expressions as in the paired loop above) */
+             velec            = _fjsp_mul_v2r8(qq10,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq10,rinv10),crf));
+             felec            = _fjsp_mul_v2r8(qq10,_fjsp_msub_v2r8(rinv10,rinvsq10,krf2));
+             /* Update potential sum for this i atom from the interaction with this j atom.
+              * velec (and fscal below) is first combined with a zero vector through
+              * _fjsp_unpacklo_v2r8, presumably so that the unused second SIMD position
+              * contributes nothing - confirm against the intrinsics reference.
+              */
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+             
+             fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* Compute parameters for interactions between i and j atoms */
+             qq20             = _fjsp_mul_v2r8(iq2,jq0);
+             /* REACTION-FIELD ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq20,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq20,rinv20),crf));
+             felec            = _fjsp_mul_v2r8(qq20,_fjsp_msub_v2r8(rinv20,rinvsq20,krf2));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+             
+             fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* Compute parameters for interactions between i and j atoms */
+             qq30             = _fjsp_mul_v2r8(iq3,jq0);
+             /* REACTION-FIELD ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq30,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq30,rinv30),crf));
+             felec            = _fjsp_mul_v2r8(qq30,_fjsp_msub_v2r8(rinv30,rinvsq30,krf2));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix3             = _fjsp_madd_v2r8(dx30,fscal,fix3);
+             fiy3             = _fjsp_madd_v2r8(dy30,fscal,fiy3);
+             fiz3             = _fjsp_madd_v2r8(dz30,fscal,fiz3);
+             
+             fjx0             = _fjsp_madd_v2r8(dx30,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy30,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz30,fscal,fjz0);
+             gmx_fjsp_decrement_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0);
+             /* Inner loop uses 108 flops */
+         }
+         /* End of innermost loop.
+          * Hand the accumulated i-site force sums to the i-force helper, again at +DIM so
+          * that they target sites 1..3, together with this entry's fshift slot.
+          */
+         gmx_fjsp_update_iforce_3atom_swizzle_v2r8(fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                               f+i_coord_offset+DIM,fshift+i_shift_offset);
+         ggid                        = gid[iidx];
+         /* Update potential energies: velecsum goes to the energygrp_elec slot selected by gid[iidx] */
+         gmx_fjsp_update_1pot_v2r8(velecsum,kernel_data->energygrp_elec+ggid);
+         /* Increment number of inner iterations (counted per j particle, not per SIMD pass) */
+         inneriter                  += j_index_end - j_index_start;
+         /* Outer loop uses 19 flops */
+     }
+     /* Increment number of outer iterations */
+     outeriter        += nri;
+     /* Update outer/inner flops */
+     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4_VF,outeriter*19 + inneriter*108);
+ }
+ /*
+  * Gromacs nonbonded kernel:   nb_kernel_ElecRF_VdwNone_GeomW4P1_F_sparc64_hpc_ace_double
+  * Electrostatics interaction: ReactionField
+  * VdW interaction:            None
+  * Geometry:                   Water4-Particle
+  * Calculate force/pot:        Force
+  */
+ void
+ nb_kernel_ElecRF_VdwNone_GeomW4P1_F_sparc64_hpc_ace_double
+                     (t_nblist * gmx_restrict                nlist,
+                      rvec * gmx_restrict                    xx,
+                      rvec * gmx_restrict                    ff,
+                      t_forcerec * gmx_restrict              fr,
+                      t_mdatoms * gmx_restrict               mdatoms,
+                      nb_kernel_data_t * gmx_restrict        kernel_data,
+                      t_nrnb * gmx_restrict                  nrnb)
+ {
+     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+      * just 0 for non-waters.
+      * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+      * jnr indices corresponding to data put in the two positions in the SIMD register.
+      */
+     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+     int              jnrA,jnrB;
+     int              j_coord_offsetA,j_coord_offsetB;
+     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+     real             rcutoff_scalar;
+     real             *shiftvec,*fshift,*x,*f;
+     _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+     int              vdwioffset1;
+     _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+     int              vdwioffset2;
+     _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+     int              vdwioffset3;
+     _fjsp_v2r8       ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+     int              vdwjidx0A,vdwjidx0B;
+     _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+     _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+     _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+     _fjsp_v2r8       dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30;
+     _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+     real             *charge;
+     _fjsp_v2r8       itab_tmp;
+     _fjsp_v2r8       dummy_mask,cutoff_mask;
+     _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+     _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+     union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+     x                = xx[0];
+     f                = ff[0];
+     nri              = nlist->nri;
+     iinr             = nlist->iinr;
+     jindex           = nlist->jindex;
+     jjnr             = nlist->jjnr;
+     shiftidx         = nlist->shift;
+     gid              = nlist->gid;
+     shiftvec         = fr->shift_vec[0];
+     fshift           = fr->fshift[0];
+     facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+     charge           = mdatoms->chargeA;
+     krf              = gmx_fjsp_set1_v2r8(fr->ic->k_rf);
+     krf2             = gmx_fjsp_set1_v2r8(fr->ic->k_rf*2.0);
+     crf              = gmx_fjsp_set1_v2r8(fr->ic->c_rf);
+     /* Setup water-specific parameters */
+     inr              = nlist->iinr[0];
+     iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+     iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+     iq3              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+3]));
+     /* Avoid stupid compiler warnings */
+     jnrA = jnrB = 0;
+     j_coord_offsetA = 0;
+     j_coord_offsetB = 0;
+     outeriter        = 0;
+     inneriter        = 0;
+     /* Start outer loop over neighborlists */
+     for(iidx=0; iidx<nri; iidx++)
+     {
+         /* Load shift vector for this list */
+         i_shift_offset   = DIM*shiftidx[iidx];
+         /* Load limits for loop over neighbors */
+         j_index_start    = jindex[iidx];
+         j_index_end      = jindex[iidx+1];
+         /* Get outer coordinate index */
+         inr              = iinr[iidx];
+         i_coord_offset   = DIM*inr;
+         /* Load i particle coords and add shift vector */
+         gmx_fjsp_load_shift_and_3rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset+DIM,
+                                                  &ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+         fix1             = _fjsp_setzero_v2r8();
+         fiy1             = _fjsp_setzero_v2r8();
+         fiz1             = _fjsp_setzero_v2r8();
+         fix2             = _fjsp_setzero_v2r8();
+         fiy2             = _fjsp_setzero_v2r8();
+         fiz2             = _fjsp_setzero_v2r8();
+         fix3             = _fjsp_setzero_v2r8();
+         fiy3             = _fjsp_setzero_v2r8();
+         fiz3             = _fjsp_setzero_v2r8();
+         /* Start inner kernel loop */
+         for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+         {
+             /* Get j neighbor index, and coordinate index */
+             jnrA             = jjnr[jidx];
+             jnrB             = jjnr[jidx+1];
+             j_coord_offsetA  = DIM*jnrA;
+             j_coord_offsetB  = DIM*jnrB;
+             /* load j atom coordinates */
+             gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                               &jx0,&jy0,&jz0);
+             /* Calculate displacement vector */
+             dx10             = _fjsp_sub_v2r8(ix1,jx0);
+             dy10             = _fjsp_sub_v2r8(iy1,jy0);
+             dz10             = _fjsp_sub_v2r8(iz1,jz0);
+             dx20             = _fjsp_sub_v2r8(ix2,jx0);
+             dy20             = _fjsp_sub_v2r8(iy2,jy0);
+             dz20             = _fjsp_sub_v2r8(iz2,jz0);
+             dx30             = _fjsp_sub_v2r8(ix3,jx0);
+             dy30             = _fjsp_sub_v2r8(iy3,jy0);
+             dz30             = _fjsp_sub_v2r8(iz3,jz0);
+             /* Calculate squared distance and things based on it */
+             rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+             rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+             rsq30            = gmx_fjsp_calc_rsq_v2r8(dx30,dy30,dz30);
+             rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+             rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+             rinv30           = gmx_fjsp_invsqrt_v2r8(rsq30);
+             rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+             rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+             rinvsq30         = _fjsp_mul_v2r8(rinv30,rinv30);
+             /* Load parameters for j particles */
+             jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+             fjx0             = _fjsp_setzero_v2r8();
+             fjy0             = _fjsp_setzero_v2r8();
+             fjz0             = _fjsp_setzero_v2r8();
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* Compute parameters for interactions between i and j atoms */
+             qq10             = _fjsp_mul_v2r8(iq1,jq0);
+             /* REACTION-FIELD ELECTROSTATICS */
+             felec            = _fjsp_mul_v2r8(qq10,_fjsp_msub_v2r8(rinv10,rinvsq10,krf2));
+             fscal            = felec;
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+             
+             fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* Compute parameters for interactions between i and j atoms */
+             qq20             = _fjsp_mul_v2r8(iq2,jq0);
+             /* REACTION-FIELD ELECTROSTATICS */
+             felec            = _fjsp_mul_v2r8(qq20,_fjsp_msub_v2r8(rinv20,rinvsq20,krf2));
+             fscal            = felec;
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+             
+             fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* Compute parameters for interactions between i and j atoms */
+             qq30             = _fjsp_mul_v2r8(iq3,jq0);
+             /* REACTION-FIELD ELECTROSTATICS */
+             felec            = _fjsp_mul_v2r8(qq30,_fjsp_msub_v2r8(rinv30,rinvsq30,krf2));
+             fscal            = felec;
+             /* Update vectorial force */
+             fix3             = _fjsp_madd_v2r8(dx30,fscal,fix3);
+             fiy3             = _fjsp_madd_v2r8(dy30,fscal,fiy3);
+             fiz3             = _fjsp_madd_v2r8(dz30,fscal,fiz3);
+             
+             fjx0             = _fjsp_madd_v2r8(dx30,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy30,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz30,fscal,fjz0);
+             gmx_fjsp_decrement_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0);
+             /* Inner loop uses 93 flops */
+         }
+         if(jidx<j_index_end)
+         {
+             jnrA             = jjnr[jidx];
+             j_coord_offsetA  = DIM*jnrA;
+             /* load j atom coordinates */
+             gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                               &jx0,&jy0,&jz0);
+             /* Calculate displacement vector */
+             dx10             = _fjsp_sub_v2r8(ix1,jx0);
+             dy10             = _fjsp_sub_v2r8(iy1,jy0);
+             dz10             = _fjsp_sub_v2r8(iz1,jz0);
+             dx20             = _fjsp_sub_v2r8(ix2,jx0);
+             dy20             = _fjsp_sub_v2r8(iy2,jy0);
+             dz20             = _fjsp_sub_v2r8(iz2,jz0);
+             dx30             = _fjsp_sub_v2r8(ix3,jx0);
+             dy30             = _fjsp_sub_v2r8(iy3,jy0);
+             dz30             = _fjsp_sub_v2r8(iz3,jz0);
+             /* Calculate squared distance and things based on it */
+             rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+             rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+             rsq30            = gmx_fjsp_calc_rsq_v2r8(dx30,dy30,dz30);
+             rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+             rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+             rinv30           = gmx_fjsp_invsqrt_v2r8(rsq30);
+             rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+             rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+             rinvsq30         = _fjsp_mul_v2r8(rinv30,rinv30);
+             /* Load parameters for j particles */
+             jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
+             fjx0             = _fjsp_setzero_v2r8();
+             fjy0             = _fjsp_setzero_v2r8();
+             fjz0             = _fjsp_setzero_v2r8();
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* Compute parameters for interactions between i and j atoms */
+             qq10             = _fjsp_mul_v2r8(iq1,jq0);
+             /* REACTION-FIELD ELECTROSTATICS */
+             felec            = _fjsp_mul_v2r8(qq10,_fjsp_msub_v2r8(rinv10,rinvsq10,krf2));
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+             
+             fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* Compute parameters for interactions between i and j atoms */
+             qq20             = _fjsp_mul_v2r8(iq2,jq0);
+             /* REACTION-FIELD ELECTROSTATICS */
+             felec            = _fjsp_mul_v2r8(qq20,_fjsp_msub_v2r8(rinv20,rinvsq20,krf2));
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+             
+             fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* Compute parameters for interactions between i and j atoms */
+             qq30             = _fjsp_mul_v2r8(iq3,jq0);
+             /* REACTION-FIELD ELECTROSTATICS */
+             felec            = _fjsp_mul_v2r8(qq30,_fjsp_msub_v2r8(rinv30,rinvsq30,krf2));
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix3             = _fjsp_madd_v2r8(dx30,fscal,fix3);
+             fiy3             = _fjsp_madd_v2r8(dy30,fscal,fiy3);
+             fiz3             = _fjsp_madd_v2r8(dz30,fscal,fiz3);
+             
+             fjx0             = _fjsp_madd_v2r8(dx30,fscal,fjx0);
+             fjy0             = _fjsp_madd_v2r8(dy30,fscal,fjy0);
+             fjz0             = _fjsp_madd_v2r8(dz30,fscal,fjz0);
+             gmx_fjsp_decrement_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0);
+             /* Inner loop uses 93 flops */
+         }
+         /* End of innermost loop */
+         gmx_fjsp_update_iforce_3atom_swizzle_v2r8(fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                               f+i_coord_offset+DIM,fshift+i_shift_offset);
+         /* Increment number of inner iterations */
+         inneriter                  += j_index_end - j_index_start;
+         /* Outer loop uses 18 flops */
+     }
+     /* Increment number of outer iterations */
+     outeriter        += nri;
+     /* Update outer/inner flops */
+     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4_F,outeriter*18 + inneriter*93);
+ }
index 0000000000000000000000000000000000000000,b6b4e44d6a4f6405db1004b070eae59042c9b04b..b6b4e44d6a4f6405db1004b070eae59042c9b04b
mode 000000,100644..100644
--- /dev/null
@@@ -1,0 -1,1468 +1,1468 @@@
+ /*
+  * This file is part of the GROMACS molecular simulation package.
+  *
+  * Copyright (c) 2012, by the GROMACS development team, led by
+  * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+  * others, as listed in the AUTHORS file in the top-level source
+  * directory and at http://www.gromacs.org.
+  *
+  * GROMACS is free software; you can redistribute it and/or
+  * modify it under the terms of the GNU Lesser General Public License
+  * as published by the Free Software Foundation; either version 2.1
+  * of the License, or (at your option) any later version.
+  *
+  * GROMACS is distributed in the hope that it will be useful,
+  * but WITHOUT ANY WARRANTY; without even the implied warranty of
+  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  * Lesser General Public License for more details.
+  *
+  * You should have received a copy of the GNU Lesser General Public
+  * License along with GROMACS; if not, see
+  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+  *
+  * If you want to redistribute modifications to GROMACS, please
+  * consider that scientific software is very special. Version
+  * control is crucial - bugs must be traceable. We will be happy to
+  * consider code for inclusion in the official distribution, but
+  * derived work must not be called official GROMACS. Details are found
+  * in the README & COPYING files - if they are missing, get the
+  * official version at http://www.gromacs.org.
+  *
+  * To help us fund GROMACS development, we humbly ask that you cite
+  * the research papers on the package. Check out http://www.gromacs.org.
+  */
+ /*
+  * Note: this file was generated by the GROMACS sparc64_hpc_ace_double kernel generator.
+  */
+ #ifdef HAVE_CONFIG_H
+ #include <config.h>
+ #endif
+ #include <math.h>
+ #include "../nb_kernel.h"
+ #include "types/simple.h"
+ #include "vec.h"
+ #include "nrnb.h"
+ #include "kernelutil_sparc64_hpc_ace_double.h"
+ /*
+  * Gromacs nonbonded kernel:   nb_kernel_ElecRF_VdwNone_GeomW4W4_VF_sparc64_hpc_ace_double
+  * Electrostatics interaction: ReactionField
+  * VdW interaction:            None
+  * Geometry:                   Water4-Water4
+  * Calculate force/pot:        PotentialAndForce
+  */
+ void
+ nb_kernel_ElecRF_VdwNone_GeomW4W4_VF_sparc64_hpc_ace_double
+                     (t_nblist * gmx_restrict                nlist,
+                      rvec * gmx_restrict                    xx,
+                      rvec * gmx_restrict                    ff,
+                      t_forcerec * gmx_restrict              fr,
+                      t_mdatoms * gmx_restrict               mdatoms,
+                      nb_kernel_data_t * gmx_restrict        kernel_data,
+                      t_nrnb * gmx_restrict                  nrnb)
+ {
+     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+      * just 0 for non-waters.
+      * Suffixes A,B refer to j loop unrolling done with double precision SIMD, i.e. the two different
+      * jnr indices corresponding to data put in the two positions in the SIMD register.
+      */
+     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+     int              jnrA,jnrB;
+     int              j_coord_offsetA,j_coord_offsetB;
+     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+     real             rcutoff_scalar;
+     real             *shiftvec,*fshift,*x,*f;
+     _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+     int              vdwioffset1;
+     _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+     int              vdwioffset2;
+     _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+     int              vdwioffset3;
+     _fjsp_v2r8       ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+     int              vdwjidx1A,vdwjidx1B;
+     _fjsp_v2r8       jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+     int              vdwjidx2A,vdwjidx2B;
+     _fjsp_v2r8       jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+     int              vdwjidx3A,vdwjidx3B;
+     _fjsp_v2r8       jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
+     _fjsp_v2r8       dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+     _fjsp_v2r8       dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+     _fjsp_v2r8       dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
+     _fjsp_v2r8       dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+     _fjsp_v2r8       dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+     _fjsp_v2r8       dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
+     _fjsp_v2r8       dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
+     _fjsp_v2r8       dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
+     _fjsp_v2r8       dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
+     _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+     real             *charge;
+     _fjsp_v2r8       itab_tmp;
+     _fjsp_v2r8       dummy_mask,cutoff_mask;
+     _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+     _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+     union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+     x                = xx[0];
+     f                = ff[0];
+     nri              = nlist->nri;
+     iinr             = nlist->iinr;
+     jindex           = nlist->jindex;
+     jjnr             = nlist->jjnr;
+     shiftidx         = nlist->shift;
+     gid              = nlist->gid;
+     shiftvec         = fr->shift_vec[0];
+     fshift           = fr->fshift[0];
+     facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+     charge           = mdatoms->chargeA;
+     krf              = gmx_fjsp_set1_v2r8(fr->ic->k_rf);
+     krf2             = gmx_fjsp_set1_v2r8(fr->ic->k_rf*2.0);
+     crf              = gmx_fjsp_set1_v2r8(fr->ic->c_rf);
+     /* Setup water-specific parameters */
+     inr              = nlist->iinr[0];
+     iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+     iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+     iq3              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+3]));
+     jq1              = gmx_fjsp_set1_v2r8(charge[inr+1]);
+     jq2              = gmx_fjsp_set1_v2r8(charge[inr+2]);
+     jq3              = gmx_fjsp_set1_v2r8(charge[inr+3]);
+     qq11             = _fjsp_mul_v2r8(iq1,jq1);
+     qq12             = _fjsp_mul_v2r8(iq1,jq2);
+     qq13             = _fjsp_mul_v2r8(iq1,jq3);
+     qq21             = _fjsp_mul_v2r8(iq2,jq1);
+     qq22             = _fjsp_mul_v2r8(iq2,jq2);
+     qq23             = _fjsp_mul_v2r8(iq2,jq3);
+     qq31             = _fjsp_mul_v2r8(iq3,jq1);
+     qq32             = _fjsp_mul_v2r8(iq3,jq2);
+     qq33             = _fjsp_mul_v2r8(iq3,jq3);
+     /* Avoid stupid compiler warnings */
+     jnrA = jnrB = 0;
+     j_coord_offsetA = 0;
+     j_coord_offsetB = 0;
+     outeriter        = 0;
+     inneriter        = 0;
+     /* Start outer loop over neighborlists */
+     for(iidx=0; iidx<nri; iidx++)
+     {
+         /* Load shift vector for this list */
+         i_shift_offset   = DIM*shiftidx[iidx];
+         /* Load limits for loop over neighbors */
+         j_index_start    = jindex[iidx];
+         j_index_end      = jindex[iidx+1];
+         /* Get outer coordinate index */
+         inr              = iinr[iidx];
+         i_coord_offset   = DIM*inr;
+         /* Load i particle coords and add shift vector */
+         gmx_fjsp_load_shift_and_3rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset+DIM,
+                                                  &ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+         fix1             = _fjsp_setzero_v2r8();
+         fiy1             = _fjsp_setzero_v2r8();
+         fiz1             = _fjsp_setzero_v2r8();
+         fix2             = _fjsp_setzero_v2r8();
+         fiy2             = _fjsp_setzero_v2r8();
+         fiz2             = _fjsp_setzero_v2r8();
+         fix3             = _fjsp_setzero_v2r8();
+         fiy3             = _fjsp_setzero_v2r8();
+         fiz3             = _fjsp_setzero_v2r8();
+         /* Reset potential sums */
+         velecsum         = _fjsp_setzero_v2r8();
+         /* Start inner kernel loop */
+         for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+         {
+             /* Get j neighbor index, and coordinate index */
+             jnrA             = jjnr[jidx];
+             jnrB             = jjnr[jidx+1];
+             j_coord_offsetA  = DIM*jnrA;
+             j_coord_offsetB  = DIM*jnrB;
+             /* load j atom coordinates */
+             gmx_fjsp_load_3rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA+DIM,x+j_coord_offsetB+DIM,
+                                               &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
+             /* Calculate displacement vector */
+             dx11             = _fjsp_sub_v2r8(ix1,jx1);
+             dy11             = _fjsp_sub_v2r8(iy1,jy1);
+             dz11             = _fjsp_sub_v2r8(iz1,jz1);
+             dx12             = _fjsp_sub_v2r8(ix1,jx2);
+             dy12             = _fjsp_sub_v2r8(iy1,jy2);
+             dz12             = _fjsp_sub_v2r8(iz1,jz2);
+             dx13             = _fjsp_sub_v2r8(ix1,jx3);
+             dy13             = _fjsp_sub_v2r8(iy1,jy3);
+             dz13             = _fjsp_sub_v2r8(iz1,jz3);
+             dx21             = _fjsp_sub_v2r8(ix2,jx1);
+             dy21             = _fjsp_sub_v2r8(iy2,jy1);
+             dz21             = _fjsp_sub_v2r8(iz2,jz1);
+             dx22             = _fjsp_sub_v2r8(ix2,jx2);
+             dy22             = _fjsp_sub_v2r8(iy2,jy2);
+             dz22             = _fjsp_sub_v2r8(iz2,jz2);
+             dx23             = _fjsp_sub_v2r8(ix2,jx3);
+             dy23             = _fjsp_sub_v2r8(iy2,jy3);
+             dz23             = _fjsp_sub_v2r8(iz2,jz3);
+             dx31             = _fjsp_sub_v2r8(ix3,jx1);
+             dy31             = _fjsp_sub_v2r8(iy3,jy1);
+             dz31             = _fjsp_sub_v2r8(iz3,jz1);
+             dx32             = _fjsp_sub_v2r8(ix3,jx2);
+             dy32             = _fjsp_sub_v2r8(iy3,jy2);
+             dz32             = _fjsp_sub_v2r8(iz3,jz2);
+             dx33             = _fjsp_sub_v2r8(ix3,jx3);
+             dy33             = _fjsp_sub_v2r8(iy3,jy3);
+             dz33             = _fjsp_sub_v2r8(iz3,jz3);
+             /* Calculate squared distance and things based on it */
+             rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+             rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+             rsq13            = gmx_fjsp_calc_rsq_v2r8(dx13,dy13,dz13);
+             rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+             rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+             rsq23            = gmx_fjsp_calc_rsq_v2r8(dx23,dy23,dz23);
+             rsq31            = gmx_fjsp_calc_rsq_v2r8(dx31,dy31,dz31);
+             rsq32            = gmx_fjsp_calc_rsq_v2r8(dx32,dy32,dz32);
+             rsq33            = gmx_fjsp_calc_rsq_v2r8(dx33,dy33,dz33);
+             rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+             rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+             rinv13           = gmx_fjsp_invsqrt_v2r8(rsq13);
+             rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+             rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+             rinv23           = gmx_fjsp_invsqrt_v2r8(rsq23);
+             rinv31           = gmx_fjsp_invsqrt_v2r8(rsq31);
+             rinv32           = gmx_fjsp_invsqrt_v2r8(rsq32);
+             rinv33           = gmx_fjsp_invsqrt_v2r8(rsq33);
+             rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+             rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+             rinvsq13         = _fjsp_mul_v2r8(rinv13,rinv13);
+             rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+             rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+             rinvsq23         = _fjsp_mul_v2r8(rinv23,rinv23);
+             rinvsq31         = _fjsp_mul_v2r8(rinv31,rinv31);
+             rinvsq32         = _fjsp_mul_v2r8(rinv32,rinv32);
+             rinvsq33         = _fjsp_mul_v2r8(rinv33,rinv33);
+             fjx1             = _fjsp_setzero_v2r8();
+             fjy1             = _fjsp_setzero_v2r8();
+             fjz1             = _fjsp_setzero_v2r8();
+             fjx2             = _fjsp_setzero_v2r8();
+             fjy2             = _fjsp_setzero_v2r8();
+             fjz2             = _fjsp_setzero_v2r8();
+             fjx3             = _fjsp_setzero_v2r8();
+             fjy3             = _fjsp_setzero_v2r8();
+             fjz3             = _fjsp_setzero_v2r8();
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* REACTION-FIELD ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq11,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq11,rinv11),crf));
+             felec            = _fjsp_mul_v2r8(qq11,_fjsp_msub_v2r8(rinv11,rinvsq11,krf2));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+             
+             fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* REACTION-FIELD ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq12,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq12,rinv12),crf));
+             felec            = _fjsp_mul_v2r8(qq12,_fjsp_msub_v2r8(rinv12,rinvsq12,krf2));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+             
+             fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* REACTION-FIELD ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq13,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq13,rinv13),crf));
+             felec            = _fjsp_mul_v2r8(qq13,_fjsp_msub_v2r8(rinv13,rinvsq13,krf2));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx13,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy13,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz13,fscal,fiz1);
+             
+             fjx3             = _fjsp_madd_v2r8(dx13,fscal,fjx3);
+             fjy3             = _fjsp_madd_v2r8(dy13,fscal,fjy3);
+             fjz3             = _fjsp_madd_v2r8(dz13,fscal,fjz3);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* REACTION-FIELD ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq21,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq21,rinv21),crf));
+             felec            = _fjsp_mul_v2r8(qq21,_fjsp_msub_v2r8(rinv21,rinvsq21,krf2));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+             
+             fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* REACTION-FIELD ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq22,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq22,rinv22),crf));
+             felec            = _fjsp_mul_v2r8(qq22,_fjsp_msub_v2r8(rinv22,rinvsq22,krf2));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+             
+             fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* REACTION-FIELD ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq23,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq23,rinv23),crf));
+             felec            = _fjsp_mul_v2r8(qq23,_fjsp_msub_v2r8(rinv23,rinvsq23,krf2));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx23,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy23,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz23,fscal,fiz2);
+             
+             fjx3             = _fjsp_madd_v2r8(dx23,fscal,fjx3);
+             fjy3             = _fjsp_madd_v2r8(dy23,fscal,fjy3);
+             fjz3             = _fjsp_madd_v2r8(dz23,fscal,fjz3);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* REACTION-FIELD ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq31,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq31,rinv31),crf));
+             felec            = _fjsp_mul_v2r8(qq31,_fjsp_msub_v2r8(rinv31,rinvsq31,krf2));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             /* Update vectorial force */
+             fix3             = _fjsp_madd_v2r8(dx31,fscal,fix3);
+             fiy3             = _fjsp_madd_v2r8(dy31,fscal,fiy3);
+             fiz3             = _fjsp_madd_v2r8(dz31,fscal,fiz3);
+             
+             fjx1             = _fjsp_madd_v2r8(dx31,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy31,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz31,fscal,fjz1);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* REACTION-FIELD ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq32,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq32,rinv32),crf));
+             felec            = _fjsp_mul_v2r8(qq32,_fjsp_msub_v2r8(rinv32,rinvsq32,krf2));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             /* Update vectorial force */
+             fix3             = _fjsp_madd_v2r8(dx32,fscal,fix3);
+             fiy3             = _fjsp_madd_v2r8(dy32,fscal,fiy3);
+             fiz3             = _fjsp_madd_v2r8(dz32,fscal,fiz3);
+             
+             fjx2             = _fjsp_madd_v2r8(dx32,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy32,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz32,fscal,fjz2);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* REACTION-FIELD ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq33,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq33,rinv33),crf));
+             felec            = _fjsp_mul_v2r8(qq33,_fjsp_msub_v2r8(rinv33,rinvsq33,krf2));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             /* Update vectorial force */
+             fix3             = _fjsp_madd_v2r8(dx33,fscal,fix3);
+             fiy3             = _fjsp_madd_v2r8(dy33,fscal,fiy3);
+             fiz3             = _fjsp_madd_v2r8(dz33,fscal,fiz3);
+             
+             fjx3             = _fjsp_madd_v2r8(dx33,fscal,fjx3);
+             fjy3             = _fjsp_madd_v2r8(dy33,fscal,fjy3);
+             fjz3             = _fjsp_madd_v2r8(dz33,fscal,fjz3);
+             gmx_fjsp_decrement_3rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA+DIM,f+j_coord_offsetB+DIM,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+             /* Inner loop uses 315 flops */
+         }
+         if(jidx<j_index_end)
+         {
+             jnrA             = jjnr[jidx];
+             j_coord_offsetA  = DIM*jnrA;
+             /* load j atom coordinates */
+             gmx_fjsp_load_3rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA+DIM,
+                                               &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
+             /* Calculate displacement vector */
+             dx11             = _fjsp_sub_v2r8(ix1,jx1);
+             dy11             = _fjsp_sub_v2r8(iy1,jy1);
+             dz11             = _fjsp_sub_v2r8(iz1,jz1);
+             dx12             = _fjsp_sub_v2r8(ix1,jx2);
+             dy12             = _fjsp_sub_v2r8(iy1,jy2);
+             dz12             = _fjsp_sub_v2r8(iz1,jz2);
+             dx13             = _fjsp_sub_v2r8(ix1,jx3);
+             dy13             = _fjsp_sub_v2r8(iy1,jy3);
+             dz13             = _fjsp_sub_v2r8(iz1,jz3);
+             dx21             = _fjsp_sub_v2r8(ix2,jx1);
+             dy21             = _fjsp_sub_v2r8(iy2,jy1);
+             dz21             = _fjsp_sub_v2r8(iz2,jz1);
+             dx22             = _fjsp_sub_v2r8(ix2,jx2);
+             dy22             = _fjsp_sub_v2r8(iy2,jy2);
+             dz22             = _fjsp_sub_v2r8(iz2,jz2);
+             dx23             = _fjsp_sub_v2r8(ix2,jx3);
+             dy23             = _fjsp_sub_v2r8(iy2,jy3);
+             dz23             = _fjsp_sub_v2r8(iz2,jz3);
+             dx31             = _fjsp_sub_v2r8(ix3,jx1);
+             dy31             = _fjsp_sub_v2r8(iy3,jy1);
+             dz31             = _fjsp_sub_v2r8(iz3,jz1);
+             dx32             = _fjsp_sub_v2r8(ix3,jx2);
+             dy32             = _fjsp_sub_v2r8(iy3,jy2);
+             dz32             = _fjsp_sub_v2r8(iz3,jz2);
+             dx33             = _fjsp_sub_v2r8(ix3,jx3);
+             dy33             = _fjsp_sub_v2r8(iy3,jy3);
+             dz33             = _fjsp_sub_v2r8(iz3,jz3);
+             /* Calculate squared distance and things based on it */
+             rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+             rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+             rsq13            = gmx_fjsp_calc_rsq_v2r8(dx13,dy13,dz13);
+             rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+             rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+             rsq23            = gmx_fjsp_calc_rsq_v2r8(dx23,dy23,dz23);
+             rsq31            = gmx_fjsp_calc_rsq_v2r8(dx31,dy31,dz31);
+             rsq32            = gmx_fjsp_calc_rsq_v2r8(dx32,dy32,dz32);
+             rsq33            = gmx_fjsp_calc_rsq_v2r8(dx33,dy33,dz33);
+             rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+             rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+             rinv13           = gmx_fjsp_invsqrt_v2r8(rsq13);
+             rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+             rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+             rinv23           = gmx_fjsp_invsqrt_v2r8(rsq23);
+             rinv31           = gmx_fjsp_invsqrt_v2r8(rsq31);
+             rinv32           = gmx_fjsp_invsqrt_v2r8(rsq32);
+             rinv33           = gmx_fjsp_invsqrt_v2r8(rsq33);
+             rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+             rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+             rinvsq13         = _fjsp_mul_v2r8(rinv13,rinv13);
+             rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+             rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+             rinvsq23         = _fjsp_mul_v2r8(rinv23,rinv23);
+             rinvsq31         = _fjsp_mul_v2r8(rinv31,rinv31);
+             rinvsq32         = _fjsp_mul_v2r8(rinv32,rinv32);
+             rinvsq33         = _fjsp_mul_v2r8(rinv33,rinv33);
+             fjx1             = _fjsp_setzero_v2r8();
+             fjy1             = _fjsp_setzero_v2r8();
+             fjz1             = _fjsp_setzero_v2r8();
+             fjx2             = _fjsp_setzero_v2r8();
+             fjy2             = _fjsp_setzero_v2r8();
+             fjz2             = _fjsp_setzero_v2r8();
+             fjx3             = _fjsp_setzero_v2r8();
+             fjy3             = _fjsp_setzero_v2r8();
+             fjz3             = _fjsp_setzero_v2r8();
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* REACTION-FIELD ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq11,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq11,rinv11),crf));
+             felec            = _fjsp_mul_v2r8(qq11,_fjsp_msub_v2r8(rinv11,rinvsq11,krf2));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+             
+             fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* REACTION-FIELD ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq12,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq12,rinv12),crf));
+             felec            = _fjsp_mul_v2r8(qq12,_fjsp_msub_v2r8(rinv12,rinvsq12,krf2));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+             
+             fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* REACTION-FIELD ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq13,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq13,rinv13),crf));
+             felec            = _fjsp_mul_v2r8(qq13,_fjsp_msub_v2r8(rinv13,rinvsq13,krf2));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx13,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy13,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz13,fscal,fiz1);
+             
+             fjx3             = _fjsp_madd_v2r8(dx13,fscal,fjx3);
+             fjy3             = _fjsp_madd_v2r8(dy13,fscal,fjy3);
+             fjz3             = _fjsp_madd_v2r8(dz13,fscal,fjz3);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* REACTION-FIELD ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq21,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq21,rinv21),crf));
+             felec            = _fjsp_mul_v2r8(qq21,_fjsp_msub_v2r8(rinv21,rinvsq21,krf2));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+             
+             fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* REACTION-FIELD ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq22,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq22,rinv22),crf));
+             felec            = _fjsp_mul_v2r8(qq22,_fjsp_msub_v2r8(rinv22,rinvsq22,krf2));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+             
+             fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* REACTION-FIELD ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq23,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq23,rinv23),crf));
+             felec            = _fjsp_mul_v2r8(qq23,_fjsp_msub_v2r8(rinv23,rinvsq23,krf2));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx23,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy23,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz23,fscal,fiz2);
+             
+             fjx3             = _fjsp_madd_v2r8(dx23,fscal,fjx3);
+             fjy3             = _fjsp_madd_v2r8(dy23,fscal,fjy3);
+             fjz3             = _fjsp_madd_v2r8(dz23,fscal,fjz3);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* REACTION-FIELD ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq31,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq31,rinv31),crf));
+             felec            = _fjsp_mul_v2r8(qq31,_fjsp_msub_v2r8(rinv31,rinvsq31,krf2));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix3             = _fjsp_madd_v2r8(dx31,fscal,fix3);
+             fiy3             = _fjsp_madd_v2r8(dy31,fscal,fiy3);
+             fiz3             = _fjsp_madd_v2r8(dz31,fscal,fiz3);
+             
+             fjx1             = _fjsp_madd_v2r8(dx31,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy31,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz31,fscal,fjz1);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* REACTION-FIELD ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq32,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq32,rinv32),crf));
+             felec            = _fjsp_mul_v2r8(qq32,_fjsp_msub_v2r8(rinv32,rinvsq32,krf2));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix3             = _fjsp_madd_v2r8(dx32,fscal,fix3);
+             fiy3             = _fjsp_madd_v2r8(dy32,fscal,fiy3);
+             fiz3             = _fjsp_madd_v2r8(dz32,fscal,fiz3);
+             
+             fjx2             = _fjsp_madd_v2r8(dx32,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy32,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz32,fscal,fjz2);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* REACTION-FIELD ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq33,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq33,rinv33),crf));
+             felec            = _fjsp_mul_v2r8(qq33,_fjsp_msub_v2r8(rinv33,rinvsq33,krf2));
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix3             = _fjsp_madd_v2r8(dx33,fscal,fix3);
+             fiy3             = _fjsp_madd_v2r8(dy33,fscal,fiy3);
+             fiz3             = _fjsp_madd_v2r8(dz33,fscal,fiz3);
+             
+             fjx3             = _fjsp_madd_v2r8(dx33,fscal,fjx3);
+             fjy3             = _fjsp_madd_v2r8(dy33,fscal,fjy3);
+             fjz3             = _fjsp_madd_v2r8(dz33,fscal,fjz3);
+             gmx_fjsp_decrement_3rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA+DIM,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+             /* Inner loop uses 315 flops */
+         }
+         /* End of innermost loop */
+         gmx_fjsp_update_iforce_3atom_swizzle_v2r8(fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                               f+i_coord_offset+DIM,fshift+i_shift_offset);
+         ggid                        = gid[iidx];
+         /* Update potential energies */
+         gmx_fjsp_update_1pot_v2r8(velecsum,kernel_data->energygrp_elec+ggid);
+         /* Increment number of inner iterations */
+         inneriter                  += j_index_end - j_index_start;
+         /* Outer loop uses 19 flops */
+     }
+     /* Increment number of outer iterations */
+     outeriter        += nri;
+     /* Update outer/inner flops */
+     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4W4_VF,outeriter*19 + inneriter*315);
+ }
+ /*
+  * Gromacs nonbonded kernel:   nb_kernel_ElecRF_VdwNone_GeomW4W4_F_sparc64_hpc_ace_double
+  * Electrostatics interaction: ReactionField
+  * VdW interaction:            None
+  * Geometry:                   Water4-Water4
+  * Calculate force/pot:        Force
+  */
+ void
+ nb_kernel_ElecRF_VdwNone_GeomW4W4_F_sparc64_hpc_ace_double
+                     (t_nblist * gmx_restrict                nlist,
+                      rvec * gmx_restrict                    xx,
+                      rvec * gmx_restrict                    ff,
+                      t_forcerec * gmx_restrict              fr,
+                      t_mdatoms * gmx_restrict               mdatoms,
+                      nb_kernel_data_t * gmx_restrict        kernel_data,
+                      t_nrnb * gmx_restrict                  nrnb)
+ {
+     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+      * just 0 for non-waters.
+      * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+      * jnr indices corresponding to data put in the two positions in the SIMD register.
+      */
+     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+     int              jnrA,jnrB;
+     int              j_coord_offsetA,j_coord_offsetB;
+     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+     real             rcutoff_scalar;
+     real             *shiftvec,*fshift,*x,*f;
+     _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+     int              vdwioffset1;
+     _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+     int              vdwioffset2;
+     _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+     int              vdwioffset3;
+     _fjsp_v2r8       ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+     int              vdwjidx1A,vdwjidx1B;
+     _fjsp_v2r8       jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+     int              vdwjidx2A,vdwjidx2B;
+     _fjsp_v2r8       jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+     int              vdwjidx3A,vdwjidx3B;
+     _fjsp_v2r8       jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
+     _fjsp_v2r8       dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+     _fjsp_v2r8       dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+     _fjsp_v2r8       dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
+     _fjsp_v2r8       dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+     _fjsp_v2r8       dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+     _fjsp_v2r8       dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
+     _fjsp_v2r8       dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
+     _fjsp_v2r8       dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
+     _fjsp_v2r8       dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
+     _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+     real             *charge;
+     _fjsp_v2r8       itab_tmp;
+     _fjsp_v2r8       dummy_mask,cutoff_mask;
+     _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+     _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+     union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+     x                = xx[0];
+     f                = ff[0];
+     nri              = nlist->nri;
+     iinr             = nlist->iinr;
+     jindex           = nlist->jindex;
+     jjnr             = nlist->jjnr;
+     shiftidx         = nlist->shift;
+     gid              = nlist->gid;
+     shiftvec         = fr->shift_vec[0];
+     fshift           = fr->fshift[0];
+     facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+     charge           = mdatoms->chargeA;
+     krf              = gmx_fjsp_set1_v2r8(fr->ic->k_rf);
+     krf2             = gmx_fjsp_set1_v2r8(fr->ic->k_rf*2.0);
+     crf              = gmx_fjsp_set1_v2r8(fr->ic->c_rf);
+     /* Setup water-specific parameters */
+     inr              = nlist->iinr[0];
+     iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+     iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+     iq3              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+3]));
+     jq1              = gmx_fjsp_set1_v2r8(charge[inr+1]);
+     jq2              = gmx_fjsp_set1_v2r8(charge[inr+2]);
+     jq3              = gmx_fjsp_set1_v2r8(charge[inr+3]);
+     qq11             = _fjsp_mul_v2r8(iq1,jq1);
+     qq12             = _fjsp_mul_v2r8(iq1,jq2);
+     qq13             = _fjsp_mul_v2r8(iq1,jq3);
+     qq21             = _fjsp_mul_v2r8(iq2,jq1);
+     qq22             = _fjsp_mul_v2r8(iq2,jq2);
+     qq23             = _fjsp_mul_v2r8(iq2,jq3);
+     qq31             = _fjsp_mul_v2r8(iq3,jq1);
+     qq32             = _fjsp_mul_v2r8(iq3,jq2);
+     qq33             = _fjsp_mul_v2r8(iq3,jq3);
+     /* Avoid stupid compiler warnings */
+     jnrA = jnrB = 0;
+     j_coord_offsetA = 0;
+     j_coord_offsetB = 0;
+     outeriter        = 0;
+     inneriter        = 0;
+     /* Start outer loop over neighborlists */
+     for(iidx=0; iidx<nri; iidx++)
+     {
+         /* Load shift vector for this list */
+         i_shift_offset   = DIM*shiftidx[iidx];
+         /* Load limits for loop over neighbors */
+         j_index_start    = jindex[iidx];
+         j_index_end      = jindex[iidx+1];
+         /* Get outer coordinate index */
+         inr              = iinr[iidx];
+         i_coord_offset   = DIM*inr;
+         /* Load i particle coords and add shift vector */
+         gmx_fjsp_load_shift_and_3rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset+DIM,
+                                                  &ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+         fix1             = _fjsp_setzero_v2r8();
+         fiy1             = _fjsp_setzero_v2r8();
+         fiz1             = _fjsp_setzero_v2r8();
+         fix2             = _fjsp_setzero_v2r8();
+         fiy2             = _fjsp_setzero_v2r8();
+         fiz2             = _fjsp_setzero_v2r8();
+         fix3             = _fjsp_setzero_v2r8();
+         fiy3             = _fjsp_setzero_v2r8();
+         fiz3             = _fjsp_setzero_v2r8();
+         /* Start inner kernel loop */
+         for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+         {
+             /* Get j neighbor index, and coordinate index */
+             jnrA             = jjnr[jidx];
+             jnrB             = jjnr[jidx+1];
+             j_coord_offsetA  = DIM*jnrA;
+             j_coord_offsetB  = DIM*jnrB;
+             /* load j atom coordinates */
+             gmx_fjsp_load_3rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA+DIM,x+j_coord_offsetB+DIM,
+                                               &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
+             /* Calculate displacement vector */
+             dx11             = _fjsp_sub_v2r8(ix1,jx1);
+             dy11             = _fjsp_sub_v2r8(iy1,jy1);
+             dz11             = _fjsp_sub_v2r8(iz1,jz1);
+             dx12             = _fjsp_sub_v2r8(ix1,jx2);
+             dy12             = _fjsp_sub_v2r8(iy1,jy2);
+             dz12             = _fjsp_sub_v2r8(iz1,jz2);
+             dx13             = _fjsp_sub_v2r8(ix1,jx3);
+             dy13             = _fjsp_sub_v2r8(iy1,jy3);
+             dz13             = _fjsp_sub_v2r8(iz1,jz3);
+             dx21             = _fjsp_sub_v2r8(ix2,jx1);
+             dy21             = _fjsp_sub_v2r8(iy2,jy1);
+             dz21             = _fjsp_sub_v2r8(iz2,jz1);
+             dx22             = _fjsp_sub_v2r8(ix2,jx2);
+             dy22             = _fjsp_sub_v2r8(iy2,jy2);
+             dz22             = _fjsp_sub_v2r8(iz2,jz2);
+             dx23             = _fjsp_sub_v2r8(ix2,jx3);
+             dy23             = _fjsp_sub_v2r8(iy2,jy3);
+             dz23             = _fjsp_sub_v2r8(iz2,jz3);
+             dx31             = _fjsp_sub_v2r8(ix3,jx1);
+             dy31             = _fjsp_sub_v2r8(iy3,jy1);
+             dz31             = _fjsp_sub_v2r8(iz3,jz1);
+             dx32             = _fjsp_sub_v2r8(ix3,jx2);
+             dy32             = _fjsp_sub_v2r8(iy3,jy2);
+             dz32             = _fjsp_sub_v2r8(iz3,jz2);
+             dx33             = _fjsp_sub_v2r8(ix3,jx3);
+             dy33             = _fjsp_sub_v2r8(iy3,jy3);
+             dz33             = _fjsp_sub_v2r8(iz3,jz3);
+             /* Calculate squared distance and things based on it */
+             rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+             rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+             rsq13            = gmx_fjsp_calc_rsq_v2r8(dx13,dy13,dz13);
+             rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+             rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+             rsq23            = gmx_fjsp_calc_rsq_v2r8(dx23,dy23,dz23);
+             rsq31            = gmx_fjsp_calc_rsq_v2r8(dx31,dy31,dz31);
+             rsq32            = gmx_fjsp_calc_rsq_v2r8(dx32,dy32,dz32);
+             rsq33            = gmx_fjsp_calc_rsq_v2r8(dx33,dy33,dz33);
+             rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+             rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+             rinv13           = gmx_fjsp_invsqrt_v2r8(rsq13);
+             rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+             rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+             rinv23           = gmx_fjsp_invsqrt_v2r8(rsq23);
+             rinv31           = gmx_fjsp_invsqrt_v2r8(rsq31);
+             rinv32           = gmx_fjsp_invsqrt_v2r8(rsq32);
+             rinv33           = gmx_fjsp_invsqrt_v2r8(rsq33);
+             rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+             rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+             rinvsq13         = _fjsp_mul_v2r8(rinv13,rinv13);
+             rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+             rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+             rinvsq23         = _fjsp_mul_v2r8(rinv23,rinv23);
+             rinvsq31         = _fjsp_mul_v2r8(rinv31,rinv31);
+             rinvsq32         = _fjsp_mul_v2r8(rinv32,rinv32);
+             rinvsq33         = _fjsp_mul_v2r8(rinv33,rinv33);
+             fjx1             = _fjsp_setzero_v2r8();
+             fjy1             = _fjsp_setzero_v2r8();
+             fjz1             = _fjsp_setzero_v2r8();
+             fjx2             = _fjsp_setzero_v2r8();
+             fjy2             = _fjsp_setzero_v2r8();
+             fjz2             = _fjsp_setzero_v2r8();
+             fjx3             = _fjsp_setzero_v2r8();
+             fjy3             = _fjsp_setzero_v2r8();
+             fjz3             = _fjsp_setzero_v2r8();
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* REACTION-FIELD ELECTROSTATICS */
+             felec            = _fjsp_mul_v2r8(qq11,_fjsp_msub_v2r8(rinv11,rinvsq11,krf2));
+             fscal            = felec;
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+             
+             fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* REACTION-FIELD ELECTROSTATICS */
+             felec            = _fjsp_mul_v2r8(qq12,_fjsp_msub_v2r8(rinv12,rinvsq12,krf2));
+             fscal            = felec;
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+             
+             fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* REACTION-FIELD ELECTROSTATICS */
+             felec            = _fjsp_mul_v2r8(qq13,_fjsp_msub_v2r8(rinv13,rinvsq13,krf2));
+             fscal            = felec;
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx13,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy13,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz13,fscal,fiz1);
+             
+             fjx3             = _fjsp_madd_v2r8(dx13,fscal,fjx3);
+             fjy3             = _fjsp_madd_v2r8(dy13,fscal,fjy3);
+             fjz3             = _fjsp_madd_v2r8(dz13,fscal,fjz3);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* REACTION-FIELD ELECTROSTATICS */
+             felec            = _fjsp_mul_v2r8(qq21,_fjsp_msub_v2r8(rinv21,rinvsq21,krf2));
+             fscal            = felec;
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+             
+             fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* REACTION-FIELD ELECTROSTATICS */
+             felec            = _fjsp_mul_v2r8(qq22,_fjsp_msub_v2r8(rinv22,rinvsq22,krf2));
+             fscal            = felec;
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+             
+             fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* REACTION-FIELD ELECTROSTATICS */
+             felec            = _fjsp_mul_v2r8(qq23,_fjsp_msub_v2r8(rinv23,rinvsq23,krf2));
+             fscal            = felec;
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx23,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy23,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz23,fscal,fiz2);
+             
+             fjx3             = _fjsp_madd_v2r8(dx23,fscal,fjx3);
+             fjy3             = _fjsp_madd_v2r8(dy23,fscal,fjy3);
+             fjz3             = _fjsp_madd_v2r8(dz23,fscal,fjz3);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* REACTION-FIELD ELECTROSTATICS */
+             felec            = _fjsp_mul_v2r8(qq31,_fjsp_msub_v2r8(rinv31,rinvsq31,krf2));
+             fscal            = felec;
+             /* Update vectorial force */
+             fix3             = _fjsp_madd_v2r8(dx31,fscal,fix3);
+             fiy3             = _fjsp_madd_v2r8(dy31,fscal,fiy3);
+             fiz3             = _fjsp_madd_v2r8(dz31,fscal,fiz3);
+             
+             fjx1             = _fjsp_madd_v2r8(dx31,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy31,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz31,fscal,fjz1);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* REACTION-FIELD ELECTROSTATICS */
+             felec            = _fjsp_mul_v2r8(qq32,_fjsp_msub_v2r8(rinv32,rinvsq32,krf2));
+             fscal            = felec;
+             /* Update vectorial force */
+             fix3             = _fjsp_madd_v2r8(dx32,fscal,fix3);
+             fiy3             = _fjsp_madd_v2r8(dy32,fscal,fiy3);
+             fiz3             = _fjsp_madd_v2r8(dz32,fscal,fiz3);
+             
+             fjx2             = _fjsp_madd_v2r8(dx32,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy32,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz32,fscal,fjz2);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* REACTION-FIELD ELECTROSTATICS */
+             felec            = _fjsp_mul_v2r8(qq33,_fjsp_msub_v2r8(rinv33,rinvsq33,krf2));
+             fscal            = felec;
+             /* Update vectorial force */
+             fix3             = _fjsp_madd_v2r8(dx33,fscal,fix3);
+             fiy3             = _fjsp_madd_v2r8(dy33,fscal,fiy3);
+             fiz3             = _fjsp_madd_v2r8(dz33,fscal,fiz3);
+             
+             fjx3             = _fjsp_madd_v2r8(dx33,fscal,fjx3);
+             fjy3             = _fjsp_madd_v2r8(dy33,fscal,fjy3);
+             fjz3             = _fjsp_madd_v2r8(dz33,fscal,fjz3);
+             gmx_fjsp_decrement_3rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA+DIM,f+j_coord_offsetB+DIM,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+             /* Inner loop uses 270 flops */
+         }
+         if(jidx<j_index_end)
+         {
+             jnrA             = jjnr[jidx];
+             j_coord_offsetA  = DIM*jnrA;
+             /* load j atom coordinates */
+             gmx_fjsp_load_3rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA+DIM,
+                                               &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
+             /* Calculate displacement vector */
+             dx11             = _fjsp_sub_v2r8(ix1,jx1);
+             dy11             = _fjsp_sub_v2r8(iy1,jy1);
+             dz11             = _fjsp_sub_v2r8(iz1,jz1);
+             dx12             = _fjsp_sub_v2r8(ix1,jx2);
+             dy12             = _fjsp_sub_v2r8(iy1,jy2);
+             dz12             = _fjsp_sub_v2r8(iz1,jz2);
+             dx13             = _fjsp_sub_v2r8(ix1,jx3);
+             dy13             = _fjsp_sub_v2r8(iy1,jy3);
+             dz13             = _fjsp_sub_v2r8(iz1,jz3);
+             dx21             = _fjsp_sub_v2r8(ix2,jx1);
+             dy21             = _fjsp_sub_v2r8(iy2,jy1);
+             dz21             = _fjsp_sub_v2r8(iz2,jz1);
+             dx22             = _fjsp_sub_v2r8(ix2,jx2);
+             dy22             = _fjsp_sub_v2r8(iy2,jy2);
+             dz22             = _fjsp_sub_v2r8(iz2,jz2);
+             dx23             = _fjsp_sub_v2r8(ix2,jx3);
+             dy23             = _fjsp_sub_v2r8(iy2,jy3);
+             dz23             = _fjsp_sub_v2r8(iz2,jz3);
+             dx31             = _fjsp_sub_v2r8(ix3,jx1);
+             dy31             = _fjsp_sub_v2r8(iy3,jy1);
+             dz31             = _fjsp_sub_v2r8(iz3,jz1);
+             dx32             = _fjsp_sub_v2r8(ix3,jx2);
+             dy32             = _fjsp_sub_v2r8(iy3,jy2);
+             dz32             = _fjsp_sub_v2r8(iz3,jz2);
+             dx33             = _fjsp_sub_v2r8(ix3,jx3);
+             dy33             = _fjsp_sub_v2r8(iy3,jy3);
+             dz33             = _fjsp_sub_v2r8(iz3,jz3);
+             /* Calculate squared distance and things based on it */
+             rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+             rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+             rsq13            = gmx_fjsp_calc_rsq_v2r8(dx13,dy13,dz13);
+             rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+             rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+             rsq23            = gmx_fjsp_calc_rsq_v2r8(dx23,dy23,dz23);
+             rsq31            = gmx_fjsp_calc_rsq_v2r8(dx31,dy31,dz31);
+             rsq32            = gmx_fjsp_calc_rsq_v2r8(dx32,dy32,dz32);
+             rsq33            = gmx_fjsp_calc_rsq_v2r8(dx33,dy33,dz33);
+             rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+             rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+             rinv13           = gmx_fjsp_invsqrt_v2r8(rsq13);
+             rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+             rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+             rinv23           = gmx_fjsp_invsqrt_v2r8(rsq23);
+             rinv31           = gmx_fjsp_invsqrt_v2r8(rsq31);
+             rinv32           = gmx_fjsp_invsqrt_v2r8(rsq32);
+             rinv33           = gmx_fjsp_invsqrt_v2r8(rsq33);
+             rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+             rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+             rinvsq13         = _fjsp_mul_v2r8(rinv13,rinv13);
+             rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+             rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+             rinvsq23         = _fjsp_mul_v2r8(rinv23,rinv23);
+             rinvsq31         = _fjsp_mul_v2r8(rinv31,rinv31);
+             rinvsq32         = _fjsp_mul_v2r8(rinv32,rinv32);
+             rinvsq33         = _fjsp_mul_v2r8(rinv33,rinv33);
+             fjx1             = _fjsp_setzero_v2r8();
+             fjy1             = _fjsp_setzero_v2r8();
+             fjz1             = _fjsp_setzero_v2r8();
+             fjx2             = _fjsp_setzero_v2r8();
+             fjy2             = _fjsp_setzero_v2r8();
+             fjz2             = _fjsp_setzero_v2r8();
+             fjx3             = _fjsp_setzero_v2r8();
+             fjy3             = _fjsp_setzero_v2r8();
+             fjz3             = _fjsp_setzero_v2r8();
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* REACTION-FIELD ELECTROSTATICS */
+             felec            = _fjsp_mul_v2r8(qq11,_fjsp_msub_v2r8(rinv11,rinvsq11,krf2));
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+             
+             fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* REACTION-FIELD ELECTROSTATICS */
+             felec            = _fjsp_mul_v2r8(qq12,_fjsp_msub_v2r8(rinv12,rinvsq12,krf2));
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+             
+             fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* REACTION-FIELD ELECTROSTATICS */
+             felec            = _fjsp_mul_v2r8(qq13,_fjsp_msub_v2r8(rinv13,rinvsq13,krf2));
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix1             = _fjsp_madd_v2r8(dx13,fscal,fix1);
+             fiy1             = _fjsp_madd_v2r8(dy13,fscal,fiy1);
+             fiz1             = _fjsp_madd_v2r8(dz13,fscal,fiz1);
+             
+             fjx3             = _fjsp_madd_v2r8(dx13,fscal,fjx3);
+             fjy3             = _fjsp_madd_v2r8(dy13,fscal,fjy3);
+             fjz3             = _fjsp_madd_v2r8(dz13,fscal,fjz3);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* REACTION-FIELD ELECTROSTATICS */
+             felec            = _fjsp_mul_v2r8(qq21,_fjsp_msub_v2r8(rinv21,rinvsq21,krf2));
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+             
+             fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* REACTION-FIELD ELECTROSTATICS */
+             felec            = _fjsp_mul_v2r8(qq22,_fjsp_msub_v2r8(rinv22,rinvsq22,krf2));
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+             
+             fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* REACTION-FIELD ELECTROSTATICS */
+             felec            = _fjsp_mul_v2r8(qq23,_fjsp_msub_v2r8(rinv23,rinvsq23,krf2));
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix2             = _fjsp_madd_v2r8(dx23,fscal,fix2);
+             fiy2             = _fjsp_madd_v2r8(dy23,fscal,fiy2);
+             fiz2             = _fjsp_madd_v2r8(dz23,fscal,fiz2);
+             
+             fjx3             = _fjsp_madd_v2r8(dx23,fscal,fjx3);
+             fjy3             = _fjsp_madd_v2r8(dy23,fscal,fjy3);
+             fjz3             = _fjsp_madd_v2r8(dz23,fscal,fjz3);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* REACTION-FIELD ELECTROSTATICS */
+             felec            = _fjsp_mul_v2r8(qq31,_fjsp_msub_v2r8(rinv31,rinvsq31,krf2));
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix3             = _fjsp_madd_v2r8(dx31,fscal,fix3);
+             fiy3             = _fjsp_madd_v2r8(dy31,fscal,fiy3);
+             fiz3             = _fjsp_madd_v2r8(dz31,fscal,fiz3);
+             
+             fjx1             = _fjsp_madd_v2r8(dx31,fscal,fjx1);
+             fjy1             = _fjsp_madd_v2r8(dy31,fscal,fjy1);
+             fjz1             = _fjsp_madd_v2r8(dz31,fscal,fjz1);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* REACTION-FIELD ELECTROSTATICS */
+             felec            = _fjsp_mul_v2r8(qq32,_fjsp_msub_v2r8(rinv32,rinvsq32,krf2));
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix3             = _fjsp_madd_v2r8(dx32,fscal,fix3);
+             fiy3             = _fjsp_madd_v2r8(dy32,fscal,fiy3);
+             fiz3             = _fjsp_madd_v2r8(dz32,fscal,fiz3);
+             
+             fjx2             = _fjsp_madd_v2r8(dx32,fscal,fjx2);
+             fjy2             = _fjsp_madd_v2r8(dy32,fscal,fjy2);
+             fjz2             = _fjsp_madd_v2r8(dz32,fscal,fjz2);
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /* REACTION-FIELD ELECTROSTATICS */
+             felec            = _fjsp_mul_v2r8(qq33,_fjsp_msub_v2r8(rinv33,rinvsq33,krf2));
+             fscal            = felec;
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /* Update vectorial force */
+             fix3             = _fjsp_madd_v2r8(dx33,fscal,fix3);
+             fiy3             = _fjsp_madd_v2r8(dy33,fscal,fiy3);
+             fiz3             = _fjsp_madd_v2r8(dz33,fscal,fiz3);
+             
+             fjx3             = _fjsp_madd_v2r8(dx33,fscal,fjx3);
+             fjy3             = _fjsp_madd_v2r8(dy33,fscal,fjy3);
+             fjz3             = _fjsp_madd_v2r8(dz33,fscal,fjz3);
+             gmx_fjsp_decrement_3rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA+DIM,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+             /* Inner loop uses 270 flops */
+         }
+         /* End of innermost loop */
+         gmx_fjsp_update_iforce_3atom_swizzle_v2r8(fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                               f+i_coord_offset+DIM,fshift+i_shift_offset);
+         /* Increment number of inner iterations */
+         inneriter                  += j_index_end - j_index_start;
+         /* Outer loop uses 18 flops */
+     }
+     /* Increment number of outer iterations */
+     outeriter        += nri;
+     /* Update outer/inner flops */
+     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4W4_F,outeriter*18 + inneriter*270);
+ }
index 0000000000000000000000000000000000000000,5e07e4b341cebe7dc6e72e5d109e2159371ae9ae..5e07e4b341cebe7dc6e72e5d109e2159371ae9ae
mode 000000,100644..100644
--- /dev/null
@@@ -1,0 -1,481 +1,481 @@@
+ /*
+  * This file is part of the GROMACS molecular simulation package.
+  *
+  * Copyright (c) 2012, by the GROMACS development team, led by
+  * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+  * others, as listed in the AUTHORS file in the top-level source
+  * directory and at http://www.gromacs.org.
+  *
+  * GROMACS is free software; you can redistribute it and/or
+  * modify it under the terms of the GNU Lesser General Public License
+  * as published by the Free Software Foundation; either version 2.1
+  * of the License, or (at your option) any later version.
+  *
+  * GROMACS is distributed in the hope that it will be useful,
+  * but WITHOUT ANY WARRANTY; without even the implied warranty of
+  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  * Lesser General Public License for more details.
+  *
+  * You should have received a copy of the GNU Lesser General Public
+  * License along with GROMACS; if not, see
+  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+  *
+  * If you want to redistribute modifications to GROMACS, please
+  * consider that scientific software is very special. Version
+  * control is crucial - bugs must be traceable. We will be happy to
+  * consider code for inclusion in the official distribution, but
+  * derived work must not be called official GROMACS. Details are found
+  * in the README & COPYING files - if they are missing, get the
+  * official version at http://www.gromacs.org.
+  *
+  * To help us fund GROMACS development, we humbly ask that you cite
+  * the research papers on the package. Check out http://www.gromacs.org.
+  */
+ /*
+  * Note: this file was generated by the GROMACS sparc64_hpc_ace_double kernel generator.
+  */
+ #ifndef nb_kernel_sparc64_hpc_ace_double_h
+ #define nb_kernel_sparc64_hpc_ace_double_h
+ #include "../nb_kernel.h"
+ nb_kernel_t nb_kernel_ElecNone_VdwLJ_GeomP1P1_VF_sparc64_hpc_ace_double; /* ElecNone_* group (GeomP1P1 only); kernellist below tags this entry "LennardJones", "ParticleParticle", "PotentialAndForce" */
+ nb_kernel_t nb_kernel_ElecNone_VdwLJ_GeomP1P1_F_sparc64_hpc_ace_double; /* _F counterpart: tagged "Force" in kernellist */
+ nb_kernel_t nb_kernel_ElecNone_VdwLJSh_GeomP1P1_VF_sparc64_hpc_ace_double; /* VdwLJSh: "LennardJones" with "PotentialShift" in kernellist */
+ nb_kernel_t nb_kernel_ElecNone_VdwLJSh_GeomP1P1_F_sparc64_hpc_ace_double;
+ nb_kernel_t nb_kernel_ElecNone_VdwLJSw_GeomP1P1_VF_sparc64_hpc_ace_double; /* VdwLJSw: "LennardJones" with "PotentialSwitch" in kernellist */
+ nb_kernel_t nb_kernel_ElecNone_VdwLJSw_GeomP1P1_F_sparc64_hpc_ace_double;
+ nb_kernel_t nb_kernel_ElecNone_VdwCSTab_GeomP1P1_VF_sparc64_hpc_ace_double; /* VdwCSTab variant, again as a _VF/_F pair */
+ nb_kernel_t nb_kernel_ElecNone_VdwCSTab_GeomP1P1_F_sparc64_hpc_ace_double;
+ nb_kernel_t nb_kernel_ElecEw_VdwLJ_GeomP1P1_VF_sparc64_hpc_ace_double; /* ElecEw_* group, VdwLJ: geometries P1P1, W3P1, W3W3, W4P1, W4W4, each declared as a _VF/_F pair (Ew presumably = Ewald; confirm against the kernel generator) */
+ nb_kernel_t nb_kernel_ElecEw_VdwLJ_GeomP1P1_F_sparc64_hpc_ace_double;
+ nb_kernel_t nb_kernel_ElecEw_VdwLJ_GeomW3P1_VF_sparc64_hpc_ace_double;
+ nb_kernel_t nb_kernel_ElecEw_VdwLJ_GeomW3P1_F_sparc64_hpc_ace_double;
+ nb_kernel_t nb_kernel_ElecEw_VdwLJ_GeomW3W3_VF_sparc64_hpc_ace_double;
+ nb_kernel_t nb_kernel_ElecEw_VdwLJ_GeomW3W3_F_sparc64_hpc_ace_double;
+ nb_kernel_t nb_kernel_ElecEw_VdwLJ_GeomW4P1_VF_sparc64_hpc_ace_double;
+ nb_kernel_t nb_kernel_ElecEw_VdwLJ_GeomW4P1_F_sparc64_hpc_ace_double;
+ nb_kernel_t nb_kernel_ElecEw_VdwLJ_GeomW4W4_VF_sparc64_hpc_ace_double;
+ nb_kernel_t nb_kernel_ElecEw_VdwLJ_GeomW4W4_F_sparc64_hpc_ace_double;
+ nb_kernel_t nb_kernel_ElecEw_VdwNone_GeomP1P1_VF_sparc64_hpc_ace_double; /* ElecEw with VdwNone: same five geometries, _VF/_F pairs */
+ nb_kernel_t nb_kernel_ElecEw_VdwNone_GeomP1P1_F_sparc64_hpc_ace_double;
+ nb_kernel_t nb_kernel_ElecEw_VdwNone_GeomW3P1_VF_sparc64_hpc_ace_double;
+ nb_kernel_t nb_kernel_ElecEw_VdwNone_GeomW3P1_F_sparc64_hpc_ace_double;
+ nb_kernel_t nb_kernel_ElecEw_VdwNone_GeomW3W3_VF_sparc64_hpc_ace_double;
+ nb_kernel_t nb_kernel_ElecEw_VdwNone_GeomW3W3_F_sparc64_hpc_ace_double;
+ nb_kernel_t nb_kernel_ElecEw_VdwNone_GeomW4P1_VF_sparc64_hpc_ace_double;
+ nb_kernel_t nb_kernel_ElecEw_VdwNone_GeomW4P1_F_sparc64_hpc_ace_double;
+ nb_kernel_t nb_kernel_ElecEw_VdwNone_GeomW4W4_VF_sparc64_hpc_ace_double;
+ nb_kernel_t nb_kernel_ElecEw_VdwNone_GeomW4W4_F_sparc64_hpc_ace_double;
+ nb_kernel_t nb_kernel_ElecEw_VdwCSTab_GeomP1P1_VF_sparc64_hpc_ace_double; /* ElecEw with VdwCSTab: same five geometries, _VF/_F pairs */
+ nb_kernel_t nb_kernel_ElecEw_VdwCSTab_GeomP1P1_F_sparc64_hpc_ace_double;
+ nb_kernel_t nb_kernel_ElecEw_VdwCSTab_GeomW3P1_VF_sparc64_hpc_ace_double;
+ nb_kernel_t nb_kernel_ElecEw_VdwCSTab_GeomW3P1_F_sparc64_hpc_ace_double;
+ nb_kernel_t nb_kernel_ElecEw_VdwCSTab_GeomW3W3_VF_sparc64_hpc_ace_double;
+ nb_kernel_t nb_kernel_ElecEw_VdwCSTab_GeomW3W3_F_sparc64_hpc_ace_double;
+ nb_kernel_t nb_kernel_ElecEw_VdwCSTab_GeomW4P1_VF_sparc64_hpc_ace_double;
+ nb_kernel_t nb_kernel_ElecEw_VdwCSTab_GeomW4P1_F_sparc64_hpc_ace_double;
+ nb_kernel_t nb_kernel_ElecEw_VdwCSTab_GeomW4W4_VF_sparc64_hpc_ace_double;
+ nb_kernel_t nb_kernel_ElecEw_VdwCSTab_GeomW4W4_F_sparc64_hpc_ace_double;
+ nb_kernel_t nb_kernel_ElecEwSh_VdwLJSh_GeomP1P1_VF_sparc64_hpc_ace_double; /* ElecEwSh_* group, VdwLJSh: geometries P1P1, W3P1, W3W3, W4P1, W4W4, each as a _VF/_F pair (Sh presumably = the "PotentialShift" modifier that kernellist uses for VdwLJSh; confirm for the Elec part) */
+ nb_kernel_t nb_kernel_ElecEwSh_VdwLJSh_GeomP1P1_F_sparc64_hpc_ace_double;
+ nb_kernel_t nb_kernel_ElecEwSh_VdwLJSh_GeomW3P1_VF_sparc64_hpc_ace_double;
+ nb_kernel_t nb_kernel_ElecEwSh_VdwLJSh_GeomW3P1_F_sparc64_hpc_ace_double;
+ nb_kernel_t nb_kernel_ElecEwSh_VdwLJSh_GeomW3W3_VF_sparc64_hpc_ace_double;
+ nb_kernel_t nb_kernel_ElecEwSh_VdwLJSh_GeomW3W3_F_sparc64_hpc_ace_double;
+ nb_kernel_t nb_kernel_ElecEwSh_VdwLJSh_GeomW4P1_VF_sparc64_hpc_ace_double;
+ nb_kernel_t nb_kernel_ElecEwSh_VdwLJSh_GeomW4P1_F_sparc64_hpc_ace_double;
+ nb_kernel_t nb_kernel_ElecEwSh_VdwLJSh_GeomW4W4_VF_sparc64_hpc_ace_double;
+ nb_kernel_t nb_kernel_ElecEwSh_VdwLJSh_GeomW4W4_F_sparc64_hpc_ace_double;
+ nb_kernel_t nb_kernel_ElecEwSh_VdwNone_GeomP1P1_VF_sparc64_hpc_ace_double; /* ElecEwSh with VdwNone: same five geometries, _VF/_F pairs */
+ nb_kernel_t nb_kernel_ElecEwSh_VdwNone_GeomP1P1_F_sparc64_hpc_ace_double;
+ nb_kernel_t nb_kernel_ElecEwSh_VdwNone_GeomW3P1_VF_sparc64_hpc_ace_double;
+ nb_kernel_t nb_kernel_ElecEwSh_VdwNone_GeomW3P1_F_sparc64_hpc_ace_double;
+ nb_kernel_t nb_kernel_ElecEwSh_VdwNone_GeomW3W3_VF_sparc64_hpc_ace_double;
+ nb_kernel_t nb_kernel_ElecEwSh_VdwNone_GeomW3W3_F_sparc64_hpc_ace_double;
+ nb_kernel_t nb_kernel_ElecEwSh_VdwNone_GeomW4P1_VF_sparc64_hpc_ace_double;
+ nb_kernel_t nb_kernel_ElecEwSh_VdwNone_GeomW4P1_F_sparc64_hpc_ace_double;
+ nb_kernel_t nb_kernel_ElecEwSh_VdwNone_GeomW4W4_VF_sparc64_hpc_ace_double;
+ nb_kernel_t nb_kernel_ElecEwSh_VdwNone_GeomW4W4_F_sparc64_hpc_ace_double;
+ nb_kernel_t nb_kernel_ElecEwSw_VdwLJSw_GeomP1P1_VF_sparc64_hpc_ace_double; /* ElecEwSw_* group, VdwLJSw: geometries P1P1, W3P1, W3W3, W4P1, W4W4, each as a _VF/_F pair (Sw presumably = the "PotentialSwitch" modifier that kernellist uses for VdwLJSw; confirm for the Elec part) */
+ nb_kernel_t nb_kernel_ElecEwSw_VdwLJSw_GeomP1P1_F_sparc64_hpc_ace_double;
+ nb_kernel_t nb_kernel_ElecEwSw_VdwLJSw_GeomW3P1_VF_sparc64_hpc_ace_double;
+ nb_kernel_t nb_kernel_ElecEwSw_VdwLJSw_GeomW3P1_F_sparc64_hpc_ace_double;
+ nb_kernel_t nb_kernel_ElecEwSw_VdwLJSw_GeomW3W3_VF_sparc64_hpc_ace_double;
+ nb_kernel_t nb_kernel_ElecEwSw_VdwLJSw_GeomW3W3_F_sparc64_hpc_ace_double;
+ nb_kernel_t nb_kernel_ElecEwSw_VdwLJSw_GeomW4P1_VF_sparc64_hpc_ace_double;
+ nb_kernel_t nb_kernel_ElecEwSw_VdwLJSw_GeomW4P1_F_sparc64_hpc_ace_double;
+ nb_kernel_t nb_kernel_ElecEwSw_VdwLJSw_GeomW4W4_VF_sparc64_hpc_ace_double;
+ nb_kernel_t nb_kernel_ElecEwSw_VdwLJSw_GeomW4W4_F_sparc64_hpc_ace_double;
+ nb_kernel_t nb_kernel_ElecEwSw_VdwNone_GeomP1P1_VF_sparc64_hpc_ace_double; /* ElecEwSw with VdwNone: same five geometries, _VF/_F pairs */
+ nb_kernel_t nb_kernel_ElecEwSw_VdwNone_GeomP1P1_F_sparc64_hpc_ace_double;
+ nb_kernel_t nb_kernel_ElecEwSw_VdwNone_GeomW3P1_VF_sparc64_hpc_ace_double;
+ nb_kernel_t nb_kernel_ElecEwSw_VdwNone_GeomW3P1_F_sparc64_hpc_ace_double;
+ nb_kernel_t nb_kernel_ElecEwSw_VdwNone_GeomW3W3_VF_sparc64_hpc_ace_double;
+ nb_kernel_t nb_kernel_ElecEwSw_VdwNone_GeomW3W3_F_sparc64_hpc_ace_double;
+ nb_kernel_t nb_kernel_ElecEwSw_VdwNone_GeomW4P1_VF_sparc64_hpc_ace_double;
+ nb_kernel_t nb_kernel_ElecEwSw_VdwNone_GeomW4P1_F_sparc64_hpc_ace_double;
+ nb_kernel_t nb_kernel_ElecEwSw_VdwNone_GeomW4W4_VF_sparc64_hpc_ace_double;
+ nb_kernel_t nb_kernel_ElecEwSw_VdwNone_GeomW4W4_F_sparc64_hpc_ace_double;
+ nb_kernel_t nb_kernel_ElecCoul_VdwLJ_GeomP1P1_VF_sparc64_hpc_ace_double; /* ElecCoul_* group, VdwLJ: geometries P1P1, W3P1, W3W3, W4P1, W4W4, each declared as a _VF/_F pair */
+ nb_kernel_t nb_kernel_ElecCoul_VdwLJ_GeomP1P1_F_sparc64_hpc_ace_double;
+ nb_kernel_t nb_kernel_ElecCoul_VdwLJ_GeomW3P1_VF_sparc64_hpc_ace_double;
+ nb_kernel_t nb_kernel_ElecCoul_VdwLJ_GeomW3P1_F_sparc64_hpc_ace_double;
+ nb_kernel_t nb_kernel_ElecCoul_VdwLJ_GeomW3W3_VF_sparc64_hpc_ace_double;
+ nb_kernel_t nb_kernel_ElecCoul_VdwLJ_GeomW3W3_F_sparc64_hpc_ace_double;
+ nb_kernel_t nb_kernel_ElecCoul_VdwLJ_GeomW4P1_VF_sparc64_hpc_ace_double;
+ nb_kernel_t nb_kernel_ElecCoul_VdwLJ_GeomW4P1_F_sparc64_hpc_ace_double;
+ nb_kernel_t nb_kernel_ElecCoul_VdwLJ_GeomW4W4_VF_sparc64_hpc_ace_double;
+ nb_kernel_t nb_kernel_ElecCoul_VdwLJ_GeomW4W4_F_sparc64_hpc_ace_double;
+ nb_kernel_t nb_kernel_ElecCoul_VdwNone_GeomP1P1_VF_sparc64_hpc_ace_double; /* ElecCoul with VdwNone: same five geometries, _VF/_F pairs */
+ nb_kernel_t nb_kernel_ElecCoul_VdwNone_GeomP1P1_F_sparc64_hpc_ace_double;
+ nb_kernel_t nb_kernel_ElecCoul_VdwNone_GeomW3P1_VF_sparc64_hpc_ace_double;
+ nb_kernel_t nb_kernel_ElecCoul_VdwNone_GeomW3P1_F_sparc64_hpc_ace_double;
+ nb_kernel_t nb_kernel_ElecCoul_VdwNone_GeomW3W3_VF_sparc64_hpc_ace_double;
+ nb_kernel_t nb_kernel_ElecCoul_VdwNone_GeomW3W3_F_sparc64_hpc_ace_double;
+ nb_kernel_t nb_kernel_ElecCoul_VdwNone_GeomW4P1_VF_sparc64_hpc_ace_double;
+ nb_kernel_t nb_kernel_ElecCoul_VdwNone_GeomW4P1_F_sparc64_hpc_ace_double;
+ nb_kernel_t nb_kernel_ElecCoul_VdwNone_GeomW4W4_VF_sparc64_hpc_ace_double;
+ nb_kernel_t nb_kernel_ElecCoul_VdwNone_GeomW4W4_F_sparc64_hpc_ace_double;
+ nb_kernel_t nb_kernel_ElecCoul_VdwCSTab_GeomP1P1_VF_sparc64_hpc_ace_double; /* ElecCoul with VdwCSTab: same five geometries, _VF/_F pairs */
+ nb_kernel_t nb_kernel_ElecCoul_VdwCSTab_GeomP1P1_F_sparc64_hpc_ace_double;
+ nb_kernel_t nb_kernel_ElecCoul_VdwCSTab_GeomW3P1_VF_sparc64_hpc_ace_double;
+ nb_kernel_t nb_kernel_ElecCoul_VdwCSTab_GeomW3P1_F_sparc64_hpc_ace_double;
+ nb_kernel_t nb_kernel_ElecCoul_VdwCSTab_GeomW3W3_VF_sparc64_hpc_ace_double;
+ nb_kernel_t nb_kernel_ElecCoul_VdwCSTab_GeomW3W3_F_sparc64_hpc_ace_double;
+ nb_kernel_t nb_kernel_ElecCoul_VdwCSTab_GeomW4P1_VF_sparc64_hpc_ace_double;
+ nb_kernel_t nb_kernel_ElecCoul_VdwCSTab_GeomW4P1_F_sparc64_hpc_ace_double;
+ nb_kernel_t nb_kernel_ElecCoul_VdwCSTab_GeomW4W4_VF_sparc64_hpc_ace_double;
+ nb_kernel_t nb_kernel_ElecCoul_VdwCSTab_GeomW4W4_F_sparc64_hpc_ace_double;
+ nb_kernel_t nb_kernel_ElecCSTab_VdwLJ_GeomP1P1_VF_sparc64_hpc_ace_double; /* ElecCSTab_* group, VdwLJ: geometries P1P1, W3P1, W3W3, W4P1, W4W4, each declared as a _VF/_F pair */
+ nb_kernel_t nb_kernel_ElecCSTab_VdwLJ_GeomP1P1_F_sparc64_hpc_ace_double;
+ nb_kernel_t nb_kernel_ElecCSTab_VdwLJ_GeomW3P1_VF_sparc64_hpc_ace_double;
+ nb_kernel_t nb_kernel_ElecCSTab_VdwLJ_GeomW3P1_F_sparc64_hpc_ace_double;
+ nb_kernel_t nb_kernel_ElecCSTab_VdwLJ_GeomW3W3_VF_sparc64_hpc_ace_double;
+ nb_kernel_t nb_kernel_ElecCSTab_VdwLJ_GeomW3W3_F_sparc64_hpc_ace_double;
+ nb_kernel_t nb_kernel_ElecCSTab_VdwLJ_GeomW4P1_VF_sparc64_hpc_ace_double;
+ nb_kernel_t nb_kernel_ElecCSTab_VdwLJ_GeomW4P1_F_sparc64_hpc_ace_double;
+ nb_kernel_t nb_kernel_ElecCSTab_VdwLJ_GeomW4W4_VF_sparc64_hpc_ace_double;
+ nb_kernel_t nb_kernel_ElecCSTab_VdwLJ_GeomW4W4_F_sparc64_hpc_ace_double;
+ nb_kernel_t nb_kernel_ElecCSTab_VdwNone_GeomP1P1_VF_sparc64_hpc_ace_double; /* ElecCSTab with VdwNone: same five geometries, _VF/_F pairs */
+ nb_kernel_t nb_kernel_ElecCSTab_VdwNone_GeomP1P1_F_sparc64_hpc_ace_double;
+ nb_kernel_t nb_kernel_ElecCSTab_VdwNone_GeomW3P1_VF_sparc64_hpc_ace_double;
+ nb_kernel_t nb_kernel_ElecCSTab_VdwNone_GeomW3P1_F_sparc64_hpc_ace_double;
+ nb_kernel_t nb_kernel_ElecCSTab_VdwNone_GeomW3W3_VF_sparc64_hpc_ace_double;
+ nb_kernel_t nb_kernel_ElecCSTab_VdwNone_GeomW3W3_F_sparc64_hpc_ace_double;
+ nb_kernel_t nb_kernel_ElecCSTab_VdwNone_GeomW4P1_VF_sparc64_hpc_ace_double;
+ nb_kernel_t nb_kernel_ElecCSTab_VdwNone_GeomW4P1_F_sparc64_hpc_ace_double;
+ nb_kernel_t nb_kernel_ElecCSTab_VdwNone_GeomW4W4_VF_sparc64_hpc_ace_double;
+ nb_kernel_t nb_kernel_ElecCSTab_VdwNone_GeomW4W4_F_sparc64_hpc_ace_double;
+ nb_kernel_t nb_kernel_ElecCSTab_VdwCSTab_GeomP1P1_VF_sparc64_hpc_ace_double; /* ElecCSTab with VdwCSTab: same five geometries, _VF/_F pairs */
+ nb_kernel_t nb_kernel_ElecCSTab_VdwCSTab_GeomP1P1_F_sparc64_hpc_ace_double;
+ nb_kernel_t nb_kernel_ElecCSTab_VdwCSTab_GeomW3P1_VF_sparc64_hpc_ace_double;
+ nb_kernel_t nb_kernel_ElecCSTab_VdwCSTab_GeomW3P1_F_sparc64_hpc_ace_double;
+ nb_kernel_t nb_kernel_ElecCSTab_VdwCSTab_GeomW3W3_VF_sparc64_hpc_ace_double;
+ nb_kernel_t nb_kernel_ElecCSTab_VdwCSTab_GeomW3W3_F_sparc64_hpc_ace_double;
+ nb_kernel_t nb_kernel_ElecCSTab_VdwCSTab_GeomW4P1_VF_sparc64_hpc_ace_double;
+ nb_kernel_t nb_kernel_ElecCSTab_VdwCSTab_GeomW4P1_F_sparc64_hpc_ace_double;
+ nb_kernel_t nb_kernel_ElecCSTab_VdwCSTab_GeomW4W4_VF_sparc64_hpc_ace_double;
+ nb_kernel_t nb_kernel_ElecCSTab_VdwCSTab_GeomW4W4_F_sparc64_hpc_ace_double;
+ nb_kernel_t nb_kernel_ElecGB_VdwLJ_GeomP1P1_VF_sparc64_hpc_ace_double; /* ElecGB_* group: only GeomP1P1 is declared, with Vdw variants LJ, None and CSTab, each as a _VF/_F pair */
+ nb_kernel_t nb_kernel_ElecGB_VdwLJ_GeomP1P1_F_sparc64_hpc_ace_double;
+ nb_kernel_t nb_kernel_ElecGB_VdwNone_GeomP1P1_VF_sparc64_hpc_ace_double;
+ nb_kernel_t nb_kernel_ElecGB_VdwNone_GeomP1P1_F_sparc64_hpc_ace_double;
+ nb_kernel_t nb_kernel_ElecGB_VdwCSTab_GeomP1P1_VF_sparc64_hpc_ace_double;
+ nb_kernel_t nb_kernel_ElecGB_VdwCSTab_GeomP1P1_F_sparc64_hpc_ace_double;
+ nb_kernel_t nb_kernel_ElecRFCut_VdwLJSh_GeomP1P1_VF_sparc64_hpc_ace_double; /* ElecRFCut_* group, VdwLJSh: geometries P1P1, W3P1, W3W3, W4P1, W4W4, each declared as a _VF/_F pair */
+ nb_kernel_t nb_kernel_ElecRFCut_VdwLJSh_GeomP1P1_F_sparc64_hpc_ace_double;
+ nb_kernel_t nb_kernel_ElecRFCut_VdwLJSh_GeomW3P1_VF_sparc64_hpc_ace_double;
+ nb_kernel_t nb_kernel_ElecRFCut_VdwLJSh_GeomW3P1_F_sparc64_hpc_ace_double;
+ nb_kernel_t nb_kernel_ElecRFCut_VdwLJSh_GeomW3W3_VF_sparc64_hpc_ace_double;
+ nb_kernel_t nb_kernel_ElecRFCut_VdwLJSh_GeomW3W3_F_sparc64_hpc_ace_double;
+ nb_kernel_t nb_kernel_ElecRFCut_VdwLJSh_GeomW4P1_VF_sparc64_hpc_ace_double;
+ nb_kernel_t nb_kernel_ElecRFCut_VdwLJSh_GeomW4P1_F_sparc64_hpc_ace_double;
+ nb_kernel_t nb_kernel_ElecRFCut_VdwLJSh_GeomW4W4_VF_sparc64_hpc_ace_double;
+ nb_kernel_t nb_kernel_ElecRFCut_VdwLJSh_GeomW4W4_F_sparc64_hpc_ace_double;
+ nb_kernel_t nb_kernel_ElecRFCut_VdwLJSw_GeomP1P1_VF_sparc64_hpc_ace_double; /* ElecRFCut with VdwLJSw: same five geometries, _VF/_F pairs */
+ nb_kernel_t nb_kernel_ElecRFCut_VdwLJSw_GeomP1P1_F_sparc64_hpc_ace_double;
+ nb_kernel_t nb_kernel_ElecRFCut_VdwLJSw_GeomW3P1_VF_sparc64_hpc_ace_double;
+ nb_kernel_t nb_kernel_ElecRFCut_VdwLJSw_GeomW3P1_F_sparc64_hpc_ace_double;
+ nb_kernel_t nb_kernel_ElecRFCut_VdwLJSw_GeomW3W3_VF_sparc64_hpc_ace_double;
+ nb_kernel_t nb_kernel_ElecRFCut_VdwLJSw_GeomW3W3_F_sparc64_hpc_ace_double;
+ nb_kernel_t nb_kernel_ElecRFCut_VdwLJSw_GeomW4P1_VF_sparc64_hpc_ace_double;
+ nb_kernel_t nb_kernel_ElecRFCut_VdwLJSw_GeomW4P1_F_sparc64_hpc_ace_double;
+ nb_kernel_t nb_kernel_ElecRFCut_VdwLJSw_GeomW4W4_VF_sparc64_hpc_ace_double;
+ nb_kernel_t nb_kernel_ElecRFCut_VdwLJSw_GeomW4W4_F_sparc64_hpc_ace_double;
+ nb_kernel_t nb_kernel_ElecRFCut_VdwNone_GeomP1P1_VF_sparc64_hpc_ace_double; /* ElecRFCut with VdwNone: same five geometries, _VF/_F pairs */
+ nb_kernel_t nb_kernel_ElecRFCut_VdwNone_GeomP1P1_F_sparc64_hpc_ace_double;
+ nb_kernel_t nb_kernel_ElecRFCut_VdwNone_GeomW3P1_VF_sparc64_hpc_ace_double;
+ nb_kernel_t nb_kernel_ElecRFCut_VdwNone_GeomW3P1_F_sparc64_hpc_ace_double;
+ nb_kernel_t nb_kernel_ElecRFCut_VdwNone_GeomW3W3_VF_sparc64_hpc_ace_double;
+ nb_kernel_t nb_kernel_ElecRFCut_VdwNone_GeomW3W3_F_sparc64_hpc_ace_double;
+ nb_kernel_t nb_kernel_ElecRFCut_VdwNone_GeomW4P1_VF_sparc64_hpc_ace_double;
+ nb_kernel_t nb_kernel_ElecRFCut_VdwNone_GeomW4P1_F_sparc64_hpc_ace_double;
+ nb_kernel_t nb_kernel_ElecRFCut_VdwNone_GeomW4W4_VF_sparc64_hpc_ace_double;
+ nb_kernel_t nb_kernel_ElecRFCut_VdwNone_GeomW4W4_F_sparc64_hpc_ace_double;
+ nb_kernel_t nb_kernel_ElecRFCut_VdwCSTab_GeomP1P1_VF_sparc64_hpc_ace_double; /* ElecRFCut with VdwCSTab: same five geometries, _VF/_F pairs */
+ nb_kernel_t nb_kernel_ElecRFCut_VdwCSTab_GeomP1P1_F_sparc64_hpc_ace_double;
+ nb_kernel_t nb_kernel_ElecRFCut_VdwCSTab_GeomW3P1_VF_sparc64_hpc_ace_double;
+ nb_kernel_t nb_kernel_ElecRFCut_VdwCSTab_GeomW3P1_F_sparc64_hpc_ace_double;
+ nb_kernel_t nb_kernel_ElecRFCut_VdwCSTab_GeomW3W3_VF_sparc64_hpc_ace_double;
+ nb_kernel_t nb_kernel_ElecRFCut_VdwCSTab_GeomW3W3_F_sparc64_hpc_ace_double;
+ nb_kernel_t nb_kernel_ElecRFCut_VdwCSTab_GeomW4P1_VF_sparc64_hpc_ace_double;
+ nb_kernel_t nb_kernel_ElecRFCut_VdwCSTab_GeomW4P1_F_sparc64_hpc_ace_double;
+ nb_kernel_t nb_kernel_ElecRFCut_VdwCSTab_GeomW4W4_VF_sparc64_hpc_ace_double;
+ nb_kernel_t nb_kernel_ElecRFCut_VdwCSTab_GeomW4W4_F_sparc64_hpc_ace_double;
+ nb_kernel_t nb_kernel_ElecRF_VdwLJ_GeomP1P1_VF_sparc64_hpc_ace_double; /* ElecRF_* group, VdwLJ: geometries P1P1, W3P1, W3W3, W4P1, W4W4, each declared as a _VF/_F pair (RF presumably = reaction-field; confirm against the kernel generator) */
+ nb_kernel_t nb_kernel_ElecRF_VdwLJ_GeomP1P1_F_sparc64_hpc_ace_double;
+ nb_kernel_t nb_kernel_ElecRF_VdwLJ_GeomW3P1_VF_sparc64_hpc_ace_double;
+ nb_kernel_t nb_kernel_ElecRF_VdwLJ_GeomW3P1_F_sparc64_hpc_ace_double;
+ nb_kernel_t nb_kernel_ElecRF_VdwLJ_GeomW3W3_VF_sparc64_hpc_ace_double;
+ nb_kernel_t nb_kernel_ElecRF_VdwLJ_GeomW3W3_F_sparc64_hpc_ace_double;
+ nb_kernel_t nb_kernel_ElecRF_VdwLJ_GeomW4P1_VF_sparc64_hpc_ace_double;
+ nb_kernel_t nb_kernel_ElecRF_VdwLJ_GeomW4P1_F_sparc64_hpc_ace_double;
+ nb_kernel_t nb_kernel_ElecRF_VdwLJ_GeomW4W4_VF_sparc64_hpc_ace_double;
+ nb_kernel_t nb_kernel_ElecRF_VdwLJ_GeomW4W4_F_sparc64_hpc_ace_double;
+ nb_kernel_t nb_kernel_ElecRF_VdwNone_GeomP1P1_VF_sparc64_hpc_ace_double; /* ElecRF with VdwNone: same five geometries, _VF/_F pairs */
+ nb_kernel_t nb_kernel_ElecRF_VdwNone_GeomP1P1_F_sparc64_hpc_ace_double;
+ nb_kernel_t nb_kernel_ElecRF_VdwNone_GeomW3P1_VF_sparc64_hpc_ace_double;
+ nb_kernel_t nb_kernel_ElecRF_VdwNone_GeomW3P1_F_sparc64_hpc_ace_double;
+ nb_kernel_t nb_kernel_ElecRF_VdwNone_GeomW3W3_VF_sparc64_hpc_ace_double;
+ nb_kernel_t nb_kernel_ElecRF_VdwNone_GeomW3W3_F_sparc64_hpc_ace_double;
+ nb_kernel_t nb_kernel_ElecRF_VdwNone_GeomW4P1_VF_sparc64_hpc_ace_double;
+ nb_kernel_t nb_kernel_ElecRF_VdwNone_GeomW4P1_F_sparc64_hpc_ace_double;
+ nb_kernel_t nb_kernel_ElecRF_VdwNone_GeomW4W4_VF_sparc64_hpc_ace_double;
+ nb_kernel_t nb_kernel_ElecRF_VdwNone_GeomW4W4_F_sparc64_hpc_ace_double;
+ nb_kernel_t nb_kernel_ElecRF_VdwCSTab_GeomP1P1_VF_sparc64_hpc_ace_double; /* ElecRF with VdwCSTab: same five geometries, _VF/_F pairs */
+ nb_kernel_t nb_kernel_ElecRF_VdwCSTab_GeomP1P1_F_sparc64_hpc_ace_double;
+ nb_kernel_t nb_kernel_ElecRF_VdwCSTab_GeomW3P1_VF_sparc64_hpc_ace_double;
+ nb_kernel_t nb_kernel_ElecRF_VdwCSTab_GeomW3P1_F_sparc64_hpc_ace_double;
+ nb_kernel_t nb_kernel_ElecRF_VdwCSTab_GeomW3W3_VF_sparc64_hpc_ace_double;
+ nb_kernel_t nb_kernel_ElecRF_VdwCSTab_GeomW3W3_F_sparc64_hpc_ace_double;
+ nb_kernel_t nb_kernel_ElecRF_VdwCSTab_GeomW4P1_VF_sparc64_hpc_ace_double;
+ nb_kernel_t nb_kernel_ElecRF_VdwCSTab_GeomW4P1_F_sparc64_hpc_ace_double;
+ nb_kernel_t nb_kernel_ElecRF_VdwCSTab_GeomW4W4_VF_sparc64_hpc_ace_double;
+ nb_kernel_t nb_kernel_ElecRF_VdwCSTab_GeomW4W4_F_sparc64_hpc_ace_double;
+ nb_kernel_info_t
+ kernellist_sparc64_hpc_ace_double[] =
+ {
+     { nb_kernel_ElecNone_VdwLJ_GeomP1P1_VF_sparc64_hpc_ace_double, "nb_kernel_ElecNone_VdwLJ_GeomP1P1_VF_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "None", "None", "LennardJones", "None", "ParticleParticle", "", "PotentialAndForce" },
+     { nb_kernel_ElecNone_VdwLJ_GeomP1P1_F_sparc64_hpc_ace_double, "nb_kernel_ElecNone_VdwLJ_GeomP1P1_F_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "None", "None", "LennardJones", "None", "ParticleParticle", "", "Force" },
+     { nb_kernel_ElecNone_VdwLJSh_GeomP1P1_VF_sparc64_hpc_ace_double, "nb_kernel_ElecNone_VdwLJSh_GeomP1P1_VF_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "None", "None", "LennardJones", "PotentialShift", "ParticleParticle", "", "PotentialAndForce" },
+     { nb_kernel_ElecNone_VdwLJSh_GeomP1P1_F_sparc64_hpc_ace_double, "nb_kernel_ElecNone_VdwLJSh_GeomP1P1_F_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "None", "None", "LennardJones", "PotentialShift", "ParticleParticle", "", "Force" },
+     { nb_kernel_ElecNone_VdwLJSw_GeomP1P1_VF_sparc64_hpc_ace_double, "nb_kernel_ElecNone_VdwLJSw_GeomP1P1_VF_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "None", "None", "LennardJones", "PotentialSwitch", "ParticleParticle", "", "PotentialAndForce" },
+     { nb_kernel_ElecNone_VdwLJSw_GeomP1P1_F_sparc64_hpc_ace_double, "nb_kernel_ElecNone_VdwLJSw_GeomP1P1_F_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "None", "None", "LennardJones", "PotentialSwitch", "ParticleParticle", "", "Force" },
+     { nb_kernel_ElecNone_VdwCSTab_GeomP1P1_VF_sparc64_hpc_ace_double, "nb_kernel_ElecNone_VdwCSTab_GeomP1P1_VF_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "None", "None", "CubicSplineTable", "None", "ParticleParticle", "", "PotentialAndForce" },
+     { nb_kernel_ElecNone_VdwCSTab_GeomP1P1_F_sparc64_hpc_ace_double, "nb_kernel_ElecNone_VdwCSTab_GeomP1P1_F_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "None", "None", "CubicSplineTable", "None", "ParticleParticle", "", "Force" },
+     { nb_kernel_ElecEw_VdwLJ_GeomP1P1_VF_sparc64_hpc_ace_double, "nb_kernel_ElecEw_VdwLJ_GeomP1P1_VF_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "Ewald", "None", "LennardJones", "None", "ParticleParticle", "", "PotentialAndForce" },
+     { nb_kernel_ElecEw_VdwLJ_GeomP1P1_F_sparc64_hpc_ace_double, "nb_kernel_ElecEw_VdwLJ_GeomP1P1_F_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "Ewald", "None", "LennardJones", "None", "ParticleParticle", "", "Force" },
+     { nb_kernel_ElecEw_VdwLJ_GeomW3P1_VF_sparc64_hpc_ace_double, "nb_kernel_ElecEw_VdwLJ_GeomW3P1_VF_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "Ewald", "None", "LennardJones", "None", "Water3Particle", "", "PotentialAndForce" },
+     { nb_kernel_ElecEw_VdwLJ_GeomW3P1_F_sparc64_hpc_ace_double, "nb_kernel_ElecEw_VdwLJ_GeomW3P1_F_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "Ewald", "None", "LennardJones", "None", "Water3Particle", "", "Force" },
+     { nb_kernel_ElecEw_VdwLJ_GeomW3W3_VF_sparc64_hpc_ace_double, "nb_kernel_ElecEw_VdwLJ_GeomW3W3_VF_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "Ewald", "None", "LennardJones", "None", "Water3Water3", "", "PotentialAndForce" },
+     { nb_kernel_ElecEw_VdwLJ_GeomW3W3_F_sparc64_hpc_ace_double, "nb_kernel_ElecEw_VdwLJ_GeomW3W3_F_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "Ewald", "None", "LennardJones", "None", "Water3Water3", "", "Force" },
+     { nb_kernel_ElecEw_VdwLJ_GeomW4P1_VF_sparc64_hpc_ace_double, "nb_kernel_ElecEw_VdwLJ_GeomW4P1_VF_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "Ewald", "None", "LennardJones", "None", "Water4Particle", "", "PotentialAndForce" },
+     { nb_kernel_ElecEw_VdwLJ_GeomW4P1_F_sparc64_hpc_ace_double, "nb_kernel_ElecEw_VdwLJ_GeomW4P1_F_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "Ewald", "None", "LennardJones", "None", "Water4Particle", "", "Force" },
+     { nb_kernel_ElecEw_VdwLJ_GeomW4W4_VF_sparc64_hpc_ace_double, "nb_kernel_ElecEw_VdwLJ_GeomW4W4_VF_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "Ewald", "None", "LennardJones", "None", "Water4Water4", "", "PotentialAndForce" },
+     { nb_kernel_ElecEw_VdwLJ_GeomW4W4_F_sparc64_hpc_ace_double, "nb_kernel_ElecEw_VdwLJ_GeomW4W4_F_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "Ewald", "None", "LennardJones", "None", "Water4Water4", "", "Force" },
+     { nb_kernel_ElecEw_VdwNone_GeomP1P1_VF_sparc64_hpc_ace_double, "nb_kernel_ElecEw_VdwNone_GeomP1P1_VF_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "Ewald", "None", "None", "None", "ParticleParticle", "", "PotentialAndForce" },
+     { nb_kernel_ElecEw_VdwNone_GeomP1P1_F_sparc64_hpc_ace_double, "nb_kernel_ElecEw_VdwNone_GeomP1P1_F_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "Ewald", "None", "None", "None", "ParticleParticle", "", "Force" },
+     { nb_kernel_ElecEw_VdwNone_GeomW3P1_VF_sparc64_hpc_ace_double, "nb_kernel_ElecEw_VdwNone_GeomW3P1_VF_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "Ewald", "None", "None", "None", "Water3Particle", "", "PotentialAndForce" },
+     { nb_kernel_ElecEw_VdwNone_GeomW3P1_F_sparc64_hpc_ace_double, "nb_kernel_ElecEw_VdwNone_GeomW3P1_F_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "Ewald", "None", "None", "None", "Water3Particle", "", "Force" },
+     { nb_kernel_ElecEw_VdwNone_GeomW3W3_VF_sparc64_hpc_ace_double, "nb_kernel_ElecEw_VdwNone_GeomW3W3_VF_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "Ewald", "None", "None", "None", "Water3Water3", "", "PotentialAndForce" },
+     { nb_kernel_ElecEw_VdwNone_GeomW3W3_F_sparc64_hpc_ace_double, "nb_kernel_ElecEw_VdwNone_GeomW3W3_F_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "Ewald", "None", "None", "None", "Water3Water3", "", "Force" },
+     { nb_kernel_ElecEw_VdwNone_GeomW4P1_VF_sparc64_hpc_ace_double, "nb_kernel_ElecEw_VdwNone_GeomW4P1_VF_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "Ewald", "None", "None", "None", "Water4Particle", "", "PotentialAndForce" },
+     { nb_kernel_ElecEw_VdwNone_GeomW4P1_F_sparc64_hpc_ace_double, "nb_kernel_ElecEw_VdwNone_GeomW4P1_F_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "Ewald", "None", "None", "None", "Water4Particle", "", "Force" },
+     { nb_kernel_ElecEw_VdwNone_GeomW4W4_VF_sparc64_hpc_ace_double, "nb_kernel_ElecEw_VdwNone_GeomW4W4_VF_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "Ewald", "None", "None", "None", "Water4Water4", "", "PotentialAndForce" },
+     { nb_kernel_ElecEw_VdwNone_GeomW4W4_F_sparc64_hpc_ace_double, "nb_kernel_ElecEw_VdwNone_GeomW4W4_F_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "Ewald", "None", "None", "None", "Water4Water4", "", "Force" },
+     { nb_kernel_ElecEw_VdwCSTab_GeomP1P1_VF_sparc64_hpc_ace_double, "nb_kernel_ElecEw_VdwCSTab_GeomP1P1_VF_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "Ewald", "None", "CubicSplineTable", "None", "ParticleParticle", "", "PotentialAndForce" },
+     { nb_kernel_ElecEw_VdwCSTab_GeomP1P1_F_sparc64_hpc_ace_double, "nb_kernel_ElecEw_VdwCSTab_GeomP1P1_F_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "Ewald", "None", "CubicSplineTable", "None", "ParticleParticle", "", "Force" },
+     { nb_kernel_ElecEw_VdwCSTab_GeomW3P1_VF_sparc64_hpc_ace_double, "nb_kernel_ElecEw_VdwCSTab_GeomW3P1_VF_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "Ewald", "None", "CubicSplineTable", "None", "Water3Particle", "", "PotentialAndForce" },
+     { nb_kernel_ElecEw_VdwCSTab_GeomW3P1_F_sparc64_hpc_ace_double, "nb_kernel_ElecEw_VdwCSTab_GeomW3P1_F_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "Ewald", "None", "CubicSplineTable", "None", "Water3Particle", "", "Force" },
+     { nb_kernel_ElecEw_VdwCSTab_GeomW3W3_VF_sparc64_hpc_ace_double, "nb_kernel_ElecEw_VdwCSTab_GeomW3W3_VF_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "Ewald", "None", "CubicSplineTable", "None", "Water3Water3", "", "PotentialAndForce" },
+     { nb_kernel_ElecEw_VdwCSTab_GeomW3W3_F_sparc64_hpc_ace_double, "nb_kernel_ElecEw_VdwCSTab_GeomW3W3_F_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "Ewald", "None", "CubicSplineTable", "None", "Water3Water3", "", "Force" },
+     { nb_kernel_ElecEw_VdwCSTab_GeomW4P1_VF_sparc64_hpc_ace_double, "nb_kernel_ElecEw_VdwCSTab_GeomW4P1_VF_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "Ewald", "None", "CubicSplineTable", "None", "Water4Particle", "", "PotentialAndForce" },
+     { nb_kernel_ElecEw_VdwCSTab_GeomW4P1_F_sparc64_hpc_ace_double, "nb_kernel_ElecEw_VdwCSTab_GeomW4P1_F_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "Ewald", "None", "CubicSplineTable", "None", "Water4Particle", "", "Force" },
+     { nb_kernel_ElecEw_VdwCSTab_GeomW4W4_VF_sparc64_hpc_ace_double, "nb_kernel_ElecEw_VdwCSTab_GeomW4W4_VF_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "Ewald", "None", "CubicSplineTable", "None", "Water4Water4", "", "PotentialAndForce" },
+     { nb_kernel_ElecEw_VdwCSTab_GeomW4W4_F_sparc64_hpc_ace_double, "nb_kernel_ElecEw_VdwCSTab_GeomW4W4_F_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "Ewald", "None", "CubicSplineTable", "None", "Water4Water4", "", "Force" },
+     { nb_kernel_ElecEwSh_VdwLJSh_GeomP1P1_VF_sparc64_hpc_ace_double, "nb_kernel_ElecEwSh_VdwLJSh_GeomP1P1_VF_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "Ewald", "PotentialShift", "LennardJones", "PotentialShift", "ParticleParticle", "", "PotentialAndForce" },
+     { nb_kernel_ElecEwSh_VdwLJSh_GeomP1P1_F_sparc64_hpc_ace_double, "nb_kernel_ElecEwSh_VdwLJSh_GeomP1P1_F_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "Ewald", "PotentialShift", "LennardJones", "PotentialShift", "ParticleParticle", "", "Force" },
+     { nb_kernel_ElecEwSh_VdwLJSh_GeomW3P1_VF_sparc64_hpc_ace_double, "nb_kernel_ElecEwSh_VdwLJSh_GeomW3P1_VF_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "Ewald", "PotentialShift", "LennardJones", "PotentialShift", "Water3Particle", "", "PotentialAndForce" },
+     { nb_kernel_ElecEwSh_VdwLJSh_GeomW3P1_F_sparc64_hpc_ace_double, "nb_kernel_ElecEwSh_VdwLJSh_GeomW3P1_F_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "Ewald", "PotentialShift", "LennardJones", "PotentialShift", "Water3Particle", "", "Force" },
+     { nb_kernel_ElecEwSh_VdwLJSh_GeomW3W3_VF_sparc64_hpc_ace_double, "nb_kernel_ElecEwSh_VdwLJSh_GeomW3W3_VF_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "Ewald", "PotentialShift", "LennardJones", "PotentialShift", "Water3Water3", "", "PotentialAndForce" },
+     { nb_kernel_ElecEwSh_VdwLJSh_GeomW3W3_F_sparc64_hpc_ace_double, "nb_kernel_ElecEwSh_VdwLJSh_GeomW3W3_F_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "Ewald", "PotentialShift", "LennardJones", "PotentialShift", "Water3Water3", "", "Force" },
+     { nb_kernel_ElecEwSh_VdwLJSh_GeomW4P1_VF_sparc64_hpc_ace_double, "nb_kernel_ElecEwSh_VdwLJSh_GeomW4P1_VF_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "Ewald", "PotentialShift", "LennardJones", "PotentialShift", "Water4Particle", "", "PotentialAndForce" },
+     { nb_kernel_ElecEwSh_VdwLJSh_GeomW4P1_F_sparc64_hpc_ace_double, "nb_kernel_ElecEwSh_VdwLJSh_GeomW4P1_F_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "Ewald", "PotentialShift", "LennardJones", "PotentialShift", "Water4Particle", "", "Force" },
+     { nb_kernel_ElecEwSh_VdwLJSh_GeomW4W4_VF_sparc64_hpc_ace_double, "nb_kernel_ElecEwSh_VdwLJSh_GeomW4W4_VF_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "Ewald", "PotentialShift", "LennardJones", "PotentialShift", "Water4Water4", "", "PotentialAndForce" },
+     { nb_kernel_ElecEwSh_VdwLJSh_GeomW4W4_F_sparc64_hpc_ace_double, "nb_kernel_ElecEwSh_VdwLJSh_GeomW4W4_F_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "Ewald", "PotentialShift", "LennardJones", "PotentialShift", "Water4Water4", "", "Force" },
+     { nb_kernel_ElecEwSh_VdwNone_GeomP1P1_VF_sparc64_hpc_ace_double, "nb_kernel_ElecEwSh_VdwNone_GeomP1P1_VF_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "Ewald", "PotentialShift", "None", "None", "ParticleParticle", "", "PotentialAndForce" },
+     { nb_kernel_ElecEwSh_VdwNone_GeomP1P1_F_sparc64_hpc_ace_double, "nb_kernel_ElecEwSh_VdwNone_GeomP1P1_F_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "Ewald", "PotentialShift", "None", "None", "ParticleParticle", "", "Force" },
+     { nb_kernel_ElecEwSh_VdwNone_GeomW3P1_VF_sparc64_hpc_ace_double, "nb_kernel_ElecEwSh_VdwNone_GeomW3P1_VF_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "Ewald", "PotentialShift", "None", "None", "Water3Particle", "", "PotentialAndForce" },
+     { nb_kernel_ElecEwSh_VdwNone_GeomW3P1_F_sparc64_hpc_ace_double, "nb_kernel_ElecEwSh_VdwNone_GeomW3P1_F_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "Ewald", "PotentialShift", "None", "None", "Water3Particle", "", "Force" },
+     { nb_kernel_ElecEwSh_VdwNone_GeomW3W3_VF_sparc64_hpc_ace_double, "nb_kernel_ElecEwSh_VdwNone_GeomW3W3_VF_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "Ewald", "PotentialShift", "None", "None", "Water3Water3", "", "PotentialAndForce" },
+     { nb_kernel_ElecEwSh_VdwNone_GeomW3W3_F_sparc64_hpc_ace_double, "nb_kernel_ElecEwSh_VdwNone_GeomW3W3_F_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "Ewald", "PotentialShift", "None", "None", "Water3Water3", "", "Force" },
+     { nb_kernel_ElecEwSh_VdwNone_GeomW4P1_VF_sparc64_hpc_ace_double, "nb_kernel_ElecEwSh_VdwNone_GeomW4P1_VF_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "Ewald", "PotentialShift", "None", "None", "Water4Particle", "", "PotentialAndForce" },
+     { nb_kernel_ElecEwSh_VdwNone_GeomW4P1_F_sparc64_hpc_ace_double, "nb_kernel_ElecEwSh_VdwNone_GeomW4P1_F_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "Ewald", "PotentialShift", "None", "None", "Water4Particle", "", "Force" },
+     { nb_kernel_ElecEwSh_VdwNone_GeomW4W4_VF_sparc64_hpc_ace_double, "nb_kernel_ElecEwSh_VdwNone_GeomW4W4_VF_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "Ewald", "PotentialShift", "None", "None", "Water4Water4", "", "PotentialAndForce" },
+     { nb_kernel_ElecEwSh_VdwNone_GeomW4W4_F_sparc64_hpc_ace_double, "nb_kernel_ElecEwSh_VdwNone_GeomW4W4_F_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "Ewald", "PotentialShift", "None", "None", "Water4Water4", "", "Force" },
+     { nb_kernel_ElecEwSw_VdwLJSw_GeomP1P1_VF_sparc64_hpc_ace_double, "nb_kernel_ElecEwSw_VdwLJSw_GeomP1P1_VF_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "Ewald", "PotentialSwitch", "LennardJones", "PotentialSwitch", "ParticleParticle", "", "PotentialAndForce" },
+     { nb_kernel_ElecEwSw_VdwLJSw_GeomP1P1_F_sparc64_hpc_ace_double, "nb_kernel_ElecEwSw_VdwLJSw_GeomP1P1_F_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "Ewald", "PotentialSwitch", "LennardJones", "PotentialSwitch", "ParticleParticle", "", "Force" },
+     { nb_kernel_ElecEwSw_VdwLJSw_GeomW3P1_VF_sparc64_hpc_ace_double, "nb_kernel_ElecEwSw_VdwLJSw_GeomW3P1_VF_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "Ewald", "PotentialSwitch", "LennardJones", "PotentialSwitch", "Water3Particle", "", "PotentialAndForce" },
+     { nb_kernel_ElecEwSw_VdwLJSw_GeomW3P1_F_sparc64_hpc_ace_double, "nb_kernel_ElecEwSw_VdwLJSw_GeomW3P1_F_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "Ewald", "PotentialSwitch", "LennardJones", "PotentialSwitch", "Water3Particle", "", "Force" },
+     { nb_kernel_ElecEwSw_VdwLJSw_GeomW3W3_VF_sparc64_hpc_ace_double, "nb_kernel_ElecEwSw_VdwLJSw_GeomW3W3_VF_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "Ewald", "PotentialSwitch", "LennardJones", "PotentialSwitch", "Water3Water3", "", "PotentialAndForce" },
+     { nb_kernel_ElecEwSw_VdwLJSw_GeomW3W3_F_sparc64_hpc_ace_double, "nb_kernel_ElecEwSw_VdwLJSw_GeomW3W3_F_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "Ewald", "PotentialSwitch", "LennardJones", "PotentialSwitch", "Water3Water3", "", "Force" },
+     { nb_kernel_ElecEwSw_VdwLJSw_GeomW4P1_VF_sparc64_hpc_ace_double, "nb_kernel_ElecEwSw_VdwLJSw_GeomW4P1_VF_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "Ewald", "PotentialSwitch", "LennardJones", "PotentialSwitch", "Water4Particle", "", "PotentialAndForce" },
+     { nb_kernel_ElecEwSw_VdwLJSw_GeomW4P1_F_sparc64_hpc_ace_double, "nb_kernel_ElecEwSw_VdwLJSw_GeomW4P1_F_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "Ewald", "PotentialSwitch", "LennardJones", "PotentialSwitch", "Water4Particle", "", "Force" },
+     { nb_kernel_ElecEwSw_VdwLJSw_GeomW4W4_VF_sparc64_hpc_ace_double, "nb_kernel_ElecEwSw_VdwLJSw_GeomW4W4_VF_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "Ewald", "PotentialSwitch", "LennardJones", "PotentialSwitch", "Water4Water4", "", "PotentialAndForce" },
+     { nb_kernel_ElecEwSw_VdwLJSw_GeomW4W4_F_sparc64_hpc_ace_double, "nb_kernel_ElecEwSw_VdwLJSw_GeomW4W4_F_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "Ewald", "PotentialSwitch", "LennardJones", "PotentialSwitch", "Water4Water4", "", "Force" },
+     { nb_kernel_ElecEwSw_VdwNone_GeomP1P1_VF_sparc64_hpc_ace_double, "nb_kernel_ElecEwSw_VdwNone_GeomP1P1_VF_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "Ewald", "PotentialSwitch", "None", "None", "ParticleParticle", "", "PotentialAndForce" },
+     { nb_kernel_ElecEwSw_VdwNone_GeomP1P1_F_sparc64_hpc_ace_double, "nb_kernel_ElecEwSw_VdwNone_GeomP1P1_F_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "Ewald", "PotentialSwitch", "None", "None", "ParticleParticle", "", "Force" },
+     { nb_kernel_ElecEwSw_VdwNone_GeomW3P1_VF_sparc64_hpc_ace_double, "nb_kernel_ElecEwSw_VdwNone_GeomW3P1_VF_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "Ewald", "PotentialSwitch", "None", "None", "Water3Particle", "", "PotentialAndForce" },
+     { nb_kernel_ElecEwSw_VdwNone_GeomW3P1_F_sparc64_hpc_ace_double, "nb_kernel_ElecEwSw_VdwNone_GeomW3P1_F_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "Ewald", "PotentialSwitch", "None", "None", "Water3Particle", "", "Force" },
+     { nb_kernel_ElecEwSw_VdwNone_GeomW3W3_VF_sparc64_hpc_ace_double, "nb_kernel_ElecEwSw_VdwNone_GeomW3W3_VF_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "Ewald", "PotentialSwitch", "None", "None", "Water3Water3", "", "PotentialAndForce" },
+     { nb_kernel_ElecEwSw_VdwNone_GeomW3W3_F_sparc64_hpc_ace_double, "nb_kernel_ElecEwSw_VdwNone_GeomW3W3_F_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "Ewald", "PotentialSwitch", "None", "None", "Water3Water3", "", "Force" },
+     { nb_kernel_ElecEwSw_VdwNone_GeomW4P1_VF_sparc64_hpc_ace_double, "nb_kernel_ElecEwSw_VdwNone_GeomW4P1_VF_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "Ewald", "PotentialSwitch", "None", "None", "Water4Particle", "", "PotentialAndForce" },
+     { nb_kernel_ElecEwSw_VdwNone_GeomW4P1_F_sparc64_hpc_ace_double, "nb_kernel_ElecEwSw_VdwNone_GeomW4P1_F_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "Ewald", "PotentialSwitch", "None", "None", "Water4Particle", "", "Force" },
+     { nb_kernel_ElecEwSw_VdwNone_GeomW4W4_VF_sparc64_hpc_ace_double, "nb_kernel_ElecEwSw_VdwNone_GeomW4W4_VF_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "Ewald", "PotentialSwitch", "None", "None", "Water4Water4", "", "PotentialAndForce" },
+     { nb_kernel_ElecEwSw_VdwNone_GeomW4W4_F_sparc64_hpc_ace_double, "nb_kernel_ElecEwSw_VdwNone_GeomW4W4_F_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "Ewald", "PotentialSwitch", "None", "None", "Water4Water4", "", "Force" },
+     { nb_kernel_ElecCoul_VdwLJ_GeomP1P1_VF_sparc64_hpc_ace_double, "nb_kernel_ElecCoul_VdwLJ_GeomP1P1_VF_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "Coulomb", "None", "LennardJones", "None", "ParticleParticle", "", "PotentialAndForce" },
+     { nb_kernel_ElecCoul_VdwLJ_GeomP1P1_F_sparc64_hpc_ace_double, "nb_kernel_ElecCoul_VdwLJ_GeomP1P1_F_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "Coulomb", "None", "LennardJones", "None", "ParticleParticle", "", "Force" },
+     { nb_kernel_ElecCoul_VdwLJ_GeomW3P1_VF_sparc64_hpc_ace_double, "nb_kernel_ElecCoul_VdwLJ_GeomW3P1_VF_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "Coulomb", "None", "LennardJones", "None", "Water3Particle", "", "PotentialAndForce" },
+     { nb_kernel_ElecCoul_VdwLJ_GeomW3P1_F_sparc64_hpc_ace_double, "nb_kernel_ElecCoul_VdwLJ_GeomW3P1_F_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "Coulomb", "None", "LennardJones", "None", "Water3Particle", "", "Force" },
+     { nb_kernel_ElecCoul_VdwLJ_GeomW3W3_VF_sparc64_hpc_ace_double, "nb_kernel_ElecCoul_VdwLJ_GeomW3W3_VF_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "Coulomb", "None", "LennardJones", "None", "Water3Water3", "", "PotentialAndForce" },
+     { nb_kernel_ElecCoul_VdwLJ_GeomW3W3_F_sparc64_hpc_ace_double, "nb_kernel_ElecCoul_VdwLJ_GeomW3W3_F_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "Coulomb", "None", "LennardJones", "None", "Water3Water3", "", "Force" },
+     { nb_kernel_ElecCoul_VdwLJ_GeomW4P1_VF_sparc64_hpc_ace_double, "nb_kernel_ElecCoul_VdwLJ_GeomW4P1_VF_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "Coulomb", "None", "LennardJones", "None", "Water4Particle", "", "PotentialAndForce" },
+     { nb_kernel_ElecCoul_VdwLJ_GeomW4P1_F_sparc64_hpc_ace_double, "nb_kernel_ElecCoul_VdwLJ_GeomW4P1_F_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "Coulomb", "None", "LennardJones", "None", "Water4Particle", "", "Force" },
+     { nb_kernel_ElecCoul_VdwLJ_GeomW4W4_VF_sparc64_hpc_ace_double, "nb_kernel_ElecCoul_VdwLJ_GeomW4W4_VF_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "Coulomb", "None", "LennardJones", "None", "Water4Water4", "", "PotentialAndForce" },
+     { nb_kernel_ElecCoul_VdwLJ_GeomW4W4_F_sparc64_hpc_ace_double, "nb_kernel_ElecCoul_VdwLJ_GeomW4W4_F_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "Coulomb", "None", "LennardJones", "None", "Water4Water4", "", "Force" },
+     { nb_kernel_ElecCoul_VdwNone_GeomP1P1_VF_sparc64_hpc_ace_double, "nb_kernel_ElecCoul_VdwNone_GeomP1P1_VF_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "Coulomb", "None", "None", "None", "ParticleParticle", "", "PotentialAndForce" },
+     { nb_kernel_ElecCoul_VdwNone_GeomP1P1_F_sparc64_hpc_ace_double, "nb_kernel_ElecCoul_VdwNone_GeomP1P1_F_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "Coulomb", "None", "None", "None", "ParticleParticle", "", "Force" },
+     { nb_kernel_ElecCoul_VdwNone_GeomW3P1_VF_sparc64_hpc_ace_double, "nb_kernel_ElecCoul_VdwNone_GeomW3P1_VF_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "Coulomb", "None", "None", "None", "Water3Particle", "", "PotentialAndForce" },
+     { nb_kernel_ElecCoul_VdwNone_GeomW3P1_F_sparc64_hpc_ace_double, "nb_kernel_ElecCoul_VdwNone_GeomW3P1_F_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "Coulomb", "None", "None", "None", "Water3Particle", "", "Force" },
+     { nb_kernel_ElecCoul_VdwNone_GeomW3W3_VF_sparc64_hpc_ace_double, "nb_kernel_ElecCoul_VdwNone_GeomW3W3_VF_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "Coulomb", "None", "None", "None", "Water3Water3", "", "PotentialAndForce" },
+     { nb_kernel_ElecCoul_VdwNone_GeomW3W3_F_sparc64_hpc_ace_double, "nb_kernel_ElecCoul_VdwNone_GeomW3W3_F_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "Coulomb", "None", "None", "None", "Water3Water3", "", "Force" },
+     { nb_kernel_ElecCoul_VdwNone_GeomW4P1_VF_sparc64_hpc_ace_double, "nb_kernel_ElecCoul_VdwNone_GeomW4P1_VF_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "Coulomb", "None", "None", "None", "Water4Particle", "", "PotentialAndForce" },
+     { nb_kernel_ElecCoul_VdwNone_GeomW4P1_F_sparc64_hpc_ace_double, "nb_kernel_ElecCoul_VdwNone_GeomW4P1_F_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "Coulomb", "None", "None", "None", "Water4Particle", "", "Force" },
+     { nb_kernel_ElecCoul_VdwNone_GeomW4W4_VF_sparc64_hpc_ace_double, "nb_kernel_ElecCoul_VdwNone_GeomW4W4_VF_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "Coulomb", "None", "None", "None", "Water4Water4", "", "PotentialAndForce" },
+     { nb_kernel_ElecCoul_VdwNone_GeomW4W4_F_sparc64_hpc_ace_double, "nb_kernel_ElecCoul_VdwNone_GeomW4W4_F_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "Coulomb", "None", "None", "None", "Water4Water4", "", "Force" },
+     { nb_kernel_ElecCoul_VdwCSTab_GeomP1P1_VF_sparc64_hpc_ace_double, "nb_kernel_ElecCoul_VdwCSTab_GeomP1P1_VF_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "Coulomb", "None", "CubicSplineTable", "None", "ParticleParticle", "", "PotentialAndForce" },
+     { nb_kernel_ElecCoul_VdwCSTab_GeomP1P1_F_sparc64_hpc_ace_double, "nb_kernel_ElecCoul_VdwCSTab_GeomP1P1_F_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "Coulomb", "None", "CubicSplineTable", "None", "ParticleParticle", "", "Force" },
+     { nb_kernel_ElecCoul_VdwCSTab_GeomW3P1_VF_sparc64_hpc_ace_double, "nb_kernel_ElecCoul_VdwCSTab_GeomW3P1_VF_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "Coulomb", "None", "CubicSplineTable", "None", "Water3Particle", "", "PotentialAndForce" },
+     { nb_kernel_ElecCoul_VdwCSTab_GeomW3P1_F_sparc64_hpc_ace_double, "nb_kernel_ElecCoul_VdwCSTab_GeomW3P1_F_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "Coulomb", "None", "CubicSplineTable", "None", "Water3Particle", "", "Force" },
+     { nb_kernel_ElecCoul_VdwCSTab_GeomW3W3_VF_sparc64_hpc_ace_double, "nb_kernel_ElecCoul_VdwCSTab_GeomW3W3_VF_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "Coulomb", "None", "CubicSplineTable", "None", "Water3Water3", "", "PotentialAndForce" },
+     { nb_kernel_ElecCoul_VdwCSTab_GeomW3W3_F_sparc64_hpc_ace_double, "nb_kernel_ElecCoul_VdwCSTab_GeomW3W3_F_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "Coulomb", "None", "CubicSplineTable", "None", "Water3Water3", "", "Force" },
+     { nb_kernel_ElecCoul_VdwCSTab_GeomW4P1_VF_sparc64_hpc_ace_double, "nb_kernel_ElecCoul_VdwCSTab_GeomW4P1_VF_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "Coulomb", "None", "CubicSplineTable", "None", "Water4Particle", "", "PotentialAndForce" },
+     { nb_kernel_ElecCoul_VdwCSTab_GeomW4P1_F_sparc64_hpc_ace_double, "nb_kernel_ElecCoul_VdwCSTab_GeomW4P1_F_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "Coulomb", "None", "CubicSplineTable", "None", "Water4Particle", "", "Force" },
+     { nb_kernel_ElecCoul_VdwCSTab_GeomW4W4_VF_sparc64_hpc_ace_double, "nb_kernel_ElecCoul_VdwCSTab_GeomW4W4_VF_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "Coulomb", "None", "CubicSplineTable", "None", "Water4Water4", "", "PotentialAndForce" },
+     { nb_kernel_ElecCoul_VdwCSTab_GeomW4W4_F_sparc64_hpc_ace_double, "nb_kernel_ElecCoul_VdwCSTab_GeomW4W4_F_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "Coulomb", "None", "CubicSplineTable", "None", "Water4Water4", "", "Force" },
+     { nb_kernel_ElecCSTab_VdwLJ_GeomP1P1_VF_sparc64_hpc_ace_double, "nb_kernel_ElecCSTab_VdwLJ_GeomP1P1_VF_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "CubicSplineTable", "None", "LennardJones", "None", "ParticleParticle", "", "PotentialAndForce" },
+     { nb_kernel_ElecCSTab_VdwLJ_GeomP1P1_F_sparc64_hpc_ace_double, "nb_kernel_ElecCSTab_VdwLJ_GeomP1P1_F_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "CubicSplineTable", "None", "LennardJones", "None", "ParticleParticle", "", "Force" },
+     { nb_kernel_ElecCSTab_VdwLJ_GeomW3P1_VF_sparc64_hpc_ace_double, "nb_kernel_ElecCSTab_VdwLJ_GeomW3P1_VF_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "CubicSplineTable", "None", "LennardJones", "None", "Water3Particle", "", "PotentialAndForce" },
+     { nb_kernel_ElecCSTab_VdwLJ_GeomW3P1_F_sparc64_hpc_ace_double, "nb_kernel_ElecCSTab_VdwLJ_GeomW3P1_F_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "CubicSplineTable", "None", "LennardJones", "None", "Water3Particle", "", "Force" },
+     { nb_kernel_ElecCSTab_VdwLJ_GeomW3W3_VF_sparc64_hpc_ace_double, "nb_kernel_ElecCSTab_VdwLJ_GeomW3W3_VF_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "CubicSplineTable", "None", "LennardJones", "None", "Water3Water3", "", "PotentialAndForce" },
+     { nb_kernel_ElecCSTab_VdwLJ_GeomW3W3_F_sparc64_hpc_ace_double, "nb_kernel_ElecCSTab_VdwLJ_GeomW3W3_F_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "CubicSplineTable", "None", "LennardJones", "None", "Water3Water3", "", "Force" },
+     { nb_kernel_ElecCSTab_VdwLJ_GeomW4P1_VF_sparc64_hpc_ace_double, "nb_kernel_ElecCSTab_VdwLJ_GeomW4P1_VF_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "CubicSplineTable", "None", "LennardJones", "None", "Water4Particle", "", "PotentialAndForce" },
+     { nb_kernel_ElecCSTab_VdwLJ_GeomW4P1_F_sparc64_hpc_ace_double, "nb_kernel_ElecCSTab_VdwLJ_GeomW4P1_F_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "CubicSplineTable", "None", "LennardJones", "None", "Water4Particle", "", "Force" },
+     { nb_kernel_ElecCSTab_VdwLJ_GeomW4W4_VF_sparc64_hpc_ace_double, "nb_kernel_ElecCSTab_VdwLJ_GeomW4W4_VF_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "CubicSplineTable", "None", "LennardJones", "None", "Water4Water4", "", "PotentialAndForce" },
+     { nb_kernel_ElecCSTab_VdwLJ_GeomW4W4_F_sparc64_hpc_ace_double, "nb_kernel_ElecCSTab_VdwLJ_GeomW4W4_F_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "CubicSplineTable", "None", "LennardJones", "None", "Water4Water4", "", "Force" },
+     { nb_kernel_ElecCSTab_VdwNone_GeomP1P1_VF_sparc64_hpc_ace_double, "nb_kernel_ElecCSTab_VdwNone_GeomP1P1_VF_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "CubicSplineTable", "None", "None", "None", "ParticleParticle", "", "PotentialAndForce" },
+     { nb_kernel_ElecCSTab_VdwNone_GeomP1P1_F_sparc64_hpc_ace_double, "nb_kernel_ElecCSTab_VdwNone_GeomP1P1_F_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "CubicSplineTable", "None", "None", "None", "ParticleParticle", "", "Force" },
+     { nb_kernel_ElecCSTab_VdwNone_GeomW3P1_VF_sparc64_hpc_ace_double, "nb_kernel_ElecCSTab_VdwNone_GeomW3P1_VF_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "CubicSplineTable", "None", "None", "None", "Water3Particle", "", "PotentialAndForce" },
+     { nb_kernel_ElecCSTab_VdwNone_GeomW3P1_F_sparc64_hpc_ace_double, "nb_kernel_ElecCSTab_VdwNone_GeomW3P1_F_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "CubicSplineTable", "None", "None", "None", "Water3Particle", "", "Force" },
+     { nb_kernel_ElecCSTab_VdwNone_GeomW3W3_VF_sparc64_hpc_ace_double, "nb_kernel_ElecCSTab_VdwNone_GeomW3W3_VF_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "CubicSplineTable", "None", "None", "None", "Water3Water3", "", "PotentialAndForce" },
+     { nb_kernel_ElecCSTab_VdwNone_GeomW3W3_F_sparc64_hpc_ace_double, "nb_kernel_ElecCSTab_VdwNone_GeomW3W3_F_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "CubicSplineTable", "None", "None", "None", "Water3Water3", "", "Force" },
+     { nb_kernel_ElecCSTab_VdwNone_GeomW4P1_VF_sparc64_hpc_ace_double, "nb_kernel_ElecCSTab_VdwNone_GeomW4P1_VF_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "CubicSplineTable", "None", "None", "None", "Water4Particle", "", "PotentialAndForce" },
+     { nb_kernel_ElecCSTab_VdwNone_GeomW4P1_F_sparc64_hpc_ace_double, "nb_kernel_ElecCSTab_VdwNone_GeomW4P1_F_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "CubicSplineTable", "None", "None", "None", "Water4Particle", "", "Force" },
+     { nb_kernel_ElecCSTab_VdwNone_GeomW4W4_VF_sparc64_hpc_ace_double, "nb_kernel_ElecCSTab_VdwNone_GeomW4W4_VF_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "CubicSplineTable", "None", "None", "None", "Water4Water4", "", "PotentialAndForce" },
+     { nb_kernel_ElecCSTab_VdwNone_GeomW4W4_F_sparc64_hpc_ace_double, "nb_kernel_ElecCSTab_VdwNone_GeomW4W4_F_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "CubicSplineTable", "None", "None", "None", "Water4Water4", "", "Force" },
+     { nb_kernel_ElecCSTab_VdwCSTab_GeomP1P1_VF_sparc64_hpc_ace_double, "nb_kernel_ElecCSTab_VdwCSTab_GeomP1P1_VF_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "CubicSplineTable", "None", "CubicSplineTable", "None", "ParticleParticle", "", "PotentialAndForce" },
+     { nb_kernel_ElecCSTab_VdwCSTab_GeomP1P1_F_sparc64_hpc_ace_double, "nb_kernel_ElecCSTab_VdwCSTab_GeomP1P1_F_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "CubicSplineTable", "None", "CubicSplineTable", "None", "ParticleParticle", "", "Force" },
+     { nb_kernel_ElecCSTab_VdwCSTab_GeomW3P1_VF_sparc64_hpc_ace_double, "nb_kernel_ElecCSTab_VdwCSTab_GeomW3P1_VF_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "CubicSplineTable", "None", "CubicSplineTable", "None", "Water3Particle", "", "PotentialAndForce" },
+     { nb_kernel_ElecCSTab_VdwCSTab_GeomW3P1_F_sparc64_hpc_ace_double, "nb_kernel_ElecCSTab_VdwCSTab_GeomW3P1_F_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "CubicSplineTable", "None", "CubicSplineTable", "None", "Water3Particle", "", "Force" },
+     { nb_kernel_ElecCSTab_VdwCSTab_GeomW3W3_VF_sparc64_hpc_ace_double, "nb_kernel_ElecCSTab_VdwCSTab_GeomW3W3_VF_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "CubicSplineTable", "None", "CubicSplineTable", "None", "Water3Water3", "", "PotentialAndForce" },
+     { nb_kernel_ElecCSTab_VdwCSTab_GeomW3W3_F_sparc64_hpc_ace_double, "nb_kernel_ElecCSTab_VdwCSTab_GeomW3W3_F_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "CubicSplineTable", "None", "CubicSplineTable", "None", "Water3Water3", "", "Force" },
+     { nb_kernel_ElecCSTab_VdwCSTab_GeomW4P1_VF_sparc64_hpc_ace_double, "nb_kernel_ElecCSTab_VdwCSTab_GeomW4P1_VF_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "CubicSplineTable", "None", "CubicSplineTable", "None", "Water4Particle", "", "PotentialAndForce" },
+     { nb_kernel_ElecCSTab_VdwCSTab_GeomW4P1_F_sparc64_hpc_ace_double, "nb_kernel_ElecCSTab_VdwCSTab_GeomW4P1_F_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "CubicSplineTable", "None", "CubicSplineTable", "None", "Water4Particle", "", "Force" },
+     { nb_kernel_ElecCSTab_VdwCSTab_GeomW4W4_VF_sparc64_hpc_ace_double, "nb_kernel_ElecCSTab_VdwCSTab_GeomW4W4_VF_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "CubicSplineTable", "None", "CubicSplineTable", "None", "Water4Water4", "", "PotentialAndForce" },
+     { nb_kernel_ElecCSTab_VdwCSTab_GeomW4W4_F_sparc64_hpc_ace_double, "nb_kernel_ElecCSTab_VdwCSTab_GeomW4W4_F_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "CubicSplineTable", "None", "CubicSplineTable", "None", "Water4Water4", "", "Force" },
+     { nb_kernel_ElecGB_VdwLJ_GeomP1P1_VF_sparc64_hpc_ace_double, "nb_kernel_ElecGB_VdwLJ_GeomP1P1_VF_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "GeneralizedBorn", "None", "LennardJones", "None", "ParticleParticle", "", "PotentialAndForce" },
+     { nb_kernel_ElecGB_VdwLJ_GeomP1P1_F_sparc64_hpc_ace_double, "nb_kernel_ElecGB_VdwLJ_GeomP1P1_F_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "GeneralizedBorn", "None", "LennardJones", "None", "ParticleParticle", "", "Force" },
+     { nb_kernel_ElecGB_VdwNone_GeomP1P1_VF_sparc64_hpc_ace_double, "nb_kernel_ElecGB_VdwNone_GeomP1P1_VF_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "GeneralizedBorn", "None", "None", "None", "ParticleParticle", "", "PotentialAndForce" },
+     { nb_kernel_ElecGB_VdwNone_GeomP1P1_F_sparc64_hpc_ace_double, "nb_kernel_ElecGB_VdwNone_GeomP1P1_F_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "GeneralizedBorn", "None", "None", "None", "ParticleParticle", "", "Force" },
+     { nb_kernel_ElecGB_VdwCSTab_GeomP1P1_VF_sparc64_hpc_ace_double, "nb_kernel_ElecGB_VdwCSTab_GeomP1P1_VF_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "GeneralizedBorn", "None", "CubicSplineTable", "None", "ParticleParticle", "", "PotentialAndForce" },
+     { nb_kernel_ElecGB_VdwCSTab_GeomP1P1_F_sparc64_hpc_ace_double, "nb_kernel_ElecGB_VdwCSTab_GeomP1P1_F_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "GeneralizedBorn", "None", "CubicSplineTable", "None", "ParticleParticle", "", "Force" },
+     { nb_kernel_ElecRFCut_VdwLJSh_GeomP1P1_VF_sparc64_hpc_ace_double, "nb_kernel_ElecRFCut_VdwLJSh_GeomP1P1_VF_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "ReactionField", "ExactCutoff", "LennardJones", "PotentialShift", "ParticleParticle", "", "PotentialAndForce" },
+     { nb_kernel_ElecRFCut_VdwLJSh_GeomP1P1_F_sparc64_hpc_ace_double, "nb_kernel_ElecRFCut_VdwLJSh_GeomP1P1_F_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "ReactionField", "ExactCutoff", "LennardJones", "PotentialShift", "ParticleParticle", "", "Force" },
+     { nb_kernel_ElecRFCut_VdwLJSh_GeomW3P1_VF_sparc64_hpc_ace_double, "nb_kernel_ElecRFCut_VdwLJSh_GeomW3P1_VF_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "ReactionField", "ExactCutoff", "LennardJones", "PotentialShift", "Water3Particle", "", "PotentialAndForce" },
+     { nb_kernel_ElecRFCut_VdwLJSh_GeomW3P1_F_sparc64_hpc_ace_double, "nb_kernel_ElecRFCut_VdwLJSh_GeomW3P1_F_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "ReactionField", "ExactCutoff", "LennardJones", "PotentialShift", "Water3Particle", "", "Force" },
+     { nb_kernel_ElecRFCut_VdwLJSh_GeomW3W3_VF_sparc64_hpc_ace_double, "nb_kernel_ElecRFCut_VdwLJSh_GeomW3W3_VF_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "ReactionField", "ExactCutoff", "LennardJones", "PotentialShift", "Water3Water3", "", "PotentialAndForce" },
+     { nb_kernel_ElecRFCut_VdwLJSh_GeomW3W3_F_sparc64_hpc_ace_double, "nb_kernel_ElecRFCut_VdwLJSh_GeomW3W3_F_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "ReactionField", "ExactCutoff", "LennardJones", "PotentialShift", "Water3Water3", "", "Force" },
+     { nb_kernel_ElecRFCut_VdwLJSh_GeomW4P1_VF_sparc64_hpc_ace_double, "nb_kernel_ElecRFCut_VdwLJSh_GeomW4P1_VF_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "ReactionField", "ExactCutoff", "LennardJones", "PotentialShift", "Water4Particle", "", "PotentialAndForce" },
+     { nb_kernel_ElecRFCut_VdwLJSh_GeomW4P1_F_sparc64_hpc_ace_double, "nb_kernel_ElecRFCut_VdwLJSh_GeomW4P1_F_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "ReactionField", "ExactCutoff", "LennardJones", "PotentialShift", "Water4Particle", "", "Force" },
+     { nb_kernel_ElecRFCut_VdwLJSh_GeomW4W4_VF_sparc64_hpc_ace_double, "nb_kernel_ElecRFCut_VdwLJSh_GeomW4W4_VF_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "ReactionField", "ExactCutoff", "LennardJones", "PotentialShift", "Water4Water4", "", "PotentialAndForce" },
+     { nb_kernel_ElecRFCut_VdwLJSh_GeomW4W4_F_sparc64_hpc_ace_double, "nb_kernel_ElecRFCut_VdwLJSh_GeomW4W4_F_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "ReactionField", "ExactCutoff", "LennardJones", "PotentialShift", "Water4Water4", "", "Force" },
+     { nb_kernel_ElecRFCut_VdwLJSw_GeomP1P1_VF_sparc64_hpc_ace_double, "nb_kernel_ElecRFCut_VdwLJSw_GeomP1P1_VF_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "ReactionField", "ExactCutoff", "LennardJones", "PotentialSwitch", "ParticleParticle", "", "PotentialAndForce" },
+     { nb_kernel_ElecRFCut_VdwLJSw_GeomP1P1_F_sparc64_hpc_ace_double, "nb_kernel_ElecRFCut_VdwLJSw_GeomP1P1_F_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "ReactionField", "ExactCutoff", "LennardJones", "PotentialSwitch", "ParticleParticle", "", "Force" },
+     { nb_kernel_ElecRFCut_VdwLJSw_GeomW3P1_VF_sparc64_hpc_ace_double, "nb_kernel_ElecRFCut_VdwLJSw_GeomW3P1_VF_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "ReactionField", "ExactCutoff", "LennardJones", "PotentialSwitch", "Water3Particle", "", "PotentialAndForce" },
+     { nb_kernel_ElecRFCut_VdwLJSw_GeomW3P1_F_sparc64_hpc_ace_double, "nb_kernel_ElecRFCut_VdwLJSw_GeomW3P1_F_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "ReactionField", "ExactCutoff", "LennardJones", "PotentialSwitch", "Water3Particle", "", "Force" },
+     { nb_kernel_ElecRFCut_VdwLJSw_GeomW3W3_VF_sparc64_hpc_ace_double, "nb_kernel_ElecRFCut_VdwLJSw_GeomW3W3_VF_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "ReactionField", "ExactCutoff", "LennardJones", "PotentialSwitch", "Water3Water3", "", "PotentialAndForce" },
+     { nb_kernel_ElecRFCut_VdwLJSw_GeomW3W3_F_sparc64_hpc_ace_double, "nb_kernel_ElecRFCut_VdwLJSw_GeomW3W3_F_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "ReactionField", "ExactCutoff", "LennardJones", "PotentialSwitch", "Water3Water3", "", "Force" },
+     { nb_kernel_ElecRFCut_VdwLJSw_GeomW4P1_VF_sparc64_hpc_ace_double, "nb_kernel_ElecRFCut_VdwLJSw_GeomW4P1_VF_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "ReactionField", "ExactCutoff", "LennardJones", "PotentialSwitch", "Water4Particle", "", "PotentialAndForce" },
+     { nb_kernel_ElecRFCut_VdwLJSw_GeomW4P1_F_sparc64_hpc_ace_double, "nb_kernel_ElecRFCut_VdwLJSw_GeomW4P1_F_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "ReactionField", "ExactCutoff", "LennardJones", "PotentialSwitch", "Water4Particle", "", "Force" },
+     { nb_kernel_ElecRFCut_VdwLJSw_GeomW4W4_VF_sparc64_hpc_ace_double, "nb_kernel_ElecRFCut_VdwLJSw_GeomW4W4_VF_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "ReactionField", "ExactCutoff", "LennardJones", "PotentialSwitch", "Water4Water4", "", "PotentialAndForce" },
+     { nb_kernel_ElecRFCut_VdwLJSw_GeomW4W4_F_sparc64_hpc_ace_double, "nb_kernel_ElecRFCut_VdwLJSw_GeomW4W4_F_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "ReactionField", "ExactCutoff", "LennardJones", "PotentialSwitch", "Water4Water4", "", "Force" },
+     { nb_kernel_ElecRFCut_VdwNone_GeomP1P1_VF_sparc64_hpc_ace_double, "nb_kernel_ElecRFCut_VdwNone_GeomP1P1_VF_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "ReactionField", "ExactCutoff", "None", "None", "ParticleParticle", "", "PotentialAndForce" },
+     { nb_kernel_ElecRFCut_VdwNone_GeomP1P1_F_sparc64_hpc_ace_double, "nb_kernel_ElecRFCut_VdwNone_GeomP1P1_F_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "ReactionField", "ExactCutoff", "None", "None", "ParticleParticle", "", "Force" },
+     { nb_kernel_ElecRFCut_VdwNone_GeomW3P1_VF_sparc64_hpc_ace_double, "nb_kernel_ElecRFCut_VdwNone_GeomW3P1_VF_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "ReactionField", "ExactCutoff", "None", "None", "Water3Particle", "", "PotentialAndForce" },
+     { nb_kernel_ElecRFCut_VdwNone_GeomW3P1_F_sparc64_hpc_ace_double, "nb_kernel_ElecRFCut_VdwNone_GeomW3P1_F_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "ReactionField", "ExactCutoff", "None", "None", "Water3Particle", "", "Force" },
+     { nb_kernel_ElecRFCut_VdwNone_GeomW3W3_VF_sparc64_hpc_ace_double, "nb_kernel_ElecRFCut_VdwNone_GeomW3W3_VF_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "ReactionField", "ExactCutoff", "None", "None", "Water3Water3", "", "PotentialAndForce" },
+     { nb_kernel_ElecRFCut_VdwNone_GeomW3W3_F_sparc64_hpc_ace_double, "nb_kernel_ElecRFCut_VdwNone_GeomW3W3_F_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "ReactionField", "ExactCutoff", "None", "None", "Water3Water3", "", "Force" },
+     { nb_kernel_ElecRFCut_VdwNone_GeomW4P1_VF_sparc64_hpc_ace_double, "nb_kernel_ElecRFCut_VdwNone_GeomW4P1_VF_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "ReactionField", "ExactCutoff", "None", "None", "Water4Particle", "", "PotentialAndForce" },
+     { nb_kernel_ElecRFCut_VdwNone_GeomW4P1_F_sparc64_hpc_ace_double, "nb_kernel_ElecRFCut_VdwNone_GeomW4P1_F_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "ReactionField", "ExactCutoff", "None", "None", "Water4Particle", "", "Force" },
+     { nb_kernel_ElecRFCut_VdwNone_GeomW4W4_VF_sparc64_hpc_ace_double, "nb_kernel_ElecRFCut_VdwNone_GeomW4W4_VF_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "ReactionField", "ExactCutoff", "None", "None", "Water4Water4", "", "PotentialAndForce" },
+     { nb_kernel_ElecRFCut_VdwNone_GeomW4W4_F_sparc64_hpc_ace_double, "nb_kernel_ElecRFCut_VdwNone_GeomW4W4_F_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "ReactionField", "ExactCutoff", "None", "None", "Water4Water4", "", "Force" },
+     { nb_kernel_ElecRFCut_VdwCSTab_GeomP1P1_VF_sparc64_hpc_ace_double, "nb_kernel_ElecRFCut_VdwCSTab_GeomP1P1_VF_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "ReactionField", "ExactCutoff", "CubicSplineTable", "None", "ParticleParticle", "", "PotentialAndForce" },
+     { nb_kernel_ElecRFCut_VdwCSTab_GeomP1P1_F_sparc64_hpc_ace_double, "nb_kernel_ElecRFCut_VdwCSTab_GeomP1P1_F_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "ReactionField", "ExactCutoff", "CubicSplineTable", "None", "ParticleParticle", "", "Force" },
+     { nb_kernel_ElecRFCut_VdwCSTab_GeomW3P1_VF_sparc64_hpc_ace_double, "nb_kernel_ElecRFCut_VdwCSTab_GeomW3P1_VF_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "ReactionField", "ExactCutoff", "CubicSplineTable", "None", "Water3Particle", "", "PotentialAndForce" },
+     { nb_kernel_ElecRFCut_VdwCSTab_GeomW3P1_F_sparc64_hpc_ace_double, "nb_kernel_ElecRFCut_VdwCSTab_GeomW3P1_F_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "ReactionField", "ExactCutoff", "CubicSplineTable", "None", "Water3Particle", "", "Force" },
+     { nb_kernel_ElecRFCut_VdwCSTab_GeomW3W3_VF_sparc64_hpc_ace_double, "nb_kernel_ElecRFCut_VdwCSTab_GeomW3W3_VF_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "ReactionField", "ExactCutoff", "CubicSplineTable", "None", "Water3Water3", "", "PotentialAndForce" },
+     { nb_kernel_ElecRFCut_VdwCSTab_GeomW3W3_F_sparc64_hpc_ace_double, "nb_kernel_ElecRFCut_VdwCSTab_GeomW3W3_F_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "ReactionField", "ExactCutoff", "CubicSplineTable", "None", "Water3Water3", "", "Force" },
+     { nb_kernel_ElecRFCut_VdwCSTab_GeomW4P1_VF_sparc64_hpc_ace_double, "nb_kernel_ElecRFCut_VdwCSTab_GeomW4P1_VF_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "ReactionField", "ExactCutoff", "CubicSplineTable", "None", "Water4Particle", "", "PotentialAndForce" },
+     { nb_kernel_ElecRFCut_VdwCSTab_GeomW4P1_F_sparc64_hpc_ace_double, "nb_kernel_ElecRFCut_VdwCSTab_GeomW4P1_F_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "ReactionField", "ExactCutoff", "CubicSplineTable", "None", "Water4Particle", "", "Force" },
+     { nb_kernel_ElecRFCut_VdwCSTab_GeomW4W4_VF_sparc64_hpc_ace_double, "nb_kernel_ElecRFCut_VdwCSTab_GeomW4W4_VF_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "ReactionField", "ExactCutoff", "CubicSplineTable", "None", "Water4Water4", "", "PotentialAndForce" },
+     { nb_kernel_ElecRFCut_VdwCSTab_GeomW4W4_F_sparc64_hpc_ace_double, "nb_kernel_ElecRFCut_VdwCSTab_GeomW4W4_F_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "ReactionField", "ExactCutoff", "CubicSplineTable", "None", "Water4Water4", "", "Force" },
+     { nb_kernel_ElecRF_VdwLJ_GeomP1P1_VF_sparc64_hpc_ace_double, "nb_kernel_ElecRF_VdwLJ_GeomP1P1_VF_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "ReactionField", "None", "LennardJones", "None", "ParticleParticle", "", "PotentialAndForce" },
+     { nb_kernel_ElecRF_VdwLJ_GeomP1P1_F_sparc64_hpc_ace_double, "nb_kernel_ElecRF_VdwLJ_GeomP1P1_F_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "ReactionField", "None", "LennardJones", "None", "ParticleParticle", "", "Force" },
+     { nb_kernel_ElecRF_VdwLJ_GeomW3P1_VF_sparc64_hpc_ace_double, "nb_kernel_ElecRF_VdwLJ_GeomW3P1_VF_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "ReactionField", "None", "LennardJones", "None", "Water3Particle", "", "PotentialAndForce" },
+     { nb_kernel_ElecRF_VdwLJ_GeomW3P1_F_sparc64_hpc_ace_double, "nb_kernel_ElecRF_VdwLJ_GeomW3P1_F_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "ReactionField", "None", "LennardJones", "None", "Water3Particle", "", "Force" },
+     { nb_kernel_ElecRF_VdwLJ_GeomW3W3_VF_sparc64_hpc_ace_double, "nb_kernel_ElecRF_VdwLJ_GeomW3W3_VF_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "ReactionField", "None", "LennardJones", "None", "Water3Water3", "", "PotentialAndForce" },
+     { nb_kernel_ElecRF_VdwLJ_GeomW3W3_F_sparc64_hpc_ace_double, "nb_kernel_ElecRF_VdwLJ_GeomW3W3_F_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "ReactionField", "None", "LennardJones", "None", "Water3Water3", "", "Force" },
+     { nb_kernel_ElecRF_VdwLJ_GeomW4P1_VF_sparc64_hpc_ace_double, "nb_kernel_ElecRF_VdwLJ_GeomW4P1_VF_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "ReactionField", "None", "LennardJones", "None", "Water4Particle", "", "PotentialAndForce" },
+     { nb_kernel_ElecRF_VdwLJ_GeomW4P1_F_sparc64_hpc_ace_double, "nb_kernel_ElecRF_VdwLJ_GeomW4P1_F_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "ReactionField", "None", "LennardJones", "None", "Water4Particle", "", "Force" },
+     { nb_kernel_ElecRF_VdwLJ_GeomW4W4_VF_sparc64_hpc_ace_double, "nb_kernel_ElecRF_VdwLJ_GeomW4W4_VF_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "ReactionField", "None", "LennardJones", "None", "Water4Water4", "", "PotentialAndForce" },
+     { nb_kernel_ElecRF_VdwLJ_GeomW4W4_F_sparc64_hpc_ace_double, "nb_kernel_ElecRF_VdwLJ_GeomW4W4_F_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "ReactionField", "None", "LennardJones", "None", "Water4Water4", "", "Force" },
+     { nb_kernel_ElecRF_VdwNone_GeomP1P1_VF_sparc64_hpc_ace_double, "nb_kernel_ElecRF_VdwNone_GeomP1P1_VF_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "ReactionField", "None", "None", "None", "ParticleParticle", "", "PotentialAndForce" },
+     { nb_kernel_ElecRF_VdwNone_GeomP1P1_F_sparc64_hpc_ace_double, "nb_kernel_ElecRF_VdwNone_GeomP1P1_F_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "ReactionField", "None", "None", "None", "ParticleParticle", "", "Force" },
+     { nb_kernel_ElecRF_VdwNone_GeomW3P1_VF_sparc64_hpc_ace_double, "nb_kernel_ElecRF_VdwNone_GeomW3P1_VF_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "ReactionField", "None", "None", "None", "Water3Particle", "", "PotentialAndForce" },
+     { nb_kernel_ElecRF_VdwNone_GeomW3P1_F_sparc64_hpc_ace_double, "nb_kernel_ElecRF_VdwNone_GeomW3P1_F_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "ReactionField", "None", "None", "None", "Water3Particle", "", "Force" },
+     { nb_kernel_ElecRF_VdwNone_GeomW3W3_VF_sparc64_hpc_ace_double, "nb_kernel_ElecRF_VdwNone_GeomW3W3_VF_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "ReactionField", "None", "None", "None", "Water3Water3", "", "PotentialAndForce" },
+     { nb_kernel_ElecRF_VdwNone_GeomW3W3_F_sparc64_hpc_ace_double, "nb_kernel_ElecRF_VdwNone_GeomW3W3_F_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "ReactionField", "None", "None", "None", "Water3Water3", "", "Force" },
+     { nb_kernel_ElecRF_VdwNone_GeomW4P1_VF_sparc64_hpc_ace_double, "nb_kernel_ElecRF_VdwNone_GeomW4P1_VF_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "ReactionField", "None", "None", "None", "Water4Particle", "", "PotentialAndForce" },
+     { nb_kernel_ElecRF_VdwNone_GeomW4P1_F_sparc64_hpc_ace_double, "nb_kernel_ElecRF_VdwNone_GeomW4P1_F_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "ReactionField", "None", "None", "None", "Water4Particle", "", "Force" },
+     { nb_kernel_ElecRF_VdwNone_GeomW4W4_VF_sparc64_hpc_ace_double, "nb_kernel_ElecRF_VdwNone_GeomW4W4_VF_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "ReactionField", "None", "None", "None", "Water4Water4", "", "PotentialAndForce" },
+     { nb_kernel_ElecRF_VdwNone_GeomW4W4_F_sparc64_hpc_ace_double, "nb_kernel_ElecRF_VdwNone_GeomW4W4_F_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "ReactionField", "None", "None", "None", "Water4Water4", "", "Force" },
+     { nb_kernel_ElecRF_VdwCSTab_GeomP1P1_VF_sparc64_hpc_ace_double, "nb_kernel_ElecRF_VdwCSTab_GeomP1P1_VF_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "ReactionField", "None", "CubicSplineTable", "None", "ParticleParticle", "", "PotentialAndForce" },
+     { nb_kernel_ElecRF_VdwCSTab_GeomP1P1_F_sparc64_hpc_ace_double, "nb_kernel_ElecRF_VdwCSTab_GeomP1P1_F_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "ReactionField", "None", "CubicSplineTable", "None", "ParticleParticle", "", "Force" },
+     { nb_kernel_ElecRF_VdwCSTab_GeomW3P1_VF_sparc64_hpc_ace_double, "nb_kernel_ElecRF_VdwCSTab_GeomW3P1_VF_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "ReactionField", "None", "CubicSplineTable", "None", "Water3Particle", "", "PotentialAndForce" },
+     { nb_kernel_ElecRF_VdwCSTab_GeomW3P1_F_sparc64_hpc_ace_double, "nb_kernel_ElecRF_VdwCSTab_GeomW3P1_F_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "ReactionField", "None", "CubicSplineTable", "None", "Water3Particle", "", "Force" },
+     { nb_kernel_ElecRF_VdwCSTab_GeomW3W3_VF_sparc64_hpc_ace_double, "nb_kernel_ElecRF_VdwCSTab_GeomW3W3_VF_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "ReactionField", "None", "CubicSplineTable", "None", "Water3Water3", "", "PotentialAndForce" },
+     { nb_kernel_ElecRF_VdwCSTab_GeomW3W3_F_sparc64_hpc_ace_double, "nb_kernel_ElecRF_VdwCSTab_GeomW3W3_F_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "ReactionField", "None", "CubicSplineTable", "None", "Water3Water3", "", "Force" },
+     { nb_kernel_ElecRF_VdwCSTab_GeomW4P1_VF_sparc64_hpc_ace_double, "nb_kernel_ElecRF_VdwCSTab_GeomW4P1_VF_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "ReactionField", "None", "CubicSplineTable", "None", "Water4Particle", "", "PotentialAndForce" },
+     { nb_kernel_ElecRF_VdwCSTab_GeomW4P1_F_sparc64_hpc_ace_double, "nb_kernel_ElecRF_VdwCSTab_GeomW4P1_F_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "ReactionField", "None", "CubicSplineTable", "None", "Water4Particle", "", "Force" },
+     { nb_kernel_ElecRF_VdwCSTab_GeomW4W4_VF_sparc64_hpc_ace_double, "nb_kernel_ElecRF_VdwCSTab_GeomW4W4_VF_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "ReactionField", "None", "CubicSplineTable", "None", "Water4Water4", "", "PotentialAndForce" },
+     { nb_kernel_ElecRF_VdwCSTab_GeomW4W4_F_sparc64_hpc_ace_double, "nb_kernel_ElecRF_VdwCSTab_GeomW4W4_F_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "ReactionField", "None", "CubicSplineTable", "None", "Water4Water4", "", "Force" }
+ };
+ int /* Number of entries in kernellist_sparc64_hpc_ace_double: total array size divided by the size of one element */
+ kernellist_sparc64_hpc_ace_double_size = sizeof(kernellist_sparc64_hpc_ace_double)/sizeof(kernellist_sparc64_hpc_ace_double[0]);
+ #endif
index 0000000000000000000000000000000000000000,afb925b2de25b313319f8c7b7542748e5c376e74..afb925b2de25b313319f8c7b7542748e5c376e74
mode 000000,100644..100644
--- /dev/null
@@@ -1,0 -1,36 +1,36 @@@
+ /*
+  * Note: this file was generated by the Gromacs c kernel generator.
+  *
+  *                This source code is part of
+  *
+  *                 G   R   O   M   A   C   S
+  *
+  * Copyright (c) 2001-2012, The GROMACS Development Team
+  *
+  * Gromacs is a library for molecular simulation and trajectory analysis,
+  * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
+  * a full list of developers and information, check out http://www.gromacs.org
+  *
+  * This program is free software; you can redistribute it and/or modify it under
+  * the terms of the GNU Lesser General Public License as published by the Free
+  * Software Foundation; either version 2 of the License, or (at your option) any
+  * later version.
+  *
+  * To help fund GROMACS development, we humbly ask that you cite
+  * the papers people have written on it - you can find them on the website.
+  */
+ #ifndef nb_kernel_sparc64_hpc_ace_double_h
+ #define nb_kernel_sparc64_hpc_ace_double_h
+ #include "../nb_kernel.h"
+ /* List of kernels for this architecture with metadata about them */
+ extern nb_kernel_info_t
+ kernellist_sparc64_hpc_ace_double[];
+ /* Number of entries in kernellist_sparc64_hpc_ace_double */
+ extern int
+ kernellist_sparc64_hpc_ace_double_size;
+ #endif /* nb_kernel_sparc64_hpc_ace_double_h */
index 0000000000000000000000000000000000000000,1349445a188921494d18cc3ba99c17fea21e83c8..1349445a188921494d18cc3ba99c17fea21e83c8
mode 000000,100644..100644
--- /dev/null
@@@ -1,0 -1,1086 +1,1086 @@@
+ /* ## */
+ /* ## This file is part of the GROMACS molecular simulation package. */
+ /* ## */
+ /* ## Copyright (c) 2012, by the GROMACS development team, led by */
+ /* ## David van der Spoel, Berk Hess, Erik Lindahl, and including many */
+ /* ## others, as listed in the AUTHORS file in the top-level source */
+ /* ## directory and at http://www.gromacs.org. */
+ /* ## */
+ /* ## GROMACS is free software; you can redistribute it and/or */
+ /* ## modify it under the terms of the GNU Lesser General Public License */
+ /* ## as published by the Free Software Foundation; either version 2.1 */
+ /* ## of the License, or (at your option) any later version. */
+ /* ## */
+ /* ## GROMACS is distributed in the hope that it will be useful, */
+ /* ## but WITHOUT ANY WARRANTY; without even the implied warranty of */
+ /* ## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU */
+ /* ## Lesser General Public License for more details. */
+ /* ## */
+ /* ## You should have received a copy of the GNU Lesser General Public */
+ /* ## License along with GROMACS; if not, see */
+ /* ## http://www.gnu.org/licenses, or write to the Free Software Foundation, */
+ /* ## Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA. */
+ /* ## */
+ /* ## If you want to redistribute modifications to GROMACS, please */
+ /* ## consider that scientific software is very special. Version */
+ /* ## control is crucial - bugs must be traceable. We will be happy to */
+ /* ## consider code for inclusion in the official distribution, but */
+ /* ## derived work must not be called official GROMACS. Details are found */
+ /* ## in the README & COPYING files - if they are missing, get the */
+ /* ## official version at http://www.gromacs.org. */
+ /* ## */
+ /* ## To help us fund GROMACS development, we humbly ask that you cite */
+ /* ## the research papers on the package. Check out http://www.gromacs.org. */
+ /* ## */
+ /* #if 0 */
+ #error This file must be processed with the Gromacs pre-preprocessor
+ /* #endif */
+ /* #if INCLUDE_HEADER */
+ #ifdef HAVE_CONFIG_H
+ #include <config.h>
+ #endif
+ #include <math.h>
+ #include "../nb_kernel.h"
+ #include "types/simple.h"
+ #include "vec.h"
+ #include "nrnb.h"
+ #include "kernelutil_sparc64_hpc_ace_double.h"
+ /* #endif */
+ /* ## List of variables set by the generating script:                                    */
+ /* ##                                                                                    */
+ /* ## Settings that apply to the entire kernel:                                          */
+ /* ## KERNEL_ELEC:           String, choice for electrostatic interactions               */
+ /* ## KERNEL_VDW:            String, choice for van der Waals interactions               */
+ /* ## KERNEL_NAME:           String, name of this kernel                                 */
+ /* ## KERNEL_VF:             String telling if we calculate potential, force, or both    */
+ /* ## GEOMETRY_I/GEOMETRY_J: String, name of each geometry, e.g. 'Water3' or '1Particle' */
+ /* ##                                                                                    */
+ /* ## Settings that apply to particles in the outer (I) or inner (J) loops:              */
+ /* ## PARTICLES_I[]/         Arrays with lists of i/j particles to use in kernel. It is  */
+ /* ## PARTICLES_J[]:         just [0] for particle geometry, but can be longer for water */
+ /* ## PARTICLES_ELEC_I[]/    Arrays with lists of i/j particle that have electrostatics  */
+ /* ## PARTICLES_ELEC_J[]:    interactions that should be calculated in this kernel.      */
+ /* ## PARTICLES_VDW_I[]/     Arrays with the list of i/j particle that have VdW          */
+ /* ## PARTICLES_VDW_J[]:     interactions that should be calculated in this kernel.      */
+ /* ##                                                                                    */
+ /* ## Settings for pairs of interactions (e.g. 2nd i particle against 1st j particle)    */
+ /* ## PAIRS_IJ[]:            Array with (i,j) tuples of pairs for which interactions     */
+ /* ##                        should be calculated in this kernel. Zero-charge particles  */
+ /* ##                        do not have interactions with particles without vdw, and    */
+ /* ##                        Vdw-only interactions are not evaluated in a no-vdw-kernel. */
+ /* ## INTERACTION_FLAGS[][]: 2D matrix, dimension e.g. 3*3 for water-water interactions. */
+ /* ##                        For each i-j pair, the element [I][J] is a list of strings  */
+ /* ##                        defining properties/flags of this interaction. Examples     */
+ /* ##                        include 'electrostatics'/'vdw' if that type of interaction  */
+ /* ##                        should be evaluated, 'rsq'/'rinv'/'rinvsq' if those values  */
+ /* ##                        are needed, and 'exactcutoff' or 'shift','switch' to        */
+ /* ##                        decide if the force/potential should be modified. This way  */
+ /* ##                        we only calculate values absolutely needed for each case.   */
+ /* ## Calculate the size and offset for (merged/interleaved) table data */
+ /*
+  * Gromacs nonbonded kernel:   {KERNEL_NAME}
+  * Electrostatics interaction: {KERNEL_ELEC}
+  * VdW interaction:            {KERNEL_VDW}
+  * Geometry:                   {GEOMETRY_I}-{GEOMETRY_J}
+  * Calculate force/pot:        {KERNEL_VF}
+  */
+ void
+ {KERNEL_NAME}
+                     (t_nblist * gmx_restrict                nlist,
+                      rvec * gmx_restrict                    xx,
+                      rvec * gmx_restrict                    ff,
+                      t_forcerec * gmx_restrict              fr,
+                      t_mdatoms * gmx_restrict               mdatoms,
+                      nb_kernel_data_t * gmx_restrict        kernel_data,
+                      t_nrnb * gmx_restrict                  nrnb)
+ {
+     /* ## Not all variables are used for all kernels, but any optimizing compiler fixes that, */
+     /* ## so there is no point in going to extremes to exclude variables that are not needed. */
+     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+      * just 0 for non-waters.
+      * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+      * jnr indices corresponding to data put in the two positions in the SIMD register.
+      */
+     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+     int              jnrA,jnrB;
+     int              j_coord_offsetA,j_coord_offsetB;
+     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+     real             rcutoff_scalar;
+     real             *shiftvec,*fshift,*x,*f;
+     _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+     /* #for I in PARTICLES_I */
+     int              vdwioffset{I};
+     _fjsp_v2r8       ix{I},iy{I},iz{I},fix{I},fiy{I},fiz{I},iq{I},isai{I};
+     /* #endfor */
+     /* #for J in PARTICLES_J */
+     int              vdwjidx{J}A,vdwjidx{J}B;
+     _fjsp_v2r8       jx{J},jy{J},jz{J},fjx{J},fjy{J},fjz{J},jq{J},isaj{J};
+     /* #endfor */
+     /* #for I,J in PAIRS_IJ */
+     _fjsp_v2r8       dx{I}{J},dy{I}{J},dz{I}{J},rsq{I}{J},rinv{I}{J},rinvsq{I}{J},r{I}{J},qq{I}{J},c6_{I}{J},c12_{I}{J};
+     /* #endfor */
+     /* #if KERNEL_ELEC != 'None' */
+     _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+     real             *charge;
+     /* #endif */
+     /* #if 'GeneralizedBorn' in KERNEL_ELEC */
+     _fjsp_v2r8       vgb,fgb,vgbsum,dvdasum,gbscale,gbtabscale,isaprod,gbqqfactor,gbinvepsdiff,dvdaj,gbeps,twogbeps,dvdatmp;
+     _fjsp_v2r8       minushalf = gmx_fjsp_set1_v2r8(-0.5);
+     real             *invsqrta,*dvda,*gbtab;
+     /* #endif */
+     /* #if KERNEL_VDW != 'None' */
+     int              nvdwtype;
+     _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+     int              *vdwtype;
+     real             *vdwparam;
+     _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+     _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+     /* #endif */
+     /* #if 'Table' in KERNEL_ELEC or 'GeneralizedBorn' in KERNEL_ELEC or 'Table' in KERNEL_VDW */
+     _fjsp_v2r8       rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF,twovfeps;
+     real             *vftab;
+     /* #endif */
+     /* #if 'Ewald' in KERNEL_ELEC */
+     _fjsp_v2r8       ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+     real             *ewtab;
+     /* #endif */
+     /* #if 'PotentialSwitch' in [KERNEL_MOD_ELEC,KERNEL_MOD_VDW] */
+     _fjsp_v2r8       rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
+     real             rswitch_scalar,d_scalar;
+     /* #endif */
+     _fjsp_v2r8       itab_tmp;
+     _fjsp_v2r8       dummy_mask,cutoff_mask;
+     _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+     _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+     union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+     x                = xx[0];
+     f                = ff[0];
+     nri              = nlist->nri;
+     iinr             = nlist->iinr;
+     jindex           = nlist->jindex;
+     jjnr             = nlist->jjnr;
+     shiftidx         = nlist->shift;
+     gid              = nlist->gid;
+     shiftvec         = fr->shift_vec[0];
+     fshift           = fr->fshift[0];
+     /* #if KERNEL_ELEC != 'None' */
+     facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+     charge           = mdatoms->chargeA;
+     /*     #if 'ReactionField' in KERNEL_ELEC */
+     krf              = gmx_fjsp_set1_v2r8(fr->ic->k_rf);
+     krf2             = gmx_fjsp_set1_v2r8(fr->ic->k_rf*2.0);
+     crf              = gmx_fjsp_set1_v2r8(fr->ic->c_rf);
+     /*     #endif */
+     /* #endif */
+     /* #if KERNEL_VDW != 'None' */
+     nvdwtype         = fr->ntype;
+     vdwparam         = fr->nbfp;
+     vdwtype          = mdatoms->typeA;
+     /* #endif */
+     /* #if 'Table' in KERNEL_ELEC and 'Table' in KERNEL_VDW */
+     vftab            = kernel_data->table_elec_vdw->data;
+     vftabscale       = gmx_fjsp_set1_v2r8(kernel_data->table_elec_vdw->scale);
+     /* #elif 'Table' in KERNEL_ELEC */
+     vftab            = kernel_data->table_elec->data;
+     vftabscale       = gmx_fjsp_set1_v2r8(kernel_data->table_elec->scale);
+     /* #elif 'Table' in KERNEL_VDW */
+     vftab            = kernel_data->table_vdw->data;
+     vftabscale       = gmx_fjsp_set1_v2r8(kernel_data->table_vdw->scale);
+     /* #endif */
+     /* #if 'Ewald' in KERNEL_ELEC */
+     sh_ewald         = gmx_fjsp_set1_v2r8(fr->ic->sh_ewald);
+     /*     #if KERNEL_VF=='Force' and KERNEL_MOD_ELEC!='PotentialSwitch' */
+     ewtab            = fr->ic->tabq_coul_F;
+     ewtabscale       = gmx_fjsp_set1_v2r8(fr->ic->tabq_scale);
+     ewtabhalfspace   = gmx_fjsp_set1_v2r8(0.5/fr->ic->tabq_scale);
+     /*     #else */
+     ewtab            = fr->ic->tabq_coul_FDV0;
+     ewtabscale       = gmx_fjsp_set1_v2r8(fr->ic->tabq_scale);
+     ewtabhalfspace   = gmx_fjsp_set1_v2r8(0.5/fr->ic->tabq_scale);
+      /*     #endif */
+     /* #endif */
+     /* #if KERNEL_ELEC=='GeneralizedBorn' */
+     invsqrta         = fr->invsqrta;
+     dvda             = fr->dvda;
+     gbtabscale       = gmx_fjsp_set1_v2r8(fr->gbtab.scale);
+     gbtab            = fr->gbtab.data;
+     gbinvepsdiff     = gmx_fjsp_set1_v2r8((1.0/fr->epsilon_r) - (1.0/fr->gb_epsilon_solvent));
+     /* #endif */
+     /* #if 'Water' in GEOMETRY_I */
+     /* Setup water-specific parameters */
+     inr              = nlist->iinr[0];
+     /*     #for I in PARTICLES_ELEC_I */
+     iq{I}              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+{I}]));
+     /*     #endfor */
+     /*     #for I in PARTICLES_VDW_I */
+     vdwioffset{I}      = 2*nvdwtype*vdwtype[inr+{I}];
+     /*     #endfor */
+     /* #endif */
+     /* #if 'Water' in GEOMETRY_J */
+     /*     #for J in PARTICLES_ELEC_J */
+     jq{J}              = gmx_fjsp_set1_v2r8(charge[inr+{J}]);
+     /*     #endfor */
+     /*     #for J in PARTICLES_VDW_J */
+     vdwjidx{J}A        = 2*vdwtype[inr+{J}];
+     /*     #endfor */
+     /*     #for I,J in PAIRS_IJ */
+     /*         #if 'electrostatics' in INTERACTION_FLAGS[I][J] */
+     qq{I}{J}             = _fjsp_mul_v2r8(iq{I},jq{J});
+     /*         #endif */
+     /*         #if 'vdw' in INTERACTION_FLAGS[I][J] */
+     c6_{I}{J}            = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset{I}+vdwjidx{J}A]);
+     c12_{I}{J}           = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset{I}+vdwjidx{J}A+1]);
+     /*         #endif */
+     /*     #endfor */
+     /* #endif */
+     /* #if KERNEL_MOD_ELEC!='None' or KERNEL_MOD_VDW!='None' */
+     /*     #if KERNEL_ELEC!='None' */
+     /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+     rcutoff_scalar   = fr->rcoulomb;
+     /*     #else */
+     rcutoff_scalar   = fr->rvdw;
+     /*     #endif */
+     rcutoff          = gmx_fjsp_set1_v2r8(rcutoff_scalar);
+     rcutoff2         = _fjsp_mul_v2r8(rcutoff,rcutoff);
+     /* #endif */
+     /* #if KERNEL_MOD_VDW=='PotentialShift' */
+     sh_vdw_invrcut6  = gmx_fjsp_set1_v2r8(fr->ic->sh_invrc6);
+     rvdw             = gmx_fjsp_set1_v2r8(fr->rvdw);
+     /* #endif */
+     /* #if 'PotentialSwitch' in [KERNEL_MOD_ELEC,KERNEL_MOD_VDW] */
+     /*     #if KERNEL_MOD_ELEC=='PotentialSwitch'  */
+     rswitch_scalar   = fr->rcoulomb_switch;
+     rswitch          = gmx_fjsp_set1_v2r8(rswitch_scalar);
+     /*     #else */
+     rswitch_scalar   = fr->rvdw_switch;
+     rswitch          = gmx_fjsp_set1_v2r8(rswitch_scalar);
+     /*     #endif */
+     /* Setup switch parameters */
+     d_scalar         = rcutoff_scalar-rswitch_scalar;
+     d                = gmx_fjsp_set1_v2r8(d_scalar);
+     swV3             = gmx_fjsp_set1_v2r8(-10.0/(d_scalar*d_scalar*d_scalar));
+     swV4             = gmx_fjsp_set1_v2r8( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
+     swV5             = gmx_fjsp_set1_v2r8( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
+     /*     #if 'Force' in KERNEL_VF */
+     swF2             = gmx_fjsp_set1_v2r8(-30.0/(d_scalar*d_scalar*d_scalar));
+     swF3             = gmx_fjsp_set1_v2r8( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
+     swF4             = gmx_fjsp_set1_v2r8(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
+     /*     #endif */
+     /* #endif */
+     /* Avoid stupid compiler warnings */
+     jnrA = jnrB = 0;
+     j_coord_offsetA = 0;
+     j_coord_offsetB = 0;
+     /* ## Keep track of the floating point operations we issue for reporting! */
+     /* #define OUTERFLOPS 0 */
+     outeriter        = 0;
+     inneriter        = 0;
+     /* Start outer loop over neighborlists */
+     for(iidx=0; iidx<nri; iidx++)
+     {
+         /* Load shift vector for this list */
+         i_shift_offset   = DIM*shiftidx[iidx];
+         /* Load limits for loop over neighbors */
+         j_index_start    = jindex[iidx];
+         j_index_end      = jindex[iidx+1];
+         /* Get outer coordinate index */
+         inr              = iinr[iidx];
+         i_coord_offset   = DIM*inr;
+         /* Load i particle coords and add shift vector */
+         /* #if GEOMETRY_I == 'Particle' */
+         gmx_fjsp_load_shift_and_1rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
+         /* #elif GEOMETRY_I == 'Water3' */
+         gmx_fjsp_load_shift_and_3rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                  &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+         /* #elif GEOMETRY_I == 'Water4' */
+         /*     #if 0 in PARTICLES_I                 */
+         gmx_fjsp_load_shift_and_4rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                  &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+         /*     #else                                */
+         gmx_fjsp_load_shift_and_3rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset+DIM,
+                                                  &ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+         /*     #endif                               */
+         /* #endif                                   */
+         /* #if 'Force' in KERNEL_VF */
+         /*     #for I in PARTICLES_I */
+         fix{I}             = _fjsp_setzero_v2r8();
+         fiy{I}             = _fjsp_setzero_v2r8();
+         fiz{I}             = _fjsp_setzero_v2r8();
+         /*     #endfor */
+         /* #endif */
+         /* ## For water we already preloaded parameters at the start of the kernel */
+         /* #if not 'Water' in GEOMETRY_I */
+         /* Load parameters for i particles */
+         /*     #for I in PARTICLES_ELEC_I */
+         iq{I}              = _fjsp_mul_v2r8(facel,gmx_fjsp_load1_v2r8(charge+inr+{I}));
+         /*         #define OUTERFLOPS OUTERFLOPS+1 */
+         /*         #if KERNEL_ELEC=='GeneralizedBorn' */
+         isai{I}            = gmx_fjsp_load1_v2r8(invsqrta+inr+{I});
+         /*         #endif */
+         /*     #endfor */
+         /*     #for I in PARTICLES_VDW_I */
+         vdwioffset{I}      = 2*nvdwtype*vdwtype[inr+{I}];
+         /*     #endfor */
+         /* #endif */
+         /* #if 'Potential' in KERNEL_VF */
+         /* Reset potential sums */
+         /*     #if KERNEL_ELEC != 'None' */
+         velecsum         = _fjsp_setzero_v2r8();
+         /*     #endif */
+         /*     #if 'GeneralizedBorn' in KERNEL_ELEC */
+         vgbsum           = _fjsp_setzero_v2r8();
+         /*     #endif */
+         /*     #if KERNEL_VDW != 'None' */
+         vvdwsum          = _fjsp_setzero_v2r8();
+         /*     #endif */
+         /* #endif */
+         /*     #if 'GeneralizedBorn' in KERNEL_ELEC and 'Force' in KERNEL_VF */
+         dvdasum          = _fjsp_setzero_v2r8();
+         /*     #endif */
+         /* #for ROUND in ['Loop','Epilogue'] */
+         /* #if ROUND =='Loop' */
+         /* Start inner kernel loop */
+         for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+         {
+         /* ## First round is normal loop (next statement resets indentation) */
+         /*     #if 0 */
+         }
+         /*     #endif */
+         /* #else */
+         if(jidx<j_index_end)
+         {
+         /* ## Second round is epilogue */
+         /* #endif */
+         /* #define INNERFLOPS 0 */
+             /* #if ROUND =='Loop' */
+             /* Get j neighbor index, and coordinate index */
+             jnrA             = jjnr[jidx];
+             jnrB             = jjnr[jidx+1];
+             j_coord_offsetA  = DIM*jnrA;
+             j_coord_offsetB  = DIM*jnrB;
+             /* load j atom coordinates */
+             /*     #if GEOMETRY_J == 'Particle'             */
+             gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                               &jx0,&jy0,&jz0);
+             /*     #elif GEOMETRY_J == 'Water3'             */
+             gmx_fjsp_load_3rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+             /*     #elif GEOMETRY_J == 'Water4'             */
+             /*         #if 0 in PARTICLES_J                 */
+             gmx_fjsp_load_4rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
+                                               &jy2,&jz2,&jx3,&jy3,&jz3);
+             /*         #else                                */
+             gmx_fjsp_load_3rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA+DIM,x+j_coord_offsetB+DIM,
+                                               &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
+             /*         #endif                               */
+             /*     #endif                                   */
+             /* #else */
+             jnrA             = jjnr[jidx];
+             j_coord_offsetA  = DIM*jnrA;
+             /* load j atom coordinates */
+             /*     #if GEOMETRY_J == 'Particle'             */
+             gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                               &jx0,&jy0,&jz0);
+             /*     #elif GEOMETRY_J == 'Water3'             */
+             gmx_fjsp_load_3rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+             /*     #elif GEOMETRY_J == 'Water4'             */
+             /*         #if 0 in PARTICLES_J                 */
+             gmx_fjsp_load_4rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
+                                               &jy2,&jz2,&jx3,&jy3,&jz3);
+             /*         #else                                */
+             gmx_fjsp_load_3rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA+DIM,
+                                               &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
+             /*         #endif                               */
+             /*     #endif                                   */
+             /* #endif */
+             /* Calculate displacement vector */
+             /* #for I,J in PAIRS_IJ */
+             dx{I}{J}             = _fjsp_sub_v2r8(ix{I},jx{J});
+             dy{I}{J}             = _fjsp_sub_v2r8(iy{I},jy{J});
+             dz{I}{J}             = _fjsp_sub_v2r8(iz{I},jz{J});
+             /*     #define INNERFLOPS INNERFLOPS+3 */
+             /* #endfor */
+             /* Calculate squared distance and things based on it */
+             /* #for I,J in PAIRS_IJ */
+             rsq{I}{J}            = gmx_fjsp_calc_rsq_v2r8(dx{I}{J},dy{I}{J},dz{I}{J});
+             /*     #define INNERFLOPS INNERFLOPS+5 */
+             /* #endfor */
+             /* #for I,J in PAIRS_IJ */
+             /*     #if 'rinv' in INTERACTION_FLAGS[I][J] */
+             rinv{I}{J}           = gmx_fjsp_invsqrt_v2r8(rsq{I}{J});
+             /*         #define INNERFLOPS INNERFLOPS+5 */
+             /*     #endif */
+             /* #endfor */
+             /* #for I,J in PAIRS_IJ */
+             /*     #if 'rinvsq' in INTERACTION_FLAGS[I][J] */
+             /*         # if 'rinv' not in INTERACTION_FLAGS[I][J] */
+             rinvsq{I}{J}         = gmx_fjsp_inv_v2r8(rsq{I}{J});
+             /*             #define INNERFLOPS INNERFLOPS+4 */
+             /*         #else */
+             rinvsq{I}{J}         = _fjsp_mul_v2r8(rinv{I}{J},rinv{I}{J});
+             /*             #define INNERFLOPS INNERFLOPS+1 */
+             /*         #endif */
+             /*     #endif */
+             /* #endfor */
+             /* #if not 'Water' in GEOMETRY_J */
+             /* Load parameters for j particles */
+             /*     #for J in PARTICLES_ELEC_J */
+             /*         #if ROUND =='Loop' */
+             jq{J}              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+{J},charge+jnrB+{J});
+             /*         #else */
+             jq{J}              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+{J});
+             /*         #endif */
+             /*         #if KERNEL_ELEC=='GeneralizedBorn' */
+             /*             #if ROUND =='Loop' */
+             isaj{J}            = gmx_fjsp_load_2real_swizzle_v2r8(invsqrta+jnrA+{J},invsqrta+jnrB+{J});
+             /*             #else */
+             isaj{J}            = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),invsqrta+jnrA+{J});
+             /*             #endif */
+             /*         #endif */
+             /*     #endfor */
+             /*     #for J in PARTICLES_VDW_J */
+             vdwjidx{J}A        = 2*vdwtype[jnrA+{J}];
+             /*         #if ROUND =='Loop' */
+             vdwjidx{J}B        = 2*vdwtype[jnrB+{J}];
+             /*         #endif */
+             /*     #endfor */
+             /* #endif */
+             /* #if 'Force' in KERNEL_VF and not 'Particle' in GEOMETRY_I */
+             /*     #for J in PARTICLES_J */
+             fjx{J}             = _fjsp_setzero_v2r8();
+             fjy{J}             = _fjsp_setzero_v2r8();
+             fjz{J}             = _fjsp_setzero_v2r8();
+             /*     #endfor */
+             /* #endif */
+             /* #for I,J in PAIRS_IJ */
+             /**************************
+              * CALCULATE INTERACTIONS *
+              **************************/
+             /*     #if 'exactcutoff' in INTERACTION_FLAGS[I][J] */
+             /*         ## We always calculate rinv/rinvsq above to enable pipelining in compilers (performance tested on x86) */
+             if (gmx_fjsp_any_lt_v2r8(rsq{I}{J},rcutoff2))
+             {
+                 /*     #if 0    ## this and the next two lines are a hack to maintain auto-indentation in template file */
+             }
+             /*         #endif */
+             /*         #define INNERFLOPS INNERFLOPS+1 */
+             /*     #endif */
+             /*     #if 'r' in INTERACTION_FLAGS[I][J] */
+             r{I}{J}              = _fjsp_mul_v2r8(rsq{I}{J},rinv{I}{J});
+              /*         #define INNERFLOPS INNERFLOPS+1 */
+             /*     #endif */
+             /*     ## For water geometries we already loaded parameters at the start of the kernel */
+             /*     #if not 'Water' in GEOMETRY_J */
+             /* Compute parameters for interactions between i and j atoms */
+             /*         #if 'electrostatics' in INTERACTION_FLAGS[I][J] */
+             qq{I}{J}             = _fjsp_mul_v2r8(iq{I},jq{J});
+             /*             #define INNERFLOPS INNERFLOPS+1 */
+             /*         #endif */
+             /*         #if 'vdw' in INTERACTION_FLAGS[I][J] */
+             /*             #if ROUND == 'Loop' */
+             gmx_fjsp_load_2pair_swizzle_v2r8(vdwparam+vdwioffset{I}+vdwjidx{J}A,
+                                          vdwparam+vdwioffset{I}+vdwjidx{J}B,&c6_{I}{J},&c12_{I}{J});
+             /*             #else */
+             gmx_fjsp_load_1pair_swizzle_v2r8(vdwparam+vdwioffset{I}+vdwjidx{J}A,&c6_{I}{J},&c12_{I}{J});
+             /*             #endif */
+             /*         #endif */
+             /*     #endif */
+             /*     #if 'table' in INTERACTION_FLAGS[I][J] */
+             /* Calculate table index by multiplying r with table scale and truncating to integer */
+             rt               = _fjsp_mul_v2r8(r{I}{J},vftabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+             twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+             _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+             /*         #define INNERFLOPS INNERFLOPS+4                          */
+             /*         #if 'Table' in KERNEL_ELEC and 'Table' in KERNEL_VDW     */
+             /*             ## 3 tables, 4 data per point: multiply index by 12 */
+             vfconv.i[0]     *= 12;
+             vfconv.i[1]     *= 12;
+             /*         #elif 'Table' in KERNEL_ELEC                             */
+             /*             ## 1 table, 4 data per point: multiply index by 4   */
+             vfconv.i[0]     *= 4;
+             vfconv.i[1]     *= 4;
+             /*         #elif 'Table' in KERNEL_VDW                              */
+             /*             ## 2 tables, 4 data per point: multiply index by 8  */
+             vfconv.i[0]     *= 8;
+             vfconv.i[1]     *= 8;
+             /*         #endif                                                   */
+             /*     #endif */
+             /*     ## ELECTROSTATIC INTERACTIONS */
+             /*     #if 'electrostatics' in INTERACTION_FLAGS[I][J] */
+             /*         #if KERNEL_ELEC=='Coulomb' */
+             /* COULOMB ELECTROSTATICS */
+             velec            = _fjsp_mul_v2r8(qq{I}{J},rinv{I}{J});
+             /*             #define INNERFLOPS INNERFLOPS+1 */
+             /*             #if 'Force' in KERNEL_VF */
+             felec            = _fjsp_mul_v2r8(velec,rinvsq{I}{J});
+             /*                 #define INNERFLOPS INNERFLOPS+2 */
+             /*             #endif */
+             /*         #elif KERNEL_ELEC=='ReactionField' */
+             /* REACTION-FIELD ELECTROSTATICS */
+             /*             #if 'Potential' in KERNEL_VF */
+             velec            = _fjsp_mul_v2r8(qq{I}{J},_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq{I}{J},rinv{I}{J}),crf));
+             /*                 #define INNERFLOPS INNERFLOPS+4 */
+             /*             #endif */
+             /*             #if 'Force' in KERNEL_VF */
+             felec            = _fjsp_mul_v2r8(qq{I}{J},_fjsp_msub_v2r8(rinv{I}{J},rinvsq{I}{J},krf2));
+             /*                 #define INNERFLOPS INNERFLOPS+3 */
+             /*             #endif */
+             /*         #elif KERNEL_ELEC=='GeneralizedBorn' */
+             /* GENERALIZED BORN AND COULOMB ELECTROSTATICS */
+             isaprod          = _fjsp_mul_v2r8(isai{I},isaj{J});
+             gbqqfactor       = _fjsp_neg_v2r8(_fjsp_mul_v2r8(qq{I}{J},_fjsp_mul_v2r8(isaprod,gbinvepsdiff)));
+             gbscale          = _fjsp_mul_v2r8(isaprod,gbtabscale);
+             /*             #define INNERFLOPS INNERFLOPS+5 */
+             /* Calculate generalized born table index - this is a separate table from the normal one,
+              * but we use the same procedure by multiplying r with scale and truncating to integer.
+              */
+             rt               = _fjsp_mul_v2r8(r{I}{J},gbscale);
+             itab_tmp         = _fjsp_dtox_v2r8(rt);
+             gbeps            = _fjsp_sub_v2r8(rt,_fjsp_xtod_v2r8(itab_tmp));
+             _fjsp_store_v2r8(&gbconv.simd,itab_tmp);
+             Y                = _fjsp_load_v2r8( gbtab + 4*gbconv.i[0] );
+             /*             #if ROUND == 'Loop' */
+             F                = _fjsp_load_v2r8( gbtab + 4*gbconv.i[1] );
+             /*             #else */
+             F                = _fjsp_setzero_v2r8();
+             /*             #endif */
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( gbtab + 4*gbconv.i[0] +2);
+             /*             #if ROUND == 'Loop' */
+             H                = _fjsp_load_v2r8( gbtab + 4*gbconv.i[1] +2);
+             /*             #else */
+             H                = _fjsp_setzero_v2r8();
+             /*             #endif */
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(gbeps,_fjsp_madd_v2r8(gbeps,H,G),F);
+             VV               = _fjsp_madd_v2r8(gbeps,Fp,Y);
+             vgb              = _fjsp_mul_v2r8(gbqqfactor,VV);
+             /*             #define INNERFLOPS INNERFLOPS+10 */
+             /*             #if 'Force' in KERNEL_VF */
+             twogbeps         = _fjsp_add_v2r8(gbeps,gbeps);
+             FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twogbeps,H,G),gbeps,Fp);
+             fgb              = _fjsp_mul_v2r8(gbqqfactor,_fjsp_mul_v2r8(FF,gbscale));
+             dvdatmp          = _fjsp_mul_v2r8(minushalf,_fjsp_madd_v2r8(fgb,r{I}{J},vgb));
+             dvdasum          = _fjsp_add_v2r8(dvdasum,dvdatmp);
+             /*             #if ROUND == 'Loop' */
+             gmx_fjsp_increment_2real_swizzle_v2r8(dvda+jnrA,dvda+jnrB,_fjsp_mul_v2r8(dvdatmp,_fjsp_mul_v2r8(isaj{J},isaj{J})));
+             /*             #else */
+             gmx_fjsp_increment_1real_v2r8(dvda+jnrA,_fjsp_mul_v2r8(dvdatmp,_fjsp_mul_v2r8(isaj{J},isaj{J})));
+             /*             #endif */
+             /*                 #define INNERFLOPS INNERFLOPS+13 */
+             /*             #endif */
+             velec            = _fjsp_mul_v2r8(qq{I}{J},rinv{I}{J});
+             /*                 #define INNERFLOPS INNERFLOPS+1 */
+             /*             #if 'Force' in KERNEL_VF */
+             felec            = _fjsp_mul_v2r8(_fjsp_msub_v2r8(velec,rinv{I}{J},fgb),rinv{I}{J});
+             /*                 #define INNERFLOPS INNERFLOPS+3 */
+             /*             #endif */
+             /*         #elif KERNEL_ELEC=='Ewald' */
+             /* EWALD ELECTROSTATICS */
+             /* Calculate Ewald table index by multiplying r with scale and truncating to integer */
+             ewrt             = _fjsp_mul_v2r8(r{I}{J},ewtabscale);
+             itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+             eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+             /*             #define INNERFLOPS INNERFLOPS+4 */
+             /*             #if 'Potential' in KERNEL_VF or KERNEL_MOD_ELEC=='PotentialSwitch' */
+             ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+             /*                 #if ROUND == 'Loop' */
+             ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+             /*                 #else */
+             ewtabD           = _fjsp_setzero_v2r8();
+             /*                 #endif */
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+             ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+             /*                 #if ROUND == 'Loop' */
+             ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+             /*                 #else */
+             ewtabFn          = _fjsp_setzero_v2r8();
+             /*                 #endif */
+             GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+             felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+             /*                 #define INNERFLOPS INNERFLOPS+2 */
+             /*                 #if KERNEL_MOD_ELEC=='PotentialShift' */            
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq{I}{J},_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv{I}{J},sh_ewald),velec));
+             /*                     #define INNERFLOPS INNERFLOPS+7 */
+             /*                 #else */
+             velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+             velec            = _fjsp_mul_v2r8(qq{I}{J},_fjsp_sub_v2r8(rinv{I}{J},velec));
+             /*                     #define INNERFLOPS INNERFLOPS+6 */
+             /*                 #endif */
+             /*                 #if 'Force' in KERNEL_VF */
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq{I}{J},rinv{I}{J}),_fjsp_sub_v2r8(rinvsq{I}{J},felec));
+             /*                      #define INNERFLOPS INNERFLOPS+3 */
+             /*                 #endif */
+             /*             #elif KERNEL_VF=='Force' */
+             /*                 #if ROUND == 'Loop' */
+             gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                          &ewtabF,&ewtabFn);
+             /*                 #else */
+             gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+             /*                 #endif */
+             felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+             felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq{I}{J},rinv{I}{J}),_fjsp_sub_v2r8(rinvsq{I}{J},felec));
+             /*                 #define INNERFLOPS INNERFLOPS+7 */
+             /*             #endif */
+             /*         #elif KERNEL_ELEC=='CubicSplineTable' */
+             /* CUBIC SPLINE TABLE ELECTROSTATICS */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             /*             #if ROUND == 'Loop' */
+             F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+             /*             #else */
+             F                = _fjsp_setzero_v2r8();
+             /*             #endif */
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+             /*             #if ROUND == 'Loop' */
+             H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+             /*             #else */
+             H                = _fjsp_setzero_v2r8();
+             /*             #endif */
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+             /*             #define INNERFLOPS INNERFLOPS+4 */
+             /*             #if 'Potential' in KERNEL_VF */
+             VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+             velec            = _fjsp_mul_v2r8(qq{I}{J},VV);
+             /*                 #define INNERFLOPS INNERFLOPS+3 */
+             /*             #endif */
+             /*             #if 'Force' in KERNEL_VF */
+             FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+             felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq{I}{J},FF),_fjsp_mul_v2r8(vftabscale,rinv{I}{J})));
+             /*                 #define INNERFLOPS INNERFLOPS+7 */
+             /*             #endif */
+             /*         #endif */
+             /*         ## End of check for electrostatics interaction forms */
+             /*     #endif */
+             /*     ## END OF ELECTROSTATIC INTERACTION CHECK FOR PAIR I-J */
+             /*     #if 'vdw' in INTERACTION_FLAGS[I][J] */
+             /*         #if KERNEL_VDW=='LennardJones' */
+             /* LENNARD-JONES DISPERSION/REPULSION */
+             rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq{I}{J},rinvsq{I}{J}),rinvsq{I}{J});
+             /*             #define INNERFLOPS INNERFLOPS+2 */
+             /*             #if 'Potential' in KERNEL_VF or KERNEL_MOD_VDW=='PotentialSwitch' */
+             vvdw6            = _fjsp_mul_v2r8(c6_{I}{J},rinvsix);
+             vvdw12           = _fjsp_mul_v2r8(c12_{I}{J},_fjsp_mul_v2r8(rinvsix,rinvsix));
+             /*                 #define INNERFLOPS INNERFLOPS+3 */
+             /*                 #if KERNEL_MOD_VDW=='PotentialShift' */
+             vvdw             = _fjsp_msub_v2r8(_fjsp_nmsub_v2r8(c12_{I}{J},_fjsp_mul_v2r8(sh_vdw_invrcut6,sh_vdw_invrcut6),vvdw12),one_twelfth,
+                                            _fjsp_mul_v2r8(_fjsp_nmsub_v2r8( c6_{I}{J},sh_vdw_invrcut6,vvdw6),one_sixth));
+             /*                     #define INNERFLOPS INNERFLOPS+8 */
+             /*                 #else */
+             vvdw             = _fjsp_msub_v2r8( vvdw12,one_twelfth, _fjsp_mul_v2r8(vvdw6,one_sixth) );
+             /*                     #define INNERFLOPS INNERFLOPS+3 */
+             /*                 #endif */
+             /*                 ## Check for force inside potential check, i.e. this means we already did the potential part */
+             /*                 #if 'Force' in KERNEL_VF */
+             fvdw             = _fjsp_mul_v2r8(_fjsp_sub_v2r8(vvdw12,vvdw6),rinvsq{I}{J});
+             /*                     #define INNERFLOPS INNERFLOPS+2 */
+             /*                 #endif */
+             /*             #elif KERNEL_VF=='Force' */
+             /*                 ## Force-only LennardJones makes it possible to save 1 flop (they do add up...) */
+             fvdw             = _fjsp_mul_v2r8(_fjsp_msub_v2r8(c12_{I}{J},rinvsix,c6_{I}{J}),_fjsp_mul_v2r8(rinvsix,rinvsq{I}{J}));
+             /*                 #define INNERFLOPS INNERFLOPS+4 */
+             /*             #endif */
+             /*         #elif KERNEL_VDW=='CubicSplineTable' */
+             /* CUBIC SPLINE TABLE DISPERSION */
+             /*             #if 'Table' in KERNEL_ELEC */
+             vfconv.i[0]       += 4;
+             vfconv.i[1]       += 4;
+             /*             #endif                     */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+             /*             #if ROUND == 'Loop' */
+             F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+             /*             #else */
+             F                = _fjsp_setzero_v2r8();
+             /*             #endif */
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 2 );
+             /*             #if ROUND == 'Loop' */
+             H                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 2 );
+             /*             #else */
+             H                = _fjsp_setzero_v2r8();
+             /*             #endif */
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+             /*             #define INNERFLOPS INNERFLOPS+4 */
+             /*             #if 'Potential' in KERNEL_VF */
+             VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+             vvdw6            = _fjsp_mul_v2r8(c6_{I}{J},VV);
+             /*                 #define INNERFLOPS INNERFLOPS+3 */
+             /*             #endif */
+             /*             #if 'Force' in KERNEL_VF */
+             FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+             fvdw6            = _fjsp_mul_v2r8(c6_{I}{J},FF);
+             /*                 #define INNERFLOPS INNERFLOPS+4 */
+             /*             #endif */
+             /* CUBIC SPLINE TABLE REPULSION */
+             Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 4 );
+             /*             #if ROUND == 'Loop' */
+             F                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 4 );
+             /*             #else */
+             F                = _fjsp_setzero_v2r8();
+             /*             #endif */
+             GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+             G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 6 );
+             /*             #if ROUND == 'Loop' */
+             H                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 6 );
+             /*             #else */
+             H                = _fjsp_setzero_v2r8();
+             /*             #endif */
+             GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+             Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+             /*             #define INNERFLOPS INNERFLOPS+4 */
+             /*             #if 'Potential' in KERNEL_VF */
+             VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+             vvdw12           = _fjsp_mul_v2r8(c12_{I}{J},VV);
+             /*                 #define INNERFLOPS INNERFLOPS+3 */
+             /*             #endif */
+             /*             #if 'Force' in KERNEL_VF */
+             FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+             fvdw12           = _fjsp_mul_v2r8(c12_{I}{J},FF);
+             /*                 #define INNERFLOPS INNERFLOPS+5 */
+             /*             #endif */
+             /*             #if 'Potential' in KERNEL_VF */
+             vvdw             = _fjsp_add_v2r8(vvdw12,vvdw6);
+             /*                 #define INNERFLOPS INNERFLOPS+1 */
+             /*             #endif */
+             /*             #if 'Force' in KERNEL_VF */
+             fvdw             = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_add_v2r8(fvdw6,fvdw12),_fjsp_mul_v2r8(vftabscale,rinv{I}{J})));
+             /*                 #define INNERFLOPS INNERFLOPS+4 */
+             /*             #endif */
+             /*         #endif */
+             /*         ## End of check for vdw interaction forms */
+             /*     #endif */
+             /*     ## END OF VDW INTERACTION CHECK FOR PAIR I-J */
+             /*     #if 'switch' in INTERACTION_FLAGS[I][J] */
+             d                = _fjsp_sub_v2r8(r{I}{J},rswitch);
+             d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+             d2               = _fjsp_mul_v2r8(d,d);
+             sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+             /*         #define INNERFLOPS INNERFLOPS+10 */
+             /*         #if 'Force' in KERNEL_VF */
+             dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+             /*             #define INNERFLOPS INNERFLOPS+5 */
+             /*         #endif */
+             /* Evaluate switch function */
+             /*         #if 'Force' in KERNEL_VF */
+             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+             /*             #if 'electrostatics' in INTERACTION_FLAGS[I][J] and KERNEL_MOD_ELEC=='PotentialSwitch' */
+             felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv{I}{J},_fjsp_mul_v2r8(velec,dsw)) );
+             /*                 #define INNERFLOPS INNERFLOPS+4 */
+             /*             #endif */
+             /*             #if 'vdw' in INTERACTION_FLAGS[I][J] and KERNEL_MOD_VDW=='PotentialSwitch' */
+             fvdw             = _fjsp_msub_v2r8( fvdw,sw , _fjsp_mul_v2r8(rinv{I}{J},_fjsp_mul_v2r8(vvdw,dsw)) );
+             /*                 #define INNERFLOPS INNERFLOPS+4 */
+             /*             #endif */
+             /*         #endif */
+             /*         #if 'Potential' in KERNEL_VF */
+             /*             #if 'electrostatics' in INTERACTION_FLAGS[I][J] and KERNEL_MOD_ELEC=='PotentialSwitch' */
+             velec            = _fjsp_mul_v2r8(velec,sw);
+             /*                 #define INNERFLOPS INNERFLOPS+1 */
+             /*             #endif */
+             /*             #if 'vdw' in INTERACTION_FLAGS[I][J] and KERNEL_MOD_VDW=='PotentialSwitch' */
+             vvdw             = _fjsp_mul_v2r8(vvdw,sw);
+             /*                 #define INNERFLOPS INNERFLOPS+1 */
+             /*             #endif */
+             /*         #endif */
+             /*     #endif */
+             /*     #if 'exactcutoff' in INTERACTION_FLAGS[I][J] */
+             cutoff_mask      = _fjsp_cmplt_v2r8(rsq{I}{J},rcutoff2);
+             /*         #define INNERFLOPS INNERFLOPS+1 */
+             /*     #endif */
+             /*     #if 'Potential' in KERNEL_VF */
+             /* Update potential sum for this i atom from the interaction with this j atom. */
+             /*         #if 'electrostatics' in INTERACTION_FLAGS[I][J] */
+             /*             #if 'exactcutoff' in INTERACTION_FLAGS[I][J] */
+             velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+             /*                 #define INNERFLOPS INNERFLOPS+1 */
+             /*             #endif                                       */
+             /*             #if ROUND == 'Epilogue' */
+             velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+             /*             #endif */
+             velecsum         = _fjsp_add_v2r8(velecsum,velec);
+             /*             #define INNERFLOPS INNERFLOPS+1 */
+             /*             #if KERNEL_ELEC=='GeneralizedBorn' */
+             /*             #if 'exactcutoff' in INTERACTION_FLAGS[I][J] */
+             vgb              = _fjsp_and_v2r8(vgb,cutoff_mask);
+             /*                 #define INNERFLOPS INNERFLOPS+1 */
+             /*             #endif                                       */
+             /*             #if ROUND == 'Epilogue' */
+             vgb              = _fjsp_unpacklo_v2r8(vgb,_fjsp_setzero_v2r8());
+             /*             #endif */
+             vgbsum           = _fjsp_add_v2r8(vgbsum,vgb);
+             /*                 #define INNERFLOPS INNERFLOPS+1 */
+             /*             #endif */
+             /*         #endif */
+             /*         #if 'vdw' in INTERACTION_FLAGS[I][J] */
+             /*             #if 'exactcutoff' in INTERACTION_FLAGS[I][J] */
+             vvdw             = _fjsp_and_v2r8(vvdw,cutoff_mask);
+             /*                 #define INNERFLOPS INNERFLOPS+1 */
+             /*             #endif                                       */
+             /*             #if ROUND == 'Epilogue' */
+             vvdw             = _fjsp_unpacklo_v2r8(vvdw,_fjsp_setzero_v2r8());
+             /*             #endif */
+             vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+             /*             #define INNERFLOPS INNERFLOPS+1 */
+             /*         #endif */
+             /*     #endif */
+             /*     #if 'Force' in KERNEL_VF */
+             /*         #if 'electrostatics' in INTERACTION_FLAGS[I][J] and 'vdw' in INTERACTION_FLAGS[I][J] */
+             fscal            = _fjsp_add_v2r8(felec,fvdw);
+             /*             #define INNERFLOPS INNERFLOPS+1 */
+             /*         #elif 'electrostatics' in INTERACTION_FLAGS[I][J] */
+             fscal            = felec;
+             /*         #elif 'vdw' in INTERACTION_FLAGS[I][J] */
+             fscal            = fvdw;
+             /*        #endif */
+             /*             #if 'exactcutoff' in INTERACTION_FLAGS[I][J] */
+             fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+             /*                 #define INNERFLOPS INNERFLOPS+1 */
+             /*             #endif                                       */
+             /*             #if ROUND == 'Epilogue' */
+             fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+             /*             #endif */
+             /* ## Construction of vectorial force built into FMA instructions now */
+             /* #define INNERFLOPS INNERFLOPS+3      */
+             
+             /* Update vectorial force */
+             fix{I}             = _fjsp_madd_v2r8(dx{I}{J},fscal,fix{I});
+             fiy{I}             = _fjsp_madd_v2r8(dy{I}{J},fscal,fiy{I});
+             fiz{I}             = _fjsp_madd_v2r8(dz{I}{J},fscal,fiz{I});
+             /*             #define INNERFLOPS INNERFLOPS+6 */
+             
+             /* #if GEOMETRY_I == 'Particle'             */
+             /*     #if ROUND == 'Loop' */
+             gmx_fjsp_decrement_fma_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fscal,dx{I}{J},dy{I}{J},dz{I}{J});
+             /*     #else */
+             gmx_fjsp_decrement_fma_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fscal,dx{I}{J},dy{I}{J},dz{I}{J});
+             /*     #endif */
+             /*     #define INNERFLOPS INNERFLOPS+3      */
+             /* #else                                    */
+             fjx{J}             = _fjsp_madd_v2r8(dx{I}{J},fscal,fjx{J});
+             fjy{J}             = _fjsp_madd_v2r8(dy{I}{J},fscal,fjy{J});
+             fjz{J}             = _fjsp_madd_v2r8(dz{I}{J},fscal,fjz{J});
+             /*     #define INNERFLOPS INNERFLOPS+3      */
+             /* #endif                                   */
+             /*     #endif */
+             /*     #if 'exactcutoff' in INTERACTION_FLAGS[I][J] */
+             /*         #if 0    ## This and next two lines is a hack to maintain indentation in template file */
+             {
+                 /*     #endif */
+             }
+             /*     #endif */
+             /*    ## End of check for the interaction being outside the cutoff */
+             /* #endfor */
+             /* ## End of loop over i-j interaction pairs */
+             /* #if 'Water' in GEOMETRY_I and GEOMETRY_J == 'Particle' */
+             /*     #if ROUND == 'Loop' */
+             gmx_fjsp_decrement_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0);
+             /*     #else */
+             gmx_fjsp_decrement_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0);
+             /*     #endif */
+             /*     #define INNERFLOPS INNERFLOPS+3      */
+             /* #elif GEOMETRY_J == 'Water3'             */
+             /*     #if ROUND == 'Loop' */
+             gmx_fjsp_decrement_3rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+             /*     #else */
+             gmx_fjsp_decrement_3rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+             /*     #endif */
+             /*     #define INNERFLOPS INNERFLOPS+9      */
+             /* #elif GEOMETRY_J == 'Water4'             */
+             /*     #if 0 in PARTICLES_J                 */
+             /*         #if ROUND == 'Loop' */
+             gmx_fjsp_decrement_4rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+             /*         #else */
+             gmx_fjsp_decrement_4rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+             /*         #endif */
+             /*         #define INNERFLOPS INNERFLOPS+12 */
+             /*     #else                                */
+             /*         #if ROUND == 'Loop' */
+             gmx_fjsp_decrement_3rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA+DIM,f+j_coord_offsetB+DIM,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+             /*         #else */
+             gmx_fjsp_decrement_3rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA+DIM,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+             /*         #endif */
+             /*         #define INNERFLOPS INNERFLOPS+9  */
+             /*     #endif                               */
+             /* #endif                                   */
+             /* Inner loop uses {INNERFLOPS} flops */
+         }
+         /* #endfor */
+         /* End of innermost loop */
+         /* #if 'Force' in KERNEL_VF */
+         /*     #if GEOMETRY_I == 'Particle'            */
+         gmx_fjsp_update_iforce_1atom_swizzle_v2r8(fix0,fiy0,fiz0,
+                                               f+i_coord_offset,fshift+i_shift_offset);
+         /*         #define OUTERFLOPS OUTERFLOPS+6     */
+         /*     #elif GEOMETRY_I == 'Water3'            */
+         gmx_fjsp_update_iforce_3atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                               f+i_coord_offset,fshift+i_shift_offset);
+         /*         #define OUTERFLOPS OUTERFLOPS+18    */
+         /*     #elif GEOMETRY_I == 'Water4'            */
+         /*         #if 0 in PARTICLES_I                */
+         gmx_fjsp_update_iforce_4atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                               f+i_coord_offset,fshift+i_shift_offset);
+         /*             #define OUTERFLOPS OUTERFLOPS+24    */
+         /*         #else                               */
+         gmx_fjsp_update_iforce_3atom_swizzle_v2r8(fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                               f+i_coord_offset+DIM,fshift+i_shift_offset);
+         /*             #define OUTERFLOPS OUTERFLOPS+18    */
+         /*         #endif                              */
+         /*     #endif                                  */
+         /* #endif                                      */
+         /* #if 'Potential' in KERNEL_VF */
+         ggid                        = gid[iidx];
+         /* Update potential energies */
+         /*     #if KERNEL_ELEC != 'None' */
+         gmx_fjsp_update_1pot_v2r8(velecsum,kernel_data->energygrp_elec+ggid);
+         /*         #define OUTERFLOPS OUTERFLOPS+1 */
+         /*     #endif */
+         /*     #if 'GeneralizedBorn' in KERNEL_ELEC */
+         gmx_fjsp_update_1pot_v2r8(vgbsum,kernel_data->energygrp_polarization+ggid);
+         /*         #define OUTERFLOPS OUTERFLOPS+1 */
+         /*     #endif */
+         /*     #if KERNEL_VDW != 'None' */
+         gmx_fjsp_update_1pot_v2r8(vvdwsum,kernel_data->energygrp_vdw+ggid);
+         /*         #define OUTERFLOPS OUTERFLOPS+1 */
+         /*     #endif */
+         /* #endif */
+         /*     #if 'GeneralizedBorn' in KERNEL_ELEC and 'Force' in KERNEL_VF */
+         dvdasum = _fjsp_mul_v2r8(dvdasum, _fjsp_mul_v2r8(isai{I},isai{I}));
+         gmx_fjsp_update_1pot_v2r8(dvdasum,dvda+inr);
+         /*     #endif */
+         /* Increment number of inner iterations */
+         inneriter                  += j_index_end - j_index_start;
+         /* Outer loop uses {OUTERFLOPS} flops */
+     }
+     /* Increment number of outer iterations */
+     outeriter        += nri;
+     /* Update outer/inner flops */
+     /* ## NB: This is not important, it just affects the flopcount. However, since our preprocessor is */
+     /* ## primitive and replaces aggressively even in strings inside these directives, we need to      */
+     /* ## assemble the main part of the name (containing KERNEL/ELEC/VDW) directly in the source.      */
+     /* #if GEOMETRY_I == 'Water3'            */
+     /*     #define ISUFFIX '_W3'             */
+     /* #elif GEOMETRY_I == 'Water4'          */
+     /*     #define ISUFFIX '_W4'             */
+     /* #else                                 */
+     /*     #define ISUFFIX ''                */
+     /* #endif                                */
+     /* #if GEOMETRY_J == 'Water3'            */
+     /*     #define JSUFFIX 'W3'              */
+     /* #elif GEOMETRY_J == 'Water4'          */
+     /*     #define JSUFFIX 'W4'              */
+     /* #else                                 */
+     /*     #define JSUFFIX ''                */
+     /* #endif                                */
+     /* #if 'PotentialAndForce' in KERNEL_VF  */
+     /*     #define VFSUFFIX  '_VF'           */
+     /* #elif 'Potential' in KERNEL_VF        */
+     /*     #define VFSUFFIX '_V'             */
+     /* #else                                 */
+     /*     #define VFSUFFIX '_F'             */
+     /* #endif                                */
+     /* #if KERNEL_ELEC != 'None' and KERNEL_VDW != 'None' */
+     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW{ISUFFIX}{JSUFFIX}{VFSUFFIX},outeriter*{OUTERFLOPS} + inneriter*{INNERFLOPS});
+     /* #elif KERNEL_ELEC != 'None' */
+     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC{ISUFFIX}{JSUFFIX}{VFSUFFIX},outeriter*{OUTERFLOPS} + inneriter*{INNERFLOPS});
+     /* #else */
+     inc_nrnb(nrnb,eNR_NBKERNEL_VDW{ISUFFIX}{JSUFFIX}{VFSUFFIX},outeriter*{OUTERFLOPS} + inneriter*{INNERFLOPS});
+     /* #endif  */
+ }
index afe5f56351bd72b9186b25cd67aeeb125dcc3599,0000000000000000000000000000000000000000..ac5fe893d4bce8455a49506e2f122680893b7e1f
mode 100644,000000..100644
--- /dev/null
@@@ -1,675 -1,0 +1,685 @@@
 +/* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
 + *
 + *
 + *                This source code is part of
 + *
 + *                 G   R   O   M   A   C   S
 + *
 + *          GROningen MAchine for Chemical Simulations
 + *
 + *                        VERSION 3.2.0
 + * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
 + * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
 + * Copyright (c) 2001-2004, The GROMACS development team,
 + * check out http://www.gromacs.org for more information.
 +
 + * This program is free software; you can redistribute it and/or
 + * modify it under the terms of the GNU General Public License
 + * as published by the Free Software Foundation; either version 2
 + * of the License, or (at your option) any later version.
 + *
 + * If you want to redistribute modifications, please consider that
 + * scientific software is very special. Version control is crucial -
 + * bugs must be traceable. We will be happy to consider code for
 + * inclusion in the official distribution, but derived work must not
 + * be called official GROMACS. Details are found in the README & COPYING
 + * files - if they are missing, get the official version at www.gromacs.org.
 + *
 + * To help us fund GROMACS development, we humbly ask that you cite
 + * the papers on the package - you can find them in the top README file.
 + *
 + * For more info, check our website at http://www.gromacs.org
 + *
 + * And Hey:
 + * GROningen Mixture of Alchemy and Childrens' Stories
 + */
 +#ifdef HAVE_CONFIG_H
 +#include <config.h>
 +#endif
 +
 +#ifdef GMX_THREAD_MPI
 +#include <thread_mpi.h>
 +#endif
 +
 +#include <stdio.h>
 +#include <stdlib.h>
 +#include "typedefs.h"
 +#include "txtdump.h"
 +#include "smalloc.h"
 +#include "ns.h"
 +#include "vec.h"
 +#include "maths.h"
 +#include "macros.h"
 +#include "string2.h"
 +#include "force.h"
 +#include "names.h"
 +#include "main.h"
 +#include "xvgr.h"
 +#include "gmx_fatal.h"
 +#include "physics.h"
 +#include "force.h"
 +#include "bondf.h"
 +#include "nrnb.h"
 +#include "smalloc.h"
 +#include "nonbonded.h"
 +
 +#include "nb_kernel.h"
 +#include "nb_free_energy.h"
 +#include "nb_generic.h"
 +#include "nb_generic_cg.h"
 +#include "nb_generic_adress.h"
 +
 +/* Different default (c) and accelerated interaction-specific kernels */
 +#include "nb_kernel_c/nb_kernel_c.h"
 +
 +#if (defined GMX_CPU_ACCELERATION_X86_SSE2) && !(defined GMX_DOUBLE)
 +#    include "nb_kernel_sse2_single/nb_kernel_sse2_single.h"
 +#endif
 +#if (defined GMX_CPU_ACCELERATION_X86_SSE4_1) && !(defined GMX_DOUBLE)
 +#    include "nb_kernel_sse4_1_single/nb_kernel_sse4_1_single.h"
 +#endif
 +#if (defined GMX_CPU_ACCELERATION_X86_AVX_128_FMA) && !(defined GMX_DOUBLE)
 +#    include "nb_kernel_avx_128_fma_single/nb_kernel_avx_128_fma_single.h"
 +#endif
 +#if (defined GMX_CPU_ACCELERATION_X86_AVX_256) && !(defined GMX_DOUBLE)
 +#    include "nb_kernel_avx_256_single/nb_kernel_avx_256_single.h"
 +#endif
 +#if (defined GMX_CPU_ACCELERATION_X86_SSE2 && defined GMX_DOUBLE)
 +#    include "nb_kernel_sse2_double/nb_kernel_sse2_double.h"
 +#endif
 +#if (defined GMX_CPU_ACCELERATION_X86_SSE4_1 && defined GMX_DOUBLE)
 +#    include "nb_kernel_sse4_1_double/nb_kernel_sse4_1_double.h"
 +#endif
 +#if (defined GMX_CPU_ACCELERATION_X86_AVX_128_FMA && defined GMX_DOUBLE)
 +#    include "nb_kernel_avx_128_fma_double/nb_kernel_avx_128_fma_double.h"
 +#endif
 +#if (defined GMX_CPU_ACCELERATION_X86_AVX_256 && defined GMX_DOUBLE)
 +#    include "nb_kernel_avx_256_double/nb_kernel_avx_256_double.h"
 +#endif
++#if (defined GMX_CPU_ACCELERATION_SPARC64_HPC_ACE && defined GMX_DOUBLE)
++#    include "nb_kernel_sparc64_hpc_ace_double/nb_kernel_sparc64_hpc_ace_double.h"
++#endif
 +
 +
 +#ifdef GMX_THREAD_MPI
 +static tMPI_Thread_mutex_t nonbonded_setup_mutex = TMPI_THREAD_MUTEX_INITIALIZER;
 +#endif
 +static gmx_bool            nonbonded_setup_done  = FALSE;
 +
 +
 +void
 +gmx_nonbonded_setup(FILE *         fplog,
 +                    t_forcerec *   fr,
 +                    gmx_bool       bGenericKernelOnly)
 +{
 +#ifdef GMX_THREAD_MPI
 +    tMPI_Thread_mutex_lock(&nonbonded_setup_mutex);
 +#endif
 +    /* One-time kernel registration: with thread-MPI the mutex above serializes callers, and the flag below lets only the first one do the work. */
 +    if (nonbonded_setup_done == FALSE)
 +    {
 +        if (bGenericKernelOnly == FALSE)
 +        {
 +            /* Add the default C kernels (kernellist_c) to the structure stored statically in nb_kernel.c */
 +            nb_kernel_list_add_kernels(kernellist_c, kernellist_c_size);
 +
 +            if (!(fr != NULL && fr->use_cpu_acceleration == FALSE))
 +            {
 +                /* fr is NULL or acceleration is not disabled: add the interaction-specific kernels of every architecture compiled in */
 +                /* Single precision */
 +#if (defined GMX_CPU_ACCELERATION_X86_SSE2) && !(defined GMX_DOUBLE)
 +                nb_kernel_list_add_kernels(kernellist_sse2_single, kernellist_sse2_single_size);
 +#endif
 +#if (defined GMX_CPU_ACCELERATION_X86_SSE4_1) && !(defined GMX_DOUBLE)
 +                nb_kernel_list_add_kernels(kernellist_sse4_1_single, kernellist_sse4_1_single_size);
 +#endif
 +#if (defined GMX_CPU_ACCELERATION_X86_AVX_128_FMA) && !(defined GMX_DOUBLE)
 +                nb_kernel_list_add_kernels(kernellist_avx_128_fma_single, kernellist_avx_128_fma_single_size);
 +#endif
 +#if (defined GMX_CPU_ACCELERATION_X86_AVX_256) && !(defined GMX_DOUBLE)
 +                nb_kernel_list_add_kernels(kernellist_avx_256_single, kernellist_avx_256_single_size);
 +#endif
 +                /* Double precision */
 +#if (defined GMX_CPU_ACCELERATION_X86_SSE2 && defined GMX_DOUBLE)
 +                nb_kernel_list_add_kernels(kernellist_sse2_double, kernellist_sse2_double_size);
 +#endif
 +#if (defined GMX_CPU_ACCELERATION_X86_SSE4_1 && defined GMX_DOUBLE)
 +                nb_kernel_list_add_kernels(kernellist_sse4_1_double, kernellist_sse4_1_double_size);
 +#endif
 +#if (defined GMX_CPU_ACCELERATION_X86_AVX_128_FMA && defined GMX_DOUBLE)
 +                nb_kernel_list_add_kernels(kernellist_avx_128_fma_double, kernellist_avx_128_fma_double_size);
 +#endif
 +#if (defined GMX_CPU_ACCELERATION_X86_AVX_256 && defined GMX_DOUBLE)
 +                nb_kernel_list_add_kernels(kernellist_avx_256_double, kernellist_avx_256_double_size);
++#endif
++#if (defined GMX_CPU_ACCELERATION_SPARC64_HPC_ACE && defined GMX_DOUBLE)
++                nb_kernel_list_add_kernels(kernellist_sparc64_hpc_ace_double,kernellist_sparc64_hpc_ace_double_size);
 +#endif
 +                ; /* empty statement to avoid a completely empty block */
 +            }
 +        }
 +        /* Create a hash for faster lookups; this runs even when bGenericKernelOnly skipped the registration above */
 +        nb_kernel_list_hash_init();
 +
 +        nonbonded_setup_done = TRUE;
 +    }
 +#ifdef GMX_THREAD_MPI
 +    tMPI_Thread_mutex_unlock(&nonbonded_setup_mutex);
 +#endif
 +}
 +
 +
 +
 +void
 +gmx_nonbonded_set_kernel_pointers(FILE *log, t_nblist *nl)
 +{
 +    const char *     elec;
 +    const char *     elec_mod;
 +    const char *     vdw;
 +    const char *     vdw_mod;
 +    const char *     geom;
 +    const char *     other;
 +    const char *     vf;
 +
 +    struct
 +    {
 +        const char *  arch;
 +        int           simd_padding_width;
 +    }
 +    arch_and_padding[] =
 +    {
 +        /* Architectures in kernel search order, each with the padding width stored in the list. Single precision: */
 +#if (defined GMX_CPU_ACCELERATION_X86_AVX_256) && !(defined GMX_DOUBLE)
 +        { "avx_256_single", 8 },
 +#endif
 +#if (defined GMX_CPU_ACCELERATION_X86_AVX_128_FMA) && !(defined GMX_DOUBLE)
 +        { "avx_128_fma_single", 4 },
 +#endif
 +#if (defined GMX_CPU_ACCELERATION_X86_SSE4_1) && !(defined GMX_DOUBLE)
 +        { "sse4_1_single", 4 },
 +#endif
 +#if (defined GMX_CPU_ACCELERATION_X86_SSE2) && !(defined GMX_DOUBLE)
 +        { "sse2_single", 4 },
 +#endif
 +        /* Double precision */
 +#if (defined GMX_CPU_ACCELERATION_X86_AVX_256 && defined GMX_DOUBLE)
 +        { "avx_256_double", 4 },
 +#endif
 +#if (defined GMX_CPU_ACCELERATION_X86_AVX_128_FMA && defined GMX_DOUBLE)
 +        /* Sic. Double precision 2-way SIMD does not require neighbor list padding,
 +         * since the kernels execute a loop unrolled a factor 2, followed by
 +         * a possible single odd-element epilogue.
 +         */
 +        { "avx_128_fma_double", 1 },
 +#endif
 +#if (defined GMX_CPU_ACCELERATION_X86_SSE2 && defined GMX_DOUBLE)
 +        /* No padding - see comment above */
 +        { "sse2_double", 1 },
 +#endif
 +#if (defined GMX_CPU_ACCELERATION_X86_SSE4_1 && defined GMX_DOUBLE)
 +        /* No padding - see comment above */
 +        { "sse4_1_double", 1 },
++#endif
++#if (defined GMX_CPU_ACCELERATION_SPARC64_HPC_ACE && defined GMX_DOUBLE)
++        /* No padding - see comment above */
++        { "sparc64_hpc_ace_double", 1 },
 +#endif
 +        { "c", 1 },
 +    };
 +    int              narch = asize(arch_and_padding);
 +    int              i;
 +
 +    if (nonbonded_setup_done == FALSE)
 +    {
 +        /* We typically call this setup routine before starting timers,
 +         * but if that has not been done for whatever reason we do it now.
 +         */
 +        gmx_nonbonded_setup(NULL, NULL, FALSE);
 +    }
 +
 +    /* The "other" field of the kernel lookup is not used yet; an empty string is always passed */
 +    other = "";
 +
 +    nl->kernelptr_vf = NULL;
 +    nl->kernelptr_v  = NULL;
 +    nl->kernelptr_f  = NULL;
 +
 +    elec     = gmx_nbkernel_elec_names[nl->ielec];
 +    elec_mod = eintmod_names[nl->ielecmod];
 +    vdw      = gmx_nbkernel_vdw_names[nl->ivdw];
 +    vdw_mod  = eintmod_names[nl->ivdwmod];
 +    geom     = gmx_nblist_geometry_names[nl->igeometry];
 +
 +    if (nl->type == GMX_NBLIST_INTERACTION_ADRESS)
 +    {
 +        nl->kernelptr_vf       = (void *) gmx_nb_generic_adress_kernel;
 +        nl->kernelptr_f        = (void *) gmx_nb_generic_adress_kernel;
 +        nl->simd_padding_width = 1;
 +        return;
 +    }
 +
 +    if (nl->type == GMX_NBLIST_INTERACTION_FREE_ENERGY)
 +    {
 +        nl->kernelptr_vf       = (void *) gmx_nb_free_energy_kernel;
 +        nl->kernelptr_f        = (void *) gmx_nb_free_energy_kernel;
 +        nl->simd_padding_width = 1;
 +    }
 +    else if (!gmx_strcasecmp_min(geom, "CG-CG"))
 +    {
 +        nl->kernelptr_vf       = (void *) gmx_nb_generic_cg_kernel;
 +        nl->kernelptr_f        = (void *) gmx_nb_generic_cg_kernel;
 +        nl->simd_padding_width = 1;
 +    }
 +    else
 +    {
 +        /* Try to find a specific kernel first: walk arch_and_padding[] in order and stop at the first architecture with a match */
 +
 +        for (i = 0; i < narch && nl->kernelptr_vf == NULL; i++)
 +        {
 +            nl->kernelptr_vf       = (void *) nb_kernel_list_findkernel(log, arch_and_padding[i].arch, elec, elec_mod, vdw, vdw_mod, geom, other, "PotentialAndForce");
 +            nl->simd_padding_width = arch_and_padding[i].simd_padding_width;
 +        }
 +        for (i = 0; i < narch && nl->kernelptr_f == NULL; i++)
 +        {
 +            nl->kernelptr_f        = (void *) nb_kernel_list_findkernel(log, arch_and_padding[i].arch, elec, elec_mod, vdw, vdw_mod, geom, other, "Force");
 +            nl->simd_padding_width = arch_and_padding[i].simd_padding_width;
 +
 +            /* If this architecture has no force-only optimized kernel, fall back to its potential & force kernel */
 +            if (nl->kernelptr_f == NULL)
 +            {
 +                nl->kernelptr_f        = (void *) nb_kernel_list_findkernel(NULL, arch_and_padding[i].arch, elec, elec_mod, vdw, vdw_mod, geom, other, "PotentialAndForce");
 +                nl->simd_padding_width = arch_and_padding[i].simd_padding_width;
 +            }
 +        }
 +
 +        /* No specific potential & force kernel was found: give up and use the generic kernel for both pointers */
 +        if (nl->kernelptr_vf == NULL)
 +        {
 +            nl->kernelptr_vf       = (void *) gmx_nb_generic_kernel;
 +            nl->kernelptr_f        = (void *) gmx_nb_generic_kernel;
 +            nl->simd_padding_width = 1;
 +            if (debug)
 +            {
 +                fprintf(debug,
 +                        "WARNING - Slow generic NB kernel used for neighborlist with\n"
 +                        "    Elec: '%s', Modifier: '%s'\n"
 +                        "    Vdw:  '%s', Modifier: '%s'\n"
 +                        "    Geom: '%s', Other: '%s'\n\n",
 +                        elec, elec_mod, vdw, vdw_mod, geom, other);
 +            }
 +        }
 +    }
 +
 +    return;
 +}
 +
 +void do_nonbonded(t_commrec *cr, t_forcerec *fr,
 +                  rvec x[], rvec f_shortrange[], rvec f_longrange[], t_mdatoms *mdatoms, t_blocka *excl,
 +                  gmx_grppairener_t *grppener, rvec box_size,
 +                  t_nrnb *nrnb, real *lambda, real *dvdl,
 +                  int nls, int eNL, int flags)
 +{   /* Call the kernel stored in each selected neighbor list of fr->nblists.
 +     *
 +     * nls >= 0 restricts the work to fr->nblists[nls]; a negative value loops
 +     * over all fr->nnblists entries. eNL >= 0 restricts the work to list index
 +     * eNL inside nlist_sr/nlist_lr; a negative value loops over 0..eNL_NR-1.
 +     * flags selects short-range and/or long-range lists (GMX_NONBONDED_DO_SR,
 +     * GMX_NONBONDED_DO_LR), the potential+force or force-only kernel
 +     * (GMX_NONBONDED_DO_POTENTIAL) and the foreign-lambda mode
 +     * (GMX_NONBONDED_DO_FOREIGNLAMBDA). Short-range forces go to f_shortrange,
 +     * long-range forces to f_longrange.
 +     * cr and box_size are not referenced in this body.
 +     */
 +    t_nblist *        nlist;
 +    int               n, n0, n1, i, i0, i1, sz, range; /* sz is not used below */
 +    t_nblists *       nblists;
 +    nb_kernel_data_t  kernel_data;
 +    nb_kernel_t *     kernelptr = NULL;
 +    rvec *            f;
 +
 +    kernel_data.flags                   = flags;
 +    kernel_data.exclusions              = excl;
 +    kernel_data.lambda                  = lambda;
 +    kernel_data.dvdl                    = dvdl;
 +
 +    if (fr->bAllvsAll) /* nothing is evaluated here when fr->bAllvsAll is set */
 +    {
 +        return;
 +    }
 +
 +    if (eNL >= 0) /* [i0,i1) = range of list indices within nlist_sr/nlist_lr */
 +    {
 +        i0 = eNL;
 +        i1 = i0+1;
 +    }
 +    else
 +    {
 +        i0 = 0;
 +        i1 = eNL_NR;
 +    }
 +
 +    if (nls >= 0) /* [n0,n1) = range of entries in fr->nblists */
 +    {
 +        n0 = nls;
 +        n1 = nls+1;
 +    }
 +    else
 +    {
 +        n0 = 0;
 +        n1 = fr->nnblists;
 +    }
 +
 +    for (n = n0; (n < n1); n++)
 +    {
 +        nblists = &fr->nblists[n];
 +
 +        kernel_data.table_elec              = &nblists->table_elec;
 +        kernel_data.table_vdw               = &nblists->table_vdw;
 +        kernel_data.table_elec_vdw          = &nblists->table_elec_vdw;
 +
 +        for (range = 0; range < 2; range++) /* range 0: short-range, range 1: long-range */
 +        {
 +            /* Are we doing short/long-range? */
 +            if (range == 0)
 +            {
 +                /* Short-range */
 +                if (!(flags & GMX_NONBONDED_DO_SR))
 +                {
 +                    continue;
 +                }
 +                kernel_data.energygrp_elec          = grppener->ener[egCOULSR];
 +                kernel_data.energygrp_vdw           = grppener->ener[fr->bBHAM ? egBHAMSR : egLJSR];
 +                kernel_data.energygrp_polarization  = grppener->ener[egGB];
 +                nlist = nblists->nlist_sr;
 +                f                                   = f_shortrange;
 +            }
 +            else if (range == 1)
 +            {
 +                /* Long-range */
 +                if (!(flags & GMX_NONBONDED_DO_LR))
 +                {
 +                    continue;
 +                }
 +                kernel_data.energygrp_elec          = grppener->ener[egCOULLR];
 +                kernel_data.energygrp_vdw           = grppener->ener[fr->bBHAM ? egBHAMLR : egLJLR];
 +                kernel_data.energygrp_polarization  = grppener->ener[egGB];
 +                nlist = nblists->nlist_lr;
 +                f                                   = f_longrange;
 +            }
 +
 +            for (i = i0; (i < i1); i++)
 +            {
 +                if (nlist[i].nri > 0) /* lists with nri <= 0 are skipped */
 +                {
 +                    if (flags & GMX_NONBONDED_DO_POTENTIAL)
 +                    {
 +                        /* Potential and force */
 +                        kernelptr = (nb_kernel_t *)nlist[i].kernelptr_vf;
 +                    }
 +                    else
 +                    {
 +                        /* Force only, no potential */
 +                        kernelptr = (nb_kernel_t *)nlist[i].kernelptr_f;
 +                    }
 +
 +                    if (nlist[i].type != GMX_NBLIST_INTERACTION_FREE_ENERGY && (flags & GMX_NONBONDED_DO_FOREIGNLAMBDA))
 +                    {
 +                        /* We don't need the non-perturbed interactions */
 +                        continue;
 +                    }
 +                    (*kernelptr)(&(nlist[i]), x, f, fr, mdatoms, &kernel_data, nrnb);
 +                }
 +            }
 +        }
 +    }
 +}
 +
 +static void
 +nb_listed_warning_rlimit(const rvec *x, int ai, int aj, int * global_atom_index, real r, real rlimit)
 +{   /* Report a listed pair (ai, aj) whose distance r is beyond the table limit
 +     * rlimit: always through gmx_warning(), and additionally, with both atom
 +     * positions, to the debug file when debug is set. Atom numbers in the
 +     * messages are obtained with glatnr(global_atom_index, ...).
 +     */
 +    gmx_warning("Listed nonbonded interaction between particles %d and %d\n"
 +                "at distance %.3f which is larger than the table limit %.3f nm.\n\n"
 +                "This is likely either a 1,4 interaction, or a listed interaction inside\n"
 +                "a smaller molecule you are decoupling during a free energy calculation.\n"
 +                "Since interactions at distances beyond the table cannot be computed,\n"
 +                "they are skipped until they are inside the table limit again. You will\n"
 +                "only see this message once, even if it occurs for several interactions.\n\n"
 +                "IMPORTANT: This should not happen in a stable simulation, so there is\n"
 +                "probably something wrong with your system. Only change the table-extension\n"
 +                "distance in the mdp file if you are really sure that is the reason.\n",
 +                glatnr(global_atom_index, ai), glatnr(global_atom_index, aj), r, rlimit);
 +
 +    if (debug) /* extra detail: coordinates of both atoms, then the pair and r */
 +    {
 +        fprintf(debug,
 +                "%8f %8f %8f\n%8f %8f %8f\n1-4 (%d,%d) interaction not within cut-off! r=%g. Ignored\n",
 +                x[ai][XX], x[ai][YY], x[ai][ZZ], x[aj][XX], x[aj][YY], x[aj][ZZ],
 +                glatnr(global_atom_index, ai), glatnr(global_atom_index, aj), r);
 +    }
 +}
 +
 +
 +
 +/* Evaluate one tabulated pair interaction (electrostatics, dispersion and
 + * repulsion) at squared distance r2.
 + *
 + * vftab is read with a stride of 12 reals per table point: entries 0-3 are
 + * used for electrostatics, 4-7 for dispersion and 8-11 for repulsion. The
 + * table point is the truncated value of r*tabscale; no range check is done
 + * here. qq*VVe is stored in *velec and c6*VVd+c12*VVr in *vvdw. The return
 + * value is -(qq*FFe+c6*FFd+c12*FFr)*tabscale*rinv with rinv = gmx_invsqrt(r2).
 + *
 + * This might logically belong better in the nb_generic.c module, but it is only
 + * used in do_nonbonded_listed(), and we want it to be inlined there to avoid an
 + * extra function call for every single pair listed in the topology.
 + */
 +static real
 +nb_evaluate_single(real r2, real tabscale, real *vftab,
 +                   real qq, real c6, real c12, real *velec, real *vvdw)
 +{
 +    real       rinv, r, rtab, eps, eps2, Y, F, Geps, Heps2, Fp, VVe, FFe, VVd, FFd, VVr, FFr, fscal;
 +    int        ntab;
 +
 +    /* Do the tabulated interactions - first table lookup */
 +    rinv             = gmx_invsqrt(r2);
 +    r                = r2*rinv;
 +    rtab             = r*tabscale;
 +    ntab             = rtab; /* real-to-int conversion: integer table point */
 +    eps              = rtab-ntab; /* fractional position past that table point */
 +    eps2             = eps*eps;
 +    ntab             = 12*ntab; /* 12 table entries per point, see indices below */
 +    /* Electrostatics */
 +    Y                = vftab[ntab];
 +    F                = vftab[ntab+1];
 +    Geps             = eps*vftab[ntab+2];
 +    Heps2            = eps2*vftab[ntab+3];
 +    Fp               = F+Geps+Heps2;
 +    VVe              = Y+eps*Fp;
 +    FFe              = Fp+Geps+2.0*Heps2;
 +    /* Dispersion */
 +    Y                = vftab[ntab+4];
 +    F                = vftab[ntab+5];
 +    Geps             = eps*vftab[ntab+6];
 +    Heps2            = eps2*vftab[ntab+7];
 +    Fp               = F+Geps+Heps2;
 +    VVd              = Y+eps*Fp;
 +    FFd              = Fp+Geps+2.0*Heps2;
 +    /* Repulsion */
 +    Y                = vftab[ntab+8];
 +    F                = vftab[ntab+9];
 +    Geps             = eps*vftab[ntab+10];
 +    Heps2            = eps2*vftab[ntab+11];
 +    Fp               = F+Geps+Heps2;
 +    VVr              = Y+eps*Fp;
 +    FFr              = Fp+Geps+2.0*Heps2;
 +
 +    *velec           = qq*VVe;
 +    *vvdw            = c6*VVd+c12*VVr;
 +
 +    fscal            = -(qq*FFe+c6*FFd+c12*FFr)*tabscale*rinv;
 +
 +    return fscal;
 +}
 +
 +
 +real
 +do_nonbonded_listed(int ftype, int nbonds,
 +                    const t_iatom iatoms[], const t_iparams iparams[],
 +                    const rvec x[], rvec f[], rvec fshift[],
 +                    const t_pbc *pbc, const t_graph *g,
 +                    real *lambda, real *dvdl,
 +                    const t_mdatoms *md,
 +                    const t_forcerec *fr, gmx_grppairener_t *grppener,
 +                    int *global_atom_index)
 +{   /* Evaluate the listed pair interactions of type ftype (F_LJ14, F_LJC14_Q
 +     * or F_LJC_PAIRS_NB) using the fr->tab14 table.
 +     *
 +     * iatoms is consumed in groups of three entries (parameter type index,
 +     * atom ai, atom aj) until index nbonds is reached. For every pair the
 +     * energies are added to the energy-group arrays taken from grppener, the
 +     * force is added to f[ai] and subtracted from f[aj], and for a non-central
 +     * shift index the force is also booked in fshift[]. Pairs at or beyond the
 +     * table range fr->tab14.r are skipped after a one-time warning.
 +     * The function always returns 0.0.
 +     */
 +    int              ielec, ivdw; /* not referenced below */
 +    real             qq, c6, c12;
 +    rvec             dx;
 +    ivec             dt;
 +    int              i, j, itype, ai, aj, gid; /* j is not referenced below */
 +    int              fshift_index;
 +    real             r2, rinv; /* rinv is not referenced below */
 +    real             fscal, velec, vvdw;
 +    real *           energygrp_elec;
 +    real *           energygrp_vdw;
 +    static gmx_bool  warned_rlimit = FALSE; /* static: the warning is issued once, not once per call */
 +    /* Free energy stuff */
 +    gmx_bool         bFreeEnergy;
 +    real             LFC[2], LFV[2], DLF[2], lfac_coul[2], lfac_vdw[2], dlfac_coul[2], dlfac_vdw[2];
 +    real             qqB, c6B, c12B, sigma2_def, sigma2_min;
 +
 +
 +    switch (ftype) /* pick the energy-group arrays that receive the pair energies */
 +    {
 +        case F_LJ14:
 +        case F_LJC14_Q:
 +            energygrp_elec = grppener->ener[egCOUL14];
 +            energygrp_vdw  = grppener->ener[egLJ14];
 +            break;
 +        case F_LJC_PAIRS_NB:
 +            energygrp_elec = grppener->ener[egCOULSR];
 +            energygrp_vdw  = grppener->ener[egLJSR];
 +            break;
 +        default:
 +            energygrp_elec = NULL; /* Keep compiler happy */
 +            energygrp_vdw  = NULL; /* Keep compiler happy */
 +            /* NOTE(review): the message names do_nonbonded14, not this function */
 +            gmx_fatal(FARGS, "Unknown function type %d in do_nonbonded14", ftype);
 +            break;
 +    }
 +
 +    if (fr->efep != efepNO)
 +    {
 +        /* Lambda factor for state A=1-lambda and B=lambda */
 +        LFC[0] = 1.0 - lambda[efptCOUL];
 +        LFV[0] = 1.0 - lambda[efptVDW];
 +        LFC[1] = lambda[efptCOUL];
 +        LFV[1] = lambda[efptVDW];
 +
 +        /*derivative of the lambda factor for state A and B */
 +        DLF[0] = -1;
 +        DLF[1] = 1;
 +
 +        /* precalculate */
 +        sigma2_def = pow(fr->sc_sigma6_def, 1.0/3.0);
 +        sigma2_min = pow(fr->sc_sigma6_min, 1.0/3.0);
 +
 +        for (i = 0; i < 2; i++)
 +        {
 +            lfac_coul[i]  = (fr->sc_power == 2 ? (1-LFC[i])*(1-LFC[i]) : (1-LFC[i]));
 +            dlfac_coul[i] = DLF[i]*fr->sc_power/fr->sc_r_power*(fr->sc_power == 2 ? (1-LFC[i]) : 1);
 +            lfac_vdw[i]   = (fr->sc_power == 2 ? (1-LFV[i])*(1-LFV[i]) : (1-LFV[i]));
 +            dlfac_vdw[i]  = DLF[i]*fr->sc_power/fr->sc_r_power*(fr->sc_power == 2 ? (1-LFV[i]) : 1);
 +        }
 +    }
 +    else
 +    {
 +        sigma2_min = sigma2_def = 0;
 +    }
 +
 +    bFreeEnergy = FALSE; /* only the F_LJ14 case below can change this */
 +    for (i = 0; (i < nbonds); ) /* i is advanced by the three reads below */
 +    {
 +        itype = iatoms[i++];
 +        ai    = iatoms[i++];
 +        aj    = iatoms[i++];
 +        gid   = GID(md->cENER[ai], md->cENER[aj], md->nenergrp);
 +
 +        /* Get parameters */
 +        switch (ftype)
 +        {
 +            case F_LJ14:
 +                bFreeEnergy =
 +                    (fr->efep != efepNO &&
 +                     ((md->nPerturbed && (md->bPerturbed[ai] || md->bPerturbed[aj])) ||
 +                      iparams[itype].lj14.c6A != iparams[itype].lj14.c6B ||
 +                      iparams[itype].lj14.c12A != iparams[itype].lj14.c12B));
 +                qq               = md->chargeA[ai]*md->chargeA[aj]*fr->epsfac*fr->fudgeQQ;
 +                c6               = iparams[itype].lj14.c6A;
 +                c12              = iparams[itype].lj14.c12A;
 +                break;
 +            case F_LJC14_Q:
 +                qq               = iparams[itype].ljc14.qi*iparams[itype].ljc14.qj*fr->epsfac*iparams[itype].ljc14.fqq;
 +                c6               = iparams[itype].ljc14.c6;
 +                c12              = iparams[itype].ljc14.c12;
 +                break;
 +            case F_LJC_PAIRS_NB:
 +                qq               = iparams[itype].ljcnb.qi*iparams[itype].ljcnb.qj*fr->epsfac;
 +                c6               = iparams[itype].ljcnb.c6;
 +                c12              = iparams[itype].ljcnb.c12;
 +                break;
 +            default:
 +                /* Cannot happen since we called gmx_fatal() above in this case */
 +                qq = c6 = c12 = 0; /* Keep compiler happy */
 +                break;
 +        }
 +
 +        /* To save flops in the optimized kernels, c6/c12 have 6.0/12.0 derivative prefactors
 +         * included in the general nfbp array now. This means the tables are scaled down by the
 +         * same factor, so when we use the original c6/c12 parameters from iparams[] they must
 +         * be scaled up.
 +         */
 +        c6  *= 6.0;
 +        c12 *= 12.0;
 +
 +        /* Do we need to apply full periodic boundary conditions? */
 +        if (fr->bMolPBC == TRUE)
 +        {
 +            fshift_index = pbc_dx_aiuc(pbc, x[ai], x[aj], dx);
 +        }
 +        else
 +        {
 +            fshift_index = CENTRAL;
 +            rvec_sub(x[ai], x[aj], dx);
 +        }
 +        r2           = norm2(dx);
 +
 +        if (r2 >= fr->tab14.r*fr->tab14.r) /* outside the table: warn once, then skip the pair */
 +        {
 +            if (warned_rlimit == FALSE)
 +            {
 +                nb_listed_warning_rlimit(x, ai, aj, global_atom_index, sqrt(r2), fr->tab14.r);
 +                warned_rlimit = TRUE;
 +            }
 +            continue;
 +        }
 +
 +        if (bFreeEnergy)
 +        {
 +            /* Currently free energy is only supported for F_LJ14, so no need to check for that if we got here */
 +            qqB              = md->chargeB[ai]*md->chargeB[aj]*fr->epsfac*fr->fudgeQQ;
 +            c6B              = iparams[itype].lj14.c6B*6.0;
 +            c12B             = iparams[itype].lj14.c12B*12.0;
 +
 +            fscal            = nb_free_energy_evaluate_single(r2, fr->sc_r_power, fr->sc_alphacoul, fr->sc_alphavdw,
 +                                                              fr->tab14.scale, fr->tab14.data, qq, c6, c12, qqB, c6B, c12B,
 +                                                              LFC, LFV, DLF, lfac_coul, lfac_vdw, dlfac_coul, dlfac_vdw,
 +                                                              fr->sc_sigma6_def, fr->sc_sigma6_min, sigma2_def, sigma2_min, &velec, &vvdw, dvdl);
 +        }
 +        else
 +        {
 +            /* Evaluate tabulated interaction without free energy */
 +            fscal            = nb_evaluate_single(r2, fr->tab14.scale, fr->tab14.data, qq, c6, c12, &velec, &vvdw);
 +        }
 +
 +        energygrp_elec[gid]  += velec;
 +        energygrp_vdw[gid]   += vvdw;
 +        svmul(fscal, dx, dx); /* dx now holds the force vector for this pair */
 +
 +        /* Add the forces */
 +        rvec_inc(f[ai], dx);
 +        rvec_dec(f[aj], dx);
 +
 +        if (g)
 +        {
 +            /* Correct the shift forces using the graph */
 +            ivec_sub(SHIFT_IVEC(g, ai), SHIFT_IVEC(g, aj), dt);
 +            fshift_index = IVEC2IS(dt);
 +        }
 +        if (fshift_index != CENTRAL)
 +        {
 +            rvec_inc(fshift[fshift_index], dx);
 +            rvec_dec(fshift[CENTRAL], dx);
 +        }
 +    }
 +    return 0.0;
 +}
index 06b389d5cedcf9731ae7bbf0c8091bb04fab9fbf,6aa852f00e2667c81970a6a09dd36c426ba91420..6aa852f00e2667c81970a6a09dd36c426ba91420
@@@ -10,7 -10,7 +10,7 @@@ set(THREAD_MPI_LIB_SOURC
      errhandler.c    p2p_send_recv.c type.c
      event.c         p2p_wait.c      tmpi_malloc.c
      gather.c        profile.c
-     group.c         numa_malloc.c   )
+     group.c         numa_malloc.c   atomic.c)
  
  
  if (THREAD_PTHREADS)
index 0000000000000000000000000000000000000000,b9f5e0e2eea6712904d4dffe8acf1c30e8d8bf6e..b9f5e0e2eea6712904d4dffe8acf1c30e8d8bf6e
mode 000000,100644..100644
--- /dev/null
@@@ -1,0 -1,234 +1,234 @@@
+ /*
+    This source code file is part of thread_mpi.
+    Written by Sander Pronk, Erik Lindahl, and possibly others.
+    Copyright (c) 2009, Sander Pronk, Erik Lindahl.
+    All rights reserved.
+    Redistribution and use in source and binary forms, with or without
+    modification, are permitted provided that the following conditions are met:
+    1) Redistributions of source code must retain the above copyright
+    notice, this list of conditions and the following disclaimer.
+    2) Redistributions in binary form must reproduce the above copyright
+    notice, this list of conditions and the following disclaimer in the
+    documentation and/or other materials provided with the distribution.
+    3) Neither the name of the copyright holders nor the
+    names of its contributors may be used to endorse or promote products
+    derived from this software without specific prior written permission.
+    THIS SOFTWARE IS PROVIDED BY US ''AS IS'' AND ANY
+    EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+    WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+    DISCLAIMED. IN NO EVENT SHALL WE BE LIABLE FOR ANY
+    DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+    (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+    LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+    ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+    (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+    SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+    If you want to redistribute modifications, please consider that
+    scientific software is very special. Version control is crucial -
+    bugs must be traceable. We will be happy to consider code for
+    inclusion in the official distribution, but derived work should not
+    be called official thread_mpi. Details are found in the README & COPYING
+    files.
+  */
+ #include "impl.h"
+ /* This file is only needed when no intrinsic atomic operations are present. */
+ #ifdef TMPI_NO_ATOMICS
+ /** System mutex used for locking to guarantee atomicity */
+ static tMPI_Thread_mutex_t tMPI_Atomic_mutex = TMPI_THREAD_MUTEX_INITIALIZER;
+ struct tMPI_Spinlock /* Mutex-backed spinlock object, compiled only when TMPI_NO_ATOMICS is defined. */
+ {
+     tMPI_Thread_mutex_t *lock; /* malloc'ed in tMPI_Spinlock_init() / tMPI_Spinlock_init_once() */
+ };
+ int tMPI_Atomic_get(const tMPI_Atomic_t *a) /* Return a->value, read while holding tMPI_Atomic_mutex. */
+ {
+     int ret;
+     tMPI_Thread_mutex_lock(&tMPI_Atomic_mutex);
+     ret = a->value;
+     tMPI_Thread_mutex_unlock(&tMPI_Atomic_mutex);
+     return ret;
+ }
+ void tMPI_Atomic_set(tMPI_Atomic_t *a, int value) /* Store value in a->value while holding tMPI_Atomic_mutex. */
+ {
+     tMPI_Thread_mutex_lock(&tMPI_Atomic_mutex);
+     a->value = value;
+     tMPI_Thread_mutex_unlock(&tMPI_Atomic_mutex);
+ }
+ void* tMPI_Atomic_ptr_get(const tMPI_Atomic_ptr_t *a) /* Return the pointer a->value, read while holding tMPI_Atomic_mutex. */
+ {
+     void* ret;
+     tMPI_Thread_mutex_lock(&tMPI_Atomic_mutex);
+     ret = a->value;
+     tMPI_Thread_mutex_unlock(&tMPI_Atomic_mutex);
+     return ret;
+ }
+ void tMPI_Atomic_ptr_set(tMPI_Atomic_ptr_t *a, void *value) /* Store the pointer value in a->value while holding tMPI_Atomic_mutex. */
+ {
+     tMPI_Thread_mutex_lock(&tMPI_Atomic_mutex);
+     a->value = value;
+     tMPI_Thread_mutex_unlock(&tMPI_Atomic_mutex);
+ }
+ int tMPI_Atomic_add_return(tMPI_Atomic_t *a, int i) /* Add i to a->value under tMPI_Atomic_mutex and return the NEW value. */
+ {
+     int t;
+     tMPI_Thread_mutex_lock(&tMPI_Atomic_mutex);
+     t        = a->value + i;
+     a->value = t;
+     tMPI_Thread_mutex_unlock(&tMPI_Atomic_mutex);
+     return t;
+ }
+ int tMPI_Atomic_fetch_add(tMPI_Atomic_t *a, int i) /* Add i to a->value under tMPI_Atomic_mutex and return the value from BEFORE the addition. */
+ {
+     int old_value;
+     tMPI_Thread_mutex_lock(&tMPI_Atomic_mutex);
+     old_value = a->value;
+     a->value  = old_value + i;
+     tMPI_Thread_mutex_unlock(&tMPI_Atomic_mutex);
+     return old_value;
+ }
+ int tMPI_Atomic_cas(tMPI_Atomic_t *a, int old_val, int new_val) /* Compare-and-swap under tMPI_Atomic_mutex: returns 1 if a->value equalled old_val and was set to new_val, 0 otherwise. */
+ {
+     int t = 0;
+     tMPI_Thread_mutex_lock(&tMPI_Atomic_mutex);
+     if (a->value == old_val)
+     {
+         a->value = new_val;
+         t        = 1;
+     }
+     tMPI_Thread_mutex_unlock(&tMPI_Atomic_mutex);
+     return t;
+ }
+ int tMPI_Atomic_ptr_cas(tMPI_Atomic_ptr_t * a, void *old_val, void *new_val) /* Pointer compare-and-swap under tMPI_Atomic_mutex: returns 1 if a->value equalled old_val and was set to new_val, 0 otherwise. */
+ {
+     int t = 0;
+     tMPI_Thread_mutex_lock(&tMPI_Atomic_mutex);
+     if (a->value == old_val)
+     {
+         a->value = new_val;
+         t        = 1;
+     }
+     tMPI_Thread_mutex_unlock(&tMPI_Atomic_mutex);
+     return t;
+ }
+ int tMPI_Atomic_swap(tMPI_Atomic_t *a, int b) /* Store b in a->value under tMPI_Atomic_mutex and return the previous value. */
+ {
+     int ret;
+     tMPI_Thread_mutex_lock(&tMPI_Atomic_mutex);
+     ret      = a->value;
+     a->value = b;
+     tMPI_Thread_mutex_unlock(&tMPI_Atomic_mutex);
+     return ret;
+ }
+ void *tMPI_Atomic_ptr_swap(tMPI_Atomic_ptr_t *a, void *b) /* Store the pointer b in a->value under tMPI_Atomic_mutex and return the previous pointer. */
+ {
+     void *ret;
+     tMPI_Thread_mutex_lock(&tMPI_Atomic_mutex);
+     ret      = a->value;
+     a->value = b;
+     tMPI_Thread_mutex_unlock(&tMPI_Atomic_mutex);
+     return ret;
+ }
+ void tMPI_Spinlock_init( tMPI_Spinlock_t *x)
+ {
+     /* Allocate the lock object behind *x together with its mutex, and
+        initialize that mutex. The whole setup runs while holding
+        tMPI_Atomic_mutex. Any previous value of *x is overwritten.
+        NOTE(review): the malloc() results are not checked. */
+     tMPI_Thread_mutex_lock(&tMPI_Atomic_mutex);
+     /* Size the allocation by the pointed-to struct; tMPI_Spinlock_t itself
+        is the pointer handle, so sizeof(tMPI_Spinlock_t) was only correct
+        while the struct holds exactly one pointer. */
+     *x         = (tMPI_Spinlock_t)malloc(sizeof(**x));
+     (*x)->lock = (tMPI_Thread_mutex_t*)malloc(sizeof(tMPI_Thread_mutex_t));
+     tMPI_Thread_mutex_init((*x)->lock);
+     tMPI_Thread_mutex_unlock(&tMPI_Atomic_mutex);
+ }
+ /* Create the lock object behind *x if *x is still NULL; otherwise do nothing.
+    NOTE: this function acquires tMPI_Atomic_mutex itself, so it must be called
+    WITHOUT that mutex held (as the tMPI_Spinlock_* functions in this file do).
+    NOTE(review): the malloc() results are not checked. */
+ static void tMPI_Spinlock_init_once(tMPI_Spinlock_t *x)
+ {
+     tMPI_Thread_mutex_lock(&tMPI_Atomic_mutex);
+     if (!*x)
+     {
+         /* Size the allocation by the pointed-to struct; tMPI_Spinlock_t
+            itself is the pointer handle, so sizeof(tMPI_Spinlock_t) was only
+            correct while the struct holds exactly one pointer. */
+         *x         = (tMPI_Spinlock_t)malloc(sizeof(**x));
+         (*x)->lock = (tMPI_Thread_mutex_t*)malloc(sizeof(tMPI_Thread_mutex_t));
+         tMPI_Thread_mutex_init((*x)->lock);
+     }
+     tMPI_Thread_mutex_unlock(&tMPI_Atomic_mutex);
+ }
+ void tMPI_Spinlock_lock( tMPI_Spinlock_t *x) /* Lock the mutex behind *x, creating the lock object first if *x is NULL. */
+ {
+     tMPI_Spinlock_init_once(x);
+     tMPI_Thread_mutex_lock((*x)->lock);
+ }
+ void tMPI_Spinlock_unlock( tMPI_Spinlock_t *x) /* Unlock the mutex behind *x, creating the lock object first if *x is NULL. */
+ {
+     tMPI_Spinlock_init_once(x);
+     tMPI_Thread_mutex_unlock((*x)->lock);
+ }
+ int tMPI_Spinlock_trylock( tMPI_Spinlock_t *x) /* Try to lock the mutex behind *x; returns the result of tMPI_Thread_mutex_trylock() unchanged. */
+ {
+     int ret;
+     tMPI_Spinlock_init_once(x);
+     ret = tMPI_Thread_mutex_trylock((*x)->lock);
+     return ret;
+ }
+ int tMPI_Spinlock_islocked(tMPI_Spinlock_t *x) /* Return 0 if a trylock on *x returned 0 (the lock is then released again at once), 1 otherwise. */
+ {
+     int ret;
+     tMPI_Spinlock_init_once(x);
+     ret = tMPI_Thread_mutex_trylock((*x)->lock); /* probing briefly takes the lock when trylock returns 0 */
+     if (ret == 0)
+     {
+         tMPI_Thread_mutex_unlock((*x)->lock);
+         ret = 0; /* ret is already 0 in this branch */
+     }
+     else
+     {
+         ret = 1;
+     }
+     return ret;
+ }
+ void tMPI_Spinlock_wait(tMPI_Spinlock_t *x) /* Lock *x and release it again immediately; returns once the lock could be taken. */
+ {
+     tMPI_Spinlock_init_once(x);
+     tMPI_Spinlock_lock(x);
+     /* Got the lock now, so the waiting is over */
+     tMPI_Spinlock_unlock(x);
+ }
+ #else
+ /* just to have some symbols */
+ int _tMPI_Atomics = 1;
+ #endif
index 42ef5213d024a537feb5357ec1e40f6b46022aa6,e1c5279897c64c0d8e79a20c89bb372f85c19664..e1c5279897c64c0d8e79a20c89bb372f85c19664
@@@ -68,8 -68,8 +68,8 @@@ void tMPI_Barrier_init(tMPI_Barrier_t *
  
  int tMPI_Barrier_wait(tMPI_Barrier_t *barrier)
  {
-     int    cycle;
-     int    status;
+     int cycle;
+     int status;
  
      /* We don't need to lock or use atomic ops here, since the cycle index
       * cannot change until after the last thread has performed the check
      /* Decrement the count atomically and check if it is zero.
       * This will only be true for the last thread calling us.
       */
-     if (tMPI_Atomic_add_return( &(barrier->count), -1 ) <= 0)
+     if (tMPI_Atomic_fetch_add( &(barrier->count), -1 ) <= 1)
      {
          tMPI_Atomic_memory_barrier();
          tMPI_Atomic_set(&(barrier->count), barrier->threshold);
-         tMPI_Atomic_add_return(&(barrier->cycle), 1);
+         tMPI_Atomic_fetch_add(&(barrier->cycle), 1);
  
          status = -1;
      }
index bd8acd3a67c1ca6d3ef00f72162c90f715cba192,75ae9c08c6d05f10cec75c94db78753f58a4d6f2..75ae9c08c6d05f10cec75c94db78753f58a4d6f2
@@@ -87,8 -87,13 +87,13 @@@ int tMPI_Bcast(void* buffer, int count
      if (myrank == root)
      {
          /* first set up the data */
-         tMPI_Post_multi(cev, myrank, 0, TMPI_BCAST_TAG, datatype,
-                         count*datatype->size, buffer, comm->grp.N-1, synct, -1);
+         ret = tMPI_Post_multi(cev, myrank, 0, TMPI_BCAST_TAG, datatype,
+                               count*datatype->size, buffer, comm->grp.N-1,
+                               synct, -1);
+         if (ret != TMPI_SUCCESS)
+         {
+             return ret;
+         }
          /* and wait until everybody is done copying */
          tMPI_Wait_for_others(cev, myrank);
      }
index 069db3f1d8958947cc1d1b139f79c90d3f103c3a,5434dc425d3297a1d0c429e9c08d3aa602a544bc..5434dc425d3297a1d0c429e9c08d3aa602a544bc
  
  #ifdef USE_COLLECTIVE_COPY_BUFFER
  /* initialize a copy buffer */
void tMPI_Copy_buffer_init(struct copy_buffer *cb, size_t size)
int tMPI_Copy_buffer_init(struct copy_buffer *cb, size_t size)
  {
-     cb->buf  = tMPI_Malloc(size);
+     cb->buf = tMPI_Malloc(size);
+     if (cb->buf == NULL)
+     {
+         return TMPI_ERR_NO_MEM;
+     }
      cb->size = size;
+     return TMPI_SUCCESS;
  }
  
  /* destroy a copy buffer */
@@@ -75,19 -80,28 +80,28 @@@ void tMPI_Copy_buffer_destroy(struct co
      free(cb->buf);
  }
  
void tMPI_Copy_buffer_list_init(struct copy_buffer_list *cbl, int Nbufs,
-                                 size_t size)
int tMPI_Copy_buffer_list_init(struct copy_buffer_list *cbl, int Nbufs,
+                                size_t size)
  {
      int i;
+     int ret;
  
      cbl->size     = size;
      cbl->cb_alloc = (struct copy_buffer*)
          tMPI_Malloc(sizeof(struct copy_buffer)*Nbufs);
+     if (cbl->cb_alloc == NULL)
+     {
+         return TMPI_ERR_NO_MEM;
+     }
      cbl->cb    = cbl->cb_alloc; /* the first one */
      cbl->Nbufs = Nbufs;
      for (i = 0; i < Nbufs; i++)
      {
-         tMPI_Copy_buffer_init( &(cbl->cb_alloc[i]), size );
+         ret = tMPI_Copy_buffer_init( &(cbl->cb_alloc[i]), size );
+         if (ret != TMPI_SUCCESS)
+         {
+             return ret;
+         }
          if (i < Nbufs-1)
          {
              cbl->cb_alloc[i].next = &(cbl->cb_alloc[i+1]);
              cbl->cb_alloc[i].next = NULL;
          }
      }
+     return TMPI_SUCCESS;
  }
  
  void tMPI_Copy_buffer_list_destroy(struct copy_buffer_list *cbl)
@@@ -115,8 -130,8 +130,8 @@@ struct copy_buffer *tMPI_Copy_buffer_li
      struct copy_buffer *ret = cbl->cb;
      if (!ret)
      {
-         fprintf(stderr, "out of copy buffers!!");
-         exit(1);
+         tMPI_Error(TMPI_COMM_WORLD, TMPI_ERR_COPY_NBUFFERS);
+         return NULL;
      }
      cbl->cb = ret->next;
  
@@@ -138,20 -153,38 +153,38 @@@ void tMPI_Copy_buffer_list_return(struc
  
  
  
void tMPI_Coll_envt_init(struct coll_env_thread *met, int N)
int tMPI_Coll_envt_init(struct coll_env_thread *met, int N)
  {
      tMPI_Atomic_set(&(met->current_sync), 0);
      tMPI_Atomic_set(&(met->n_remaining), 0);
-     met->buf       = (void**)tMPI_Malloc(sizeof(void*)*N);
-     met->bufsize   = (size_t*)tMPI_Malloc(sizeof(size_t)*N);
+     met->buf = (void**)tMPI_Malloc(sizeof(void*)*N);
+     if (met->buf == NULL)
+     {
+         return TMPI_ERR_NO_MEM;
+     }
+     met->bufsize = (size_t*)tMPI_Malloc(sizeof(size_t)*N);
+     if (met->bufsize == NULL)
+     {
+         return TMPI_ERR_NO_MEM;
+     }
      met->read_data = (tmpi_bool*)tMPI_Malloc(sizeof(tmpi_bool)*N);
+     if (met->read_data == NULL)
+     {
+         return TMPI_ERR_NO_MEM;
+     }
  #ifdef USE_COLLECTIVE_COPY_BUFFER
-     met->cpbuf    = (tMPI_Atomic_ptr_t*)tMPI_Malloc(sizeof(tMPI_Atomic_ptr_t)*N);
+     met->cpbuf = (tMPI_Atomic_ptr_t*)tMPI_Malloc(sizeof(tMPI_Atomic_ptr_t)*
+                                                  N);
+     if (met->read_data == NULL)
+     {
+         return TMPI_ERR_NO_MEM;
+     }
      met->cb       = NULL;
      met->using_cb = FALSE;
  #endif
      tMPI_Event_init( &(met->send_ev) );
      tMPI_Event_init( &(met->recv_ev) );
+     return TMPI_SUCCESS;
  }
  
  
@@@ -166,19 -199,29 +199,29 @@@ void tMPI_Coll_envt_destroy(struct coll
  #endif
  }
  
void tMPI_Coll_env_init(struct coll_env *cev, int N)
int tMPI_Coll_env_init(struct coll_env *cev, int N)
  {
      int i;
+     int ret;
  
      cev->met = (struct coll_env_thread*)tMPI_Malloc(
                  sizeof(struct coll_env_thread)*N);
+     if (cev->met == NULL)
+     {
+         return TMPI_ERR_NO_MEM;
+     }
      cev->N = N;
      tMPI_Atomic_set(&(cev->coll.current_sync), 0);
      tMPI_Atomic_set(&(cev->coll.n_remaining), 0);
      for (i = 0; i < N; i++)
      {
-         tMPI_Coll_envt_init(&(cev->met[i]), N);
+         ret = tMPI_Coll_envt_init(&(cev->met[i]), N);
+         if (ret != TMPI_SUCCESS)
+         {
+             return ret;
+         }
      }
+     return TMPI_SUCCESS;
  }
  
  void tMPI_Coll_env_destroy(struct coll_env *cev)
  }
  
  
void tMPI_Coll_sync_init(struct coll_sync *csync, int N)
int tMPI_Coll_sync_init(struct coll_sync *csync, int N)
  {
      int i;
  
      csync->N     = N;
  
      csync->events = (tMPI_Event*)tMPI_Malloc(sizeof(tMPI_Event)*N);
+     if (csync->events == NULL)
+     {
+         return TMPI_ERR_NO_MEM;
+     }
      for (i = 0; i < N; i++)
      {
          tMPI_Event_init( &(csync->events[i]) );
      }
+     return TMPI_SUCCESS;
  }
  
  void tMPI_Coll_sync_destroy(struct coll_sync *csync)
@@@ -244,8 -292,11 +292,11 @@@ struct coll_env *tMPI_Get_cev(tMPI_Com
  #ifdef USE_COLLECTIVE_COPY_BUFFER
      if (cev->met[myrank].using_cb)
      {
-         N = tMPI_Event_wait( &(cev->met[myrank].send_ev));
-         tMPI_Event_process( &(cev->met[myrank].send_ev), 1);
+         if (cev->N > 1)
+         {
+             N = tMPI_Event_wait( &(cev->met[myrank].send_ev));
+             tMPI_Event_process( &(cev->met[myrank].send_ev), 1);
+         }
      }
  #endif
  #ifdef USE_COLLECTIVE_COPY_BUFFER
@@@ -316,7 -367,7 +367,7 @@@ void tMPI_Mult_recv(tMPI_Comm comm, str
                  /* we need to try checking the pointer again after we increase
                     the read counter, signaling that one more thread
                     is reading. */
-                 tMPI_Atomic_add_return(&(cev->met[rank].buf_readcount), 1);
+                 tMPI_Atomic_fetch_add(&(cev->met[rank].buf_readcount), 1);
                  /* a full memory barrier */
                  tMPI_Atomic_memory_barrier();
                  try_again_srcbuf = tMPI_Atomic_ptr_get(
                      /* We tried again, and this time there was a copied buffer.
                         We use that, and indicate that we're not reading from the
                         regular buf. This case should be pretty rare.  */
-                     tMPI_Atomic_add_return(&(cev->met[rank].buf_readcount), -1);
+                     tMPI_Atomic_fetch_add(&(cev->met[rank].buf_readcount), -1);
                      tMPI_Atomic_memory_barrier_acq();
                      srcbuf = try_again_srcbuf;
                  }
          {
              /* we decrement the read count; potentially releasing the buffer. */
              tMPI_Atomic_memory_barrier_rel();
-             tMPI_Atomic_add_return( &(cev->met[rank].buf_readcount), -1);
+             tMPI_Atomic_fetch_add( &(cev->met[rank].buf_readcount), -1);
          }
  #endif
      }
      {
          int reta;
          tMPI_Atomic_memory_barrier_rel();
-         reta = tMPI_Atomic_add_return( &(cev->met[rank].n_remaining), -1);
-         if (reta <= 0)
+         reta = tMPI_Atomic_fetch_add( &(cev->met[rank].n_remaining), -1);
+         if (reta <= 1) /* n_remaining == 0 now. */
          {
              tMPI_Event_signal( &(cev->met[rank].send_ev) );
          }
@@@ -401,9 -452,9 +452,9 @@@ void tMPI_Coll_root_xfer(tMPI_Comm comm
      memcpy(recvbuf, sendbuf, sendsize);
  }
  
void tMPI_Post_multi(struct coll_env *cev, int myrank, int index,
-                      int tag, tMPI_Datatype datatype, size_t bufsize,
-                      void *buf, int n_remaining, int synct, int dest)
int tMPI_Post_multi(struct coll_env *cev, int myrank, int index,
+                     int tag, tMPI_Datatype datatype, size_t bufsize,
+                     void *buf, int n_remaining, int synct, int dest)
  {
      int i;
  #ifdef USE_COLLECTIVE_COPY_BUFFER
          struct tmpi_thread *cur = tMPI_Get_current();
          /* copy the buffer locally. First allocate */
          cev->met[myrank].cb = tMPI_Copy_buffer_list_get( &(cur->cbl_multi) );
+         if (cev->met[myrank].cb == NULL)
+         {
+             return TMPI_ERR_COPY_NBUFFERS;
+         }
          if (cev->met[myrank].cb->size < bufsize)
          {
-             fprintf(stderr, "ERROR: cb size too small\n");
-             exit(1);
+             return TMPI_ERR_COPY_BUFFER_SIZE;
          }
          /* copy to the new buf */
          memcpy(cev->met[myrank].cb->buf, buf, bufsize);
                              cev->met[myrank].cb->buf);
      }
  #endif
+     return TMPI_SUCCESS;
  }
  
  
@@@ -477,38 -532,45 +532,45 @@@ void tMPI_Wait_for_others(struct coll_e
      tMPI_Profile_wait_start(cur);
  #endif
  
- #ifdef USE_COLLECTIVE_COPY_BUFFER
-     if (!(cev->met[myrank].using_cb) )
- #endif
+     if (cev->N > 1)
      {
-         /* wait until everybody else is done copying the buffer */
-         tMPI_Event_wait( &(cev->met[myrank].send_ev));
-         tMPI_Event_process( &(cev->met[myrank].send_ev), 1);
-     }
  #ifdef USE_COLLECTIVE_COPY_BUFFER
-     else
-     {
-         /* wait until everybody else is done copying the original buffer.
-            We use atomic add-return because we want to be sure of coherency.
-            This wait is bound to be very short (otherwise it wouldn't
-            be double-buffering) so we always spin here. */
-         /*tMPI_Atomic_memory_barrier_rel();*/
- #if 0
-         while (!tMPI_Atomic_cas( &(cev->met[rank].buf_readcount), 0,
-                                  -100000))
+         if (!(cev->met[myrank].using_cb) )
  #endif
+         {
+             /* wait until everybody else is done copying the buffer */
+             tMPI_Event_wait( &(cev->met[myrank].send_ev));
+             tMPI_Event_process( &(cev->met[myrank].send_ev), 1);
+         }
+ #ifdef USE_COLLECTIVE_COPY_BUFFER
+         else
+         {
+             /* wait until everybody else is done copying the original buffer.
+                This wait is bound to be very short (otherwise it wouldn't
+                be double-buffering) so we always spin here. */
  #if 0
-         while (tMPI_Atomic_add_return( &(cev->met[myrank].buf_readcount), 0)
-                != 0)
+             /* dummy compare-and-swap to a value that is non-zero. The
+                atomic read with barrier below is simpler, but we keep this
+                code here (disabled with #if 0) in case there is ever a
+                platform where the simple read doesn't work because of, say,
+                cache coherency issues. */
+             while (!tMPI_Atomic_cas( &(cev->met[rank].buf_readcount), 0,
+                                      -100000))
  #endif
  #if 1
-         while (tMPI_Atomic_get( &(cev->met[rank].buf_readcount) ) > 0)
+             tMPI_Atomic_memory_barrier();         /* a full barrier to make
+                                                      sure that the sending
+                                                      doesn't interfere with the
+                                                      waiting */
+             while (tMPI_Atomic_get( &(cev->met[myrank].buf_readcount) ) > 0)
  #endif
-         {
+             {
+                 tMPI_Atomic_memory_barrier_acq();
+             }
+             tMPI_Atomic_memory_barrier_acq();
          }
-         tMPI_Atomic_memory_barrier_acq();
-     }
  #endif
+     }
  #if defined(TMPI_PROFILE)
      tMPI_Profile_wait_stop(cur, TMPIWAIT_Coll_send);
  #endif
index 1aadef84bc02b4b9bfaa8d1cc9c18adc473e0a1b,162fda643474a7203122fcf323149a3619ed2bfe..162fda643474a7203122fcf323149a3619ed2bfe
@@@ -51,10 -51,10 +51,10 @@@ struct coll_env *tMPI_Get_cev(tMPI_Com
     synct       = the multicast sync number
     dest        = -1 for all threads, or a specific rank number.
   */
void tMPI_Post_multi(struct coll_env *cev, int myrank, int index,
-                      int tag, tMPI_Datatype datatype,
-                      size_t bufsize, void *buf, int n_remaining,
-                      int synct, int dest);
int tMPI_Post_multi(struct coll_env *cev, int myrank, int index,
+                     int tag, tMPI_Datatype datatype,
+                     size_t bufsize, void *buf, int n_remaining,
+                     int synct, int dest);
  
  /* transfer data from cev->met[rank] to recvbuf */
  void tMPI_Mult_recv(tMPI_Comm comm, struct coll_env *cev, int rank,
index 98c84e9758a778d3629c86efd17c6f7e2550201c,28464fa1ef924db5a5f0be0b7d956ad33ac17b37..28464fa1ef924db5a5f0be0b7d956ad33ac17b37
@@@ -150,31 -150,53 +150,53 @@@ int tMPI_Comm_compare(tMPI_Comm comm1, 
  }
  
  
tMPI_Comm tMPI_Comm_alloc(tMPI_Comm parent, int N)
int tMPI_Comm_alloc(tMPI_Comm *newcomm, tMPI_Comm parent, int N)
  {
-     struct tmpi_comm_ *ret;
+     struct tmpi_comm_ *retc;
      int                i;
+     int                ret;
  
-     ret            = (struct tmpi_comm_*)tMPI_Malloc(sizeof(struct tmpi_comm_));
-     ret->grp.peers = (struct tmpi_thread**)tMPI_Malloc(
+     retc = (struct tmpi_comm_*)tMPI_Malloc(sizeof(struct tmpi_comm_));
+     if (retc == NULL)
+     {
+         return TMPI_ERR_NO_MEM;
+     }
+     retc->grp.peers = (struct tmpi_thread**)tMPI_Malloc(
                  sizeof(struct tmpi_thread*)*Nthreads);
-     ret->grp.N = N;
+     if (retc->grp.peers == NULL)
+     {
+         return TMPI_ERR_NO_MEM;
+     }
+     retc->grp.N = N;
  
-     tMPI_Thread_mutex_init( &(ret->comm_create_lock) );
-     tMPI_Thread_cond_init( &(ret->comm_create_prep) );
-     tMPI_Thread_cond_init( &(ret->comm_create_finish) );
+     ret = tMPI_Thread_mutex_init( &(retc->comm_create_lock) );
+     if (ret != 0)
+     {
+         return tMPI_Error(TMPI_COMM_WORLD, TMPI_ERR_IO);
+     }
+     ret = tMPI_Thread_cond_init( &(retc->comm_create_prep) );
+     if (ret != 0)
+     {
+         return tMPI_Error(TMPI_COMM_WORLD, TMPI_ERR_IO);
+     }
+     ret = tMPI_Thread_cond_init( &(retc->comm_create_finish) );
+     if (ret != 0)
+     {
+         return tMPI_Error(TMPI_COMM_WORLD, TMPI_ERR_IO);
+     }
  
-     ret->split    = NULL;
-     ret->new_comm = NULL;
+     retc->split    = NULL;
+     retc->new_comm = NULL;
      /* we have no topology to start out with */
-     ret->cart = NULL;
-     /*ret->graph=NULL;*/
+     retc->cart = NULL;
+     /*retc->graph=NULL;*/
  
      /* we start counting at 0 */
-     tMPI_Atomic_set( &(ret->destroy_counter), 0);
+     tMPI_Atomic_set( &(retc->destroy_counter), 0);
  
      /* initialize the main barrier */
-     tMPI_Barrier_init(&(ret->barrier), N);
+     tMPI_Barrier_init(&(retc->barrier), N);
  
      /* the reduce barriers */
      {
              Niter += 1;
          }
  
-         ret->N_reduce_iter = Niter;
+         retc->N_reduce_iter = Niter;
          /* allocate the list */
-         ret->reduce_barrier = (tMPI_Barrier_t**)
+         retc->reduce_barrier = (tMPI_Barrier_t**)
              tMPI_Malloc(sizeof(tMPI_Barrier_t*)*(Niter+1));
-         ret->N_reduce = (int*)tMPI_Malloc(sizeof(int)*(Niter+1));
+         if (retc->reduce_barrier == NULL)
+         {
+             return TMPI_ERR_NO_MEM;
+         }
+         retc->N_reduce = (int*)tMPI_Malloc(sizeof(int)*(Niter+1));
+         if (retc->N_reduce == NULL)
+         {
+             return TMPI_ERR_NO_MEM;
+         }
  
          /* we re-set Nred to N */
          Nred = N;
          {
              int j;
  
-             Nred             = Nred/2 + Nred%2;
-             ret->N_reduce[i] = Nred;
+             Nred              = Nred/2 + Nred%2;
+             retc->N_reduce[i] = Nred;
              /* allocate the sub-list */
-             ret->reduce_barrier[i] = (tMPI_Barrier_t*)
+             retc->reduce_barrier[i] = (tMPI_Barrier_t*)
                  tMPI_Malloc(sizeof(tMPI_Barrier_t)*(Nred));
+             if (retc->reduce_barrier[i] == NULL)
+             {
+                 return TMPI_ERR_NO_MEM;
+             }
              for (j = 0; j < Nred; j++)
              {
-                 tMPI_Barrier_init(&(ret->reduce_barrier[i][j]), 2);
+                 tMPI_Barrier_init(&(retc->reduce_barrier[i][j]), 2);
              }
          }
      }
  
      /* the reduce buffers */
-     ret->reduce_sendbuf = (tMPI_Atomic_ptr_t*)
+     retc->reduce_sendbuf = (tMPI_Atomic_ptr_t*)
          tMPI_Malloc(sizeof(tMPI_Atomic_ptr_t)*Nthreads);
-     ret->reduce_recvbuf = (tMPI_Atomic_ptr_t*)
+     if (retc->reduce_sendbuf == NULL)
+     {
+         return TMPI_ERR_NO_MEM;
+     }
+     retc->reduce_recvbuf = (tMPI_Atomic_ptr_t*)
          tMPI_Malloc(sizeof(tMPI_Atomic_ptr_t)*Nthreads);
+     if (retc->reduce_recvbuf == NULL)
+     {
+         return TMPI_ERR_NO_MEM;
+     }
  
      if (parent)
      {
-         ret->erh = parent->erh;
+         retc->erh = parent->erh;
      }
      else
      {
-         ret->erh = TMPI_ERRORS_ARE_FATAL;
+         retc->erh = TMPI_ERRORS_ARE_FATAL;
      }
  
      /* coll_env objects */
-     ret->cev = (struct coll_env*)tMPI_Malloc(sizeof(struct coll_env)*N_COLL_ENV);
+     retc->cev = (struct coll_env*)tMPI_Malloc(sizeof(struct coll_env)*
+                                               N_COLL_ENV);
+     if (retc->cev == NULL)
+     {
+         return TMPI_ERR_NO_MEM;
+     }
      for (i = 0; i < N_COLL_ENV; i++)
      {
-         tMPI_Coll_env_init( &(ret->cev[i]), N);
+         ret = tMPI_Coll_env_init( &(retc->cev[i]), N);
+         if (ret != TMPI_SUCCESS)
+         {
+             return ret;
+         }
      }
      /* multi_sync objects */
-     ret->csync = (struct coll_sync*)tMPI_Malloc(sizeof(struct coll_sync)*N);
+     retc->csync = (struct coll_sync*)tMPI_Malloc(sizeof(struct coll_sync)*N);
+     if (retc->csync == NULL)
+     {
+         return TMPI_ERR_NO_MEM;
+     }
      for (i = 0; i < N; i++)
      {
-         tMPI_Coll_sync_init( &(ret->csync[i]), N);
+         ret = tMPI_Coll_sync_init( &(retc->csync[i]), N);
+         if (ret != TMPI_SUCCESS)
+         {
+             return ret;
+         }
      }
  
-     tMPI_Thread_mutex_lock( &(tmpi_global->comm_link_lock) );
+     ret = tMPI_Thread_mutex_lock( &(tmpi_global->comm_link_lock) );
+     if (ret != 0)
+     {
+         return tMPI_Error(TMPI_COMM_WORLD, TMPI_ERR_IO);
+     }
      /* we insert ourselves in the circular list, after TMPI_COMM_WORLD */
      if (TMPI_COMM_WORLD)
      {
-         ret->next = TMPI_COMM_WORLD;
-         ret->prev = TMPI_COMM_WORLD->prev;
+         retc->next = TMPI_COMM_WORLD;
+         retc->prev = TMPI_COMM_WORLD->prev;
  
-         TMPI_COMM_WORLD->prev->next = ret;
-         TMPI_COMM_WORLD->prev       = ret;
+         TMPI_COMM_WORLD->prev->next = retc;
+         TMPI_COMM_WORLD->prev       = retc;
      }
      else
      {
-         ret->prev = ret->next = ret;
+         retc->prev = retc->next = retc;
+     }
+     ret = tMPI_Thread_mutex_unlock( &(tmpi_global->comm_link_lock) );
+     if (ret != 0)
+     {
+         return tMPI_Error(TMPI_COMM_WORLD, TMPI_ERR_IO);
      }
-     tMPI_Thread_mutex_unlock( &(tmpi_global->comm_link_lock) );
-     return ret;
+     *newcomm = retc;
+     return TMPI_SUCCESS;
  }
  
void tMPI_Comm_destroy(tMPI_Comm comm, tmpi_bool do_link_lock)
int tMPI_Comm_destroy(tMPI_Comm comm, tmpi_bool do_link_lock)
  {
      int i;
+     int ret;
  
      free(comm->grp.peers);
      for (i = 0; i < comm->N_reduce_iter; i++)
      free(comm->cev);
      free(comm->csync);
  
-     tMPI_Thread_mutex_destroy( &(comm->comm_create_lock) );
-     tMPI_Thread_cond_destroy( &(comm->comm_create_prep) );
-     tMPI_Thread_cond_destroy( &(comm->comm_create_finish) );
+     ret = tMPI_Thread_mutex_destroy( &(comm->comm_create_lock) );
+     if (ret != 0)
+     {
+         return tMPI_Error(TMPI_COMM_WORLD, TMPI_ERR_IO);
+     }
+     ret = tMPI_Thread_cond_destroy( &(comm->comm_create_prep) );
+     if (ret != 0)
+     {
+         return tMPI_Error(TMPI_COMM_WORLD, TMPI_ERR_IO);
+     }
+     ret = tMPI_Thread_cond_destroy( &(comm->comm_create_finish) );
+     if (ret != 0)
+     {
+         return tMPI_Error(TMPI_COMM_WORLD, TMPI_ERR_IO);
+     }
  
      free((void*)comm->reduce_sendbuf);
      free((void*)comm->reduce_recvbuf);
      /* remove ourselves from the circular list */
      if (do_link_lock)
      {
-         tMPI_Thread_mutex_lock( &(tmpi_global->comm_link_lock) );
+         ret = tMPI_Thread_mutex_lock( &(tmpi_global->comm_link_lock) );
+         if (ret != 0)
+         {
+             return tMPI_Error(TMPI_COMM_WORLD, TMPI_ERR_IO);
+         }
      }
      if (comm->next)
      {
      free(comm);
      if (do_link_lock)
      {
-         tMPI_Thread_mutex_unlock( &(tmpi_global->comm_link_lock) );
+         ret = tMPI_Thread_mutex_unlock( &(tmpi_global->comm_link_lock) );
+         if (ret != 0)
+         {
+             return tMPI_Error(TMPI_COMM_WORLD, TMPI_ERR_IO);
+         }
      }
+     return TMPI_SUCCESS;
  }
  
  int tMPI_Comm_free(tMPI_Comm *comm)
  {
      int size;
      int sum;
+     int ret;
  #ifdef TMPI_TRACE
      tMPI_Trace_print("tMPI_Comm_free(%p)", comm);
  #endif
      if ((*comm)->grp.N > 1)
      {
          /* we remove ourselves from the comm. */
-         tMPI_Thread_mutex_lock(&((*comm)->comm_create_lock));
+         ret = tMPI_Thread_mutex_lock(&((*comm)->comm_create_lock));
+         if (ret != 0)
+         {
+             return tMPI_Error(TMPI_COMM_WORLD, TMPI_ERR_IO);
+         }
          (*comm)->grp.peers[myrank] = (*comm)->grp.peers[(*comm)->grp.N-1];
          (*comm)->grp.N--;
-         tMPI_Thread_mutex_unlock(&((*comm)->comm_create_lock));
+         ret = tMPI_Thread_mutex_unlock(&((*comm)->comm_create_lock));
+         if (ret != 0)
+         {
+             return tMPI_Error(TMPI_COMM_WORLD, TMPI_ERR_IO);
+         }
      }
      else
      {
          /* we're the last one so we can safely destroy it */
-         tMPI_Comm_destroy(*comm, TRUE);
+         ret = tMPI_Comm_destroy(*comm, TRUE);
+         if (ret != 0)
+         {
+             return ret;
+         }
      }
  #else
      /* This is correct if programs actually treat Comm_free as a collective
  
      /* we add 1 to the destroy counter and actually deallocate if the counter
         reaches N. */
-     sum = tMPI_Atomic_add_return( &((*comm)->destroy_counter), 1);
+     sum = tMPI_Atomic_fetch_add( &((*comm)->destroy_counter), 1) + 1;
      /* this is a collective call on a shared data structure, so only
         one process (the last one in this case) should do anything */
      if (sum == size)
      {
-         tMPI_Comm_destroy(*comm, TRUE);
+         ret = tMPI_Comm_destroy(*comm, TRUE);
+         if (ret != 0)
+         {
+             return ret;
+         }
      }
  #endif
      return TMPI_SUCCESS;
@@@ -456,6 -564,7 +564,7 @@@ int tMPI_Comm_split(tMPI_Comm comm, in
      tmpi_bool          i_am_first = FALSE;
      int                myrank     = tMPI_Comm_seek_rank(comm, tMPI_Get_current());
      struct tmpi_split *spl;
+     int                ret;
  
  #ifdef TMPI_TRACE
      tMPI_Trace_print("tMPI_Comm_split(%p, %d, %d, %p)", comm, color, key,
          return tMPI_Error(TMPI_COMM_WORLD, TMPI_ERR_COMM);
      }
  
-     tMPI_Thread_mutex_lock(&(comm->comm_create_lock));
+     ret = tMPI_Thread_mutex_lock(&(comm->comm_create_lock));
+     if (ret != 0)
+     {
+         return tMPI_Error(TMPI_COMM_WORLD, TMPI_ERR_IO);
+     }
      /* first get the colors */
      if (!comm->new_comm)
      {
  
      if (spl->Ncol_init == 0)
      {
-         tMPI_Thread_cond_signal(&(comm->comm_create_prep));
+         ret = tMPI_Thread_cond_signal(&(comm->comm_create_prep));
+         if (ret != 0)
+         {
+             return tMPI_Error(TMPI_COMM_WORLD, TMPI_ERR_IO);
+         }
      }
  
      if (!i_am_first)
             finished */
          while (!spl->can_finish)
          {
-             tMPI_Thread_cond_wait(&(comm->comm_create_finish),
-                                   &(comm->comm_create_lock) );
+             ret = tMPI_Thread_cond_wait(&(comm->comm_create_finish),
+                                         &(comm->comm_create_lock) );
+             if (ret != 0)
+             {
+                 return tMPI_Error(TMPI_COMM_WORLD, TMPI_ERR_IO);
+             }
          }
      }
      else
          /*if (N>1)*/
          while (spl->Ncol_init > 0)
          {
-             tMPI_Thread_cond_wait(&(comm->comm_create_prep),
-                                   &(comm->comm_create_lock));
+             ret = tMPI_Thread_cond_wait(&(comm->comm_create_prep),
+                                         &(comm->comm_create_lock));
+             if (ret != 0)
+             {
+                 return tMPI_Error(TMPI_COMM_WORLD, TMPI_ERR_IO);
+             }
          }
  
          /* reset the state so that a new comm creating function can run */
          comms = (tMPI_Comm*)tMPI_Malloc(Ncomms*sizeof(tMPI_Comm));
          for (i = 0; i < Ncomms; i++)
          {
-             comms[i] = tMPI_Comm_alloc(comm, comm_N[i]);
+             ret = tMPI_Comm_alloc(&(comms[i]), comm, comm_N[i]);
+             if (ret != TMPI_SUCCESS)
+             {
+                 return ret;
+             }
          }
  
          /* now distribute the comms */
          spl->can_finish = TRUE;
  
          /* tell the waiting threads that there's a comm ready */
-         tMPI_Thread_cond_broadcast(&(comm->comm_create_finish));
+         ret = tMPI_Thread_cond_broadcast(&(comm->comm_create_finish));
+         if (ret != 0)
+         {
+             return tMPI_Error(TMPI_COMM_WORLD, TMPI_ERR_IO);
+         }
      }
      /* here the individual threads get their comm object */
      *newcomm = newcomm_list[myrank];
          free(spl);
      }
  
-     tMPI_Thread_mutex_unlock(&(comm->comm_create_lock));
+     ret = tMPI_Thread_mutex_unlock(&(comm->comm_create_lock));
+     if (ret != 0)
+     {
+         return tMPI_Error(TMPI_COMM_WORLD, TMPI_ERR_IO);
+     }
  
      return TMPI_SUCCESS;
  }
index fe8d444c7000cb668139e1ceff99a1364e688d19,60f77420fb0d6080a123ddd9432b91be08d7752d..60f77420fb0d6080a123ddd9432b91be08d7752d
@@@ -77,6 -77,7 +77,7 @@@ static const char *tmpi_errmsg[] 
  {
      "No error",
      "malloc failure in tMPI (out of memory)",
+     "I/O or system error",
      "tMPI Initialization error",
      "tMPI Finalize error",
      "Invalid tMPI_Group",
      "Invalid reduce operator",
      "Out of receive envelopes: this shouldn't happen (probably a bug).",
      "Out of receive requests: this shouldn't happen (probably a bug).",
+     "Out of copy buffers: this shouldn't happen (probably a bug).",
+     "Copy buffer size too small: this shouldn't happen (probably a bug).",
+     "Error in MPI_Status",
+     "Error getting/setting processor layout/affinity",
      "Transmission failure",
      "Unknown tMPI error"
  };
@@@ -125,11 -130,29 +130,29 @@@ int tMPI_Error_string(int errorcode, ch
          errorcode = TMPI_ERR_UNKNOWN;
      }
  
+     if (errorcode != TMPI_ERR_IO)
+     {
+ #if !(defined( _WIN32 ) || defined( _WIN64 ) )
+         strncpy(strn, tmpi_errmsg[errorcode], TMPI_MAX_ERROR_STRING);
+ #else
+         strncpy_s(strn, TMPI_MAX_ERROR_STRING, tmpi_errmsg[errorcode],
+                   TMPI_MAX_ERROR_STRING);
+ #endif
+     }
+     else
+     {
  #if !(defined( _WIN32 ) || defined( _WIN64 ) )
-     strncpy(strn, tmpi_errmsg[errorcode], TMPI_MAX_ERROR_STRING);
+         snprintf(strn, TMPI_MAX_ERROR_STRING,
+                  "%s: %s", tmpi_errmsg[errorcode], strerror(errno));
  #else
-     strncpy_s(strn, TMPI_MAX_ERROR_STRING, tmpi_errmsg[errorcode], TMPI_MAX_ERROR_STRING);
+         char buf[TMPI_MAX_ERROR_STRING];
+         strerror_s(buf, TMPI_MAX_ERROR_STRING-1, errno);
+         _snprintf_s(strn, TMPI_MAX_ERROR_STRING, _TRUNCATE,
+                     "%s: %s", tmpi_errmsg[errorcode], buf);
  #endif
+     }
      *resultlen = strlen(strn);
      return TMPI_SUCCESS;
  }
@@@ -147,7 -170,7 +170,7 @@@ int tMPI_Create_errhandler(tMPI_Errhand
      if (!*errhandler)
      {
          fprintf(stderr, "tMPI fatal error (%s), bailing out\n",
-                 tmpi_errmsg[TMPI_ERR_MALLOC]);
+                 tmpi_errmsg[TMPI_ERR_NO_MEM]);
          abort();
      }
      (*errhandler)->err = 0;
index a43f2db53db039b68cedfe22f1d61bbd89124d38,54e91bf69b64430ad336ed53cd82d0eba9b529c8..54e91bf69b64430ad336ed53cd82d0eba9b529c8
@@@ -145,8 -145,12 +145,12 @@@ int tMPI_Gather(void* sendbuf, int send
          }
  
          /* first set up the data just to root. */
-         tMPI_Post_multi(cev, myrank, 0, TMPI_GATHER_TAG, sendtype,
-                         sendcount*sendtype->size, sendbuf, 1, synct, root);
+         ret = tMPI_Post_multi(cev, myrank, 0, TMPI_GATHER_TAG, sendtype,
+                               sendcount*sendtype->size, sendbuf, 1, synct, root);
+         if (ret != TMPI_SUCCESS)
+         {
+             return ret;
+         }
          /* and wait until root is done copying */
          tMPI_Wait_for_others(cev, myrank);
      }
@@@ -247,8 -251,12 +251,12 @@@ int tMPI_Gatherv(void* sendbuf, int sen
          }
  
          /* first set up the data just to root. */
-         tMPI_Post_multi(cev, myrank, 0, TMPI_GATHERV_TAG, sendtype,
-                         sendcount*sendtype->size, sendbuf, 1, synct, root);
+         ret = tMPI_Post_multi(cev, myrank, 0, TMPI_GATHERV_TAG, sendtype,
+                               sendcount*sendtype->size, sendbuf, 1, synct, root);
+         if (ret != TMPI_SUCCESS)
+         {
+             return ret;
+         }
          /* and wait until root is done copying */
          tMPI_Wait_for_others(cev, myrank);
      }
index 17b4b46661bc195f02f590555b43b668d1ecfdb9,94d0646302fe21edab3c4b72ead5f47e315da608..94d0646302fe21edab3c4b72ead5f47e315da608
@@@ -488,7 -488,7 +488,7 @@@ struct tmpi_globa
      int                     Nalloc_usertypes;
  
      /* spinlock/mutex for manipulating tmpi_user_types */
-     tMPI_Spinlock_t  datatype_lock;
+     tMPI_Spinlock_t datatype_lock;
  
      /* Lock to prevent multiple threads manipulating the linked list of comm
         structures.*/
@@@ -739,9 -739,9 +739,9 @@@ int tMPI_Comm_seek_rank(tMPI_Comm comm
  int tMPI_Comm_N(tMPI_Comm comm);
  
  /* allocate a comm object, making space for N threads */
tMPI_Comm tMPI_Comm_alloc(tMPI_Comm parent, int N);
int tMPI_Comm_alloc(tMPI_Comm *newcomm, tMPI_Comm parent, int N);
  /* de-allocate a comm object */
void tMPI_Comm_destroy(tMPI_Comm comm, tmpi_bool do_link_lock);
int tMPI_Comm_destroy(tMPI_Comm comm, tmpi_bool do_link_lock);
  /* allocate a group object */
  tMPI_Group tMPI_Group_alloc(void);
  
@@@ -756,13 -756,13 +756,13 @@@ void tMPI_Cart_destroy(struct cart_topo
  
  
  /* initialize a free envelope list with N envelopes */
void tMPI_Free_env_list_init(struct free_envelope_list *evl, int N);
int tMPI_Free_env_list_init(struct free_envelope_list *evl, int N);
  /* destroy a free envelope list */
  void tMPI_Free_env_list_destroy(struct free_envelope_list *evl);
  
  
  /* initialize a send envelope list */
void tMPI_Send_env_list_init(struct send_envelope_list *evl, int N);
int tMPI_Send_env_list_init(struct send_envelope_list *evl, int N);
  /* destroy a send envelope list */
  void tMPI_Send_env_list_destroy(struct send_envelope_list *evl);
  
  
  
  /* initialize a recv envelope list */
void tMPI_Recv_env_list_init(struct recv_envelope_list *evl);
int tMPI_Recv_env_list_init(struct recv_envelope_list *evl);
  /* destroy a recv envelope list */
  void tMPI_Recv_env_list_destroy(struct recv_envelope_list *evl);
  
  
  
  /* initialize request list */
void tMPI_Req_list_init(struct req_list *rl, int N_reqs);
int tMPI_Req_list_init(struct req_list *rl, int N_reqs);
  /* destroy request list */
  void tMPI_Req_list_destroy(struct req_list *rl);
  
  
  
  /* initialize a coll env structure */
void tMPI_Coll_env_init(struct coll_env *mev, int N);
int tMPI_Coll_env_init(struct coll_env *mev, int N);
  /* destroy a coll env structure */
  void tMPI_Coll_env_destroy(struct coll_env *mev);
  
  /* initialize a coll sync structure */
void tMPI_Coll_sync_init(struct coll_sync *msc, int N);
int tMPI_Coll_sync_init(struct coll_sync *msc, int N);
  /* destroy a coll sync structure */
  void tMPI_Coll_sync_destroy(struct coll_sync *msc);
  
  #ifdef USE_COLLECTIVE_COPY_BUFFER
  /* initialize a copy_buffer_list */
void tMPI_Copy_buffer_list_init(struct copy_buffer_list *cbl, int Nbufs,
-                                 size_t size);
int tMPI_Copy_buffer_list_init(struct copy_buffer_list *cbl, int Nbufs,
+                                size_t size);
  /* destroy a copy_buffer_list */
  void tMPI_Copy_buffer_list_destroy(struct copy_buffer_list *cbl);
  /* get a copy buffer from a list */
@@@ -811,7 -811,7 +811,7 @@@ struct copy_buffer *tMPI_Copy_buffer_li
  void tMPI_Copy_buffer_list_return(struct copy_buffer_list *cbl,
                                    struct copy_buffer      *cb);
  /* initialize a copy buffer */
void tMPI_Copy_buffer_init(struct copy_buffer *cb, size_t size);
int tMPI_Copy_buffer_init(struct copy_buffer *cb, size_t size);
  void tMPI_Copy_buffer_destroy(struct copy_buffer *cb);
  #endif
  
index 3fd19bd670dc44f3878a567644f92b30ab83c4b5,a0c51674f641cc28a053e6d5376ef46c9436cc99..a0c51674f641cc28a053e6d5376ef46c9436cc99
@@@ -83,7 -83,7 +83,7 @@@ int tMPI_Lock_trylock(tMPI_Lock_t *lock
      return tMPI_Spinlock_trylock(&(lock->lock));
  }
  
- int tMPI_Lock_islocked(const tMPI_Lock_t *lock)
+ int tMPI_Lock_islocked(tMPI_Lock_t *lock)
  {
      return tMPI_Spinlock_islocked(&(lock->lock));
  }
index 5c659d095f64ab1ba555b0d711356a973172b7a1,be248ef25a98c0acc2b7da391edeb5e9900b9e64..be248ef25a98c0acc2b7da391edeb5e9900b9e64
@@@ -141,7 -141,7 +141,7 @@@ void* tMPI_Once_wait(tMPI_Comm comm, vo
  
          tMPI_Atomic_memory_barrier_rel();
          /* signal that we're done */
-         tMPI_Atomic_add_return(&(cev->coll.current_sync), 1);
+         tMPI_Atomic_fetch_add(&(cev->coll.current_sync), 1);
          /* we need to keep being in sync */
          csync->syncs++;
      }
index c1d565cf81aa51ae1e5b3fc3b3cde5ffa3106b6d,4851b204d98b707f61a3fd8eede90e91ac3dd10e..4851b204d98b707f61a3fd8eede90e91ac3dd10e
@@@ -122,13 -122,17 +122,17 @@@ static void tMPI_Xfer(struct tmpi_threa
  
  
  /* Point-to-point communication protocol functions */
void tMPI_Free_env_list_init(struct free_envelope_list *evl, int N)
int tMPI_Free_env_list_init(struct free_envelope_list *evl, int N)
  {
      int i;
  
      /* allocate the head element */
      evl->recv_alloc_head = (struct envelope*)tMPI_Malloc(sizeof(struct envelope)
                                                           *N);
+     if (evl->recv_alloc_head == NULL)
+     {
+         return TMPI_ERR_NO_MEM;
+     }
      evl->head_recv = evl->recv_alloc_head;
  
      for (i = 0; i < N; i++)
          evl->head_recv[i].rlist = NULL;
          evl->head_recv[i].slist = NULL;
      }
+     return TMPI_SUCCESS;
  }
  
  void tMPI_Free_env_list_destroy(struct free_envelope_list *evl)
@@@ -159,9 -164,8 +164,8 @@@ static struct envelope* tMPI_Free_env_l
      struct envelope *ret;
      if (!evl->head_recv)
      {
-         /* TODO: make this do something better than crash */
-         fprintf(stderr, "Ran out of recv envelopes!!!!\n");
-         abort();
+         tMPI_Error(TMPI_COMM_WORLD, TMPI_ERR_ENVELOPES);
+         return NULL;
      }
  
      ret            = evl->head_recv;
@@@ -194,7 -198,7 +198,7 @@@ static void tMPI_Free_env_list_return_r
  
  /* tmpi_send_envelope_list functions */
  
void tMPI_Send_env_list_init(struct send_envelope_list *evl, int N)
int tMPI_Send_env_list_init(struct send_envelope_list *evl, int N)
  {
      int i;
  #ifndef TMPI_LOCK_FREE_LISTS
      evl->Nalloc = N;
  
      evl->alloc_head = (struct envelope*)tMPI_Malloc(sizeof(struct envelope)*N);
+     if (evl->alloc_head == NULL)
+     {
+         return TMPI_ERR_NO_MEM;
+     }
      for (i = 0; i < N; i++)
      {
          evl->alloc_head[i].next  = (i < (N-1)) ? &(evl->alloc_head[i+1]) : NULL;
          evl->alloc_head[i].slist = evl;
          evl->alloc_head[i].rlist = NULL;
  #ifdef USE_SEND_RECV_COPY_BUFFER
-         evl->alloc_head[i].cb = (void*)tMPI_Malloc(sizeof(char)*COPY_BUFFER_SIZE);
+         evl->alloc_head[i].cb = (void*)tMPI_Malloc(sizeof(char)*
+                                                    COPY_BUFFER_SIZE);
+         if (evl->alloc_head[i].cb == NULL)
+         {
+             return TMPI_ERR_NO_MEM;
+         }
  #endif
      }
  
      evl->head_old       = evl->alloc_head; /* the first element is a dummy */
      evl->head_old->next = evl->head_old;
      evl->head_old->prev = evl->head_old;
+     return TMPI_SUCCESS;
  }
  
  void tMPI_Send_env_list_destroy(struct send_envelope_list *evl)
@@@ -295,7 -309,10 +309,10 @@@ static struct envelope* tMPI_Send_env_l
          {
              /* There are no free send envelopes, so all we can do is handle
                 incoming requests until we get a free send envelope. */
+ #if defined(TMPI_DEBUG)  || defined(TMPI_WARNINGS)
              printf("Ran out of send envelopes!!\n");
+             fflush(stdout);
+ #endif
              tMPI_Wait_process_incoming(tMPI_Get_current());
          }
  #else
                 calling program. We could fix the situation by waiting,
                 but that would most likely lead to deadlocks - even
                 more difficult to debug than this. */
-             fprintf(stderr, "Ran out of send envelopes!!!!\n");
-             abort();
+             tMPI_Error(TMPI_COMM_WORLD, TMPI_ERR_ENVELOPES);
+             return NULL;
          }
  #endif
      }
@@@ -451,11 -468,13 +468,13 @@@ static void tMPI_Send_env_list_move_to_
  
  /* tmpi_recv_envelope_list functions */
  
void tMPI_Recv_env_list_init(struct recv_envelope_list *evl)
int tMPI_Recv_env_list_init(struct recv_envelope_list *evl)
  {
      evl->head       = &(evl->dummy);
      evl->head->prev = evl->head;
      evl->head->next = evl->head;
+     return TMPI_SUCCESS;
  }
  
  void tMPI_Recv_env_list_destroy(struct recv_envelope_list *evl)
@@@ -499,12 -518,16 +518,16 @@@ static void tMPI_Recv_env_list_remove(s
  
  /* tmpi_req functions */
  
void tMPI_Req_list_init(struct req_list *rl, int N_reqs)
int tMPI_Req_list_init(struct req_list *rl, int N_reqs)
  {
      int i;
  
      rl->alloc_head = (struct tmpi_req_*)tMPI_Malloc(
                  sizeof(struct tmpi_req_)*N_reqs);
+     if (rl->alloc_head == 0)
+     {
+         return TMPI_ERR_NO_MEM;
+     }
      rl->head = rl->alloc_head;
      for (i = 0; i < N_reqs; i++)
      {
              rl->head[i].next = &(rl->head[i+1]);
          }
      }
+     return TMPI_SUCCESS;
  }
  
  void tMPI_Req_list_destroy(struct req_list *rl)
@@@ -650,8 -674,8 +674,8 @@@ tmpi_bool tMPI_Envelope_matches(const s
           ( (!rev->src)  || (rev->src == sev->src) ) &&
           ( sev->dest == rev->dest ) &&
           ( sev->datatype == rev->datatype ) &&
-          ( sev->state.value < env_finished  &&
-            rev->state.value == env_unmatched ) )
+          ( tMPI_Atomic_get(&(sev->state)) < env_finished  &&
+            tMPI_Atomic_get(&(rev->state)) == env_unmatched ) )
      {
  #ifdef TMPI_DEBUG
          printf("%5d: (%d->%d) tag=%d found match\n",
@@@ -749,8 -773,12 +773,12 @@@ void tMPI_Send_copy_buffer(struct envel
         we first need to wait until the receiver is finished copying. We
         know this is a short wait (since the buffer was small enough to be
         buffered in the first place), so we just spin-wait.  */
+     tMPI_Atomic_memory_barrier(); /* a full barrier to make sure that the
+                                      sending doesn't interfere with the
+                                      waiting */
      while (tMPI_Atomic_get( &(sev->state) ) < env_cb_available)
      {
+         tMPI_Atomic_memory_barrier_acq();
      }
      tMPI_Atomic_memory_barrier_acq();
  #ifdef TMPI_DEBUG
@@@ -776,6 -804,10 +804,10 @@@ struct envelope* tMPI_Prep_send_envelop
  {
      /* get an envelope from the send-envelope stack */
      struct envelope *ev = tMPI_Send_env_list_fetch_new( evl );
+     if (ev == NULL)
+     {
+         return NULL;
+     }
  
      ev->tag      = tag;
      ev->nonblock = nonblock;
@@@ -821,6 -853,10 +853,10 @@@ struct envelope* tMPI_Prep_recv_envelop
  {
      /* get an envelope from the stack */
      struct envelope *ev = tMPI_Free_env_list_fetch_recv( &(cur->envelopes) );
+     if (ev == NULL)
+     {
+         return NULL;
+     }
  
      ev->tag      = tag;
      ev->nonblock = nonblock;
@@@ -931,7 -967,7 +967,7 @@@ static void tMPI_Xfer(struct tmpi_threa
      tMPI_Atomic_set( &(sev->state), env_finished);
  
      /* signal to a potentially waiting thread that we're done. */
-     tMPI_Atomic_add_return( &(rev->src->ev_outgoing_received), 1);
+     tMPI_Atomic_fetch_add( &(rev->src->ev_outgoing_received), 1);
      tMPI_Event_signal(&(rev->src->p2p_event));
  
      /* remove the receiving envelope if it's in a list */
@@@ -978,6 -1014,10 +1014,10 @@@ struct envelope* tMPI_Post_match_recv(s
      /* reserve an envelope to post */
      rev = tMPI_Prep_recv_envelope(cur, comm, src, dest, recv_buf, recv_count,
                                    datatype, tag, nonblock);
+     if (rev == NULL)
+     {
+         return NULL;
+     }
  
  #ifdef TMPI_DEBUG
      printf("%5d: tMPI_Post_match_recv (%d->%d, tag=%d) started\n",
@@@ -1048,6 -1088,10 +1088,10 @@@ struct envelope *tMPI_Post_send(struct 
      /* reserve an envelope to post */
      sev = tMPI_Prep_send_envelope(sevl, comm, src, dest, send_buf, send_count,
                                    datatype, tag, nonblock);
+     if (sev == NULL)
+     {
+         return NULL;
+     }
  
  #ifdef TMPI_DEBUG
      printf("%5d: tMPI_Post_send (%d->%d, tag=%d)\n",
@@@ -1084,7 -1128,7 +1128,7 @@@ void tMPI_Wait_process_incoming(struct 
      tMPI_Profile_wait_stop(cur, TMPIWAIT_P2p);
  #endif
      n_handled = tMPI_Atomic_get(&(cur->ev_outgoing_received));
-     tMPI_Atomic_add_return( &(cur->ev_outgoing_received), -n_handled);
+     tMPI_Atomic_fetch_add( &(cur->ev_outgoing_received), -n_handled);
      check_id -= n_handled;
  
      if (check_id > 0)
index b2a305cfee8324b8140fe161f4bdaa7aa28d0a3a,3d0379cac7fc6b328dd2f930ce69ef747d9e473a..3d0379cac7fc6b328dd2f930ce69ef747d9e473a
@@@ -87,6 -87,10 +87,10 @@@ int tMPI_Send(void* buf, int count, tMP
      }
  
      sev = tMPI_Post_send(cur, comm, send_dst, buf, count, datatype, tag, FALSE);
+     if (sev == NULL)
+     {
+         return TMPI_ERR_ENVELOPES;
+     }
      tMPI_Req_init(&req, sev);
      tMPI_Wait_single(cur, &req);
  
@@@ -130,6 -134,10 +134,10 @@@ int tMPI_Recv(void* buf, int count, tMP
  
      rev = tMPI_Post_match_recv(cur, comm, recv_src, buf, count, datatype, tag,
                                 FALSE);
+     if (rev == NULL)
+     {
+         return TMPI_ERR_ENVELOPES;
+     }
      tMPI_Req_init(&req, rev);
      tMPI_Wait_single(cur, &req);
  
@@@ -185,10 -193,18 +193,18 @@@ int tMPI_Sendrecv(void *sendbuf, int se
      /* we first prepare to send */
      sev = tMPI_Post_send(cur, comm, send_dst, sendbuf, sendcount,
                           sendtype, sendtag, FALSE);
+     if (sev == NULL)
+     {
+         return TMPI_ERR_ENVELOPES;
+     }
      tMPI_Req_init(&sreq, sev);
      /* the we prepare to receive */
      rev = tMPI_Post_match_recv(cur, comm, recv_src, recvbuf, recvcount,
                                 recvtype, recvtag, FALSE);
+     if (rev == NULL)
+     {
+         return TMPI_ERR_ENVELOPES;
+     }
      tMPI_Req_init(&rreq, rev);
  
      /* fix the pointers */
@@@ -256,6 -272,10 +272,10 @@@ int tMPI_Isend(void* buf, int count, tM
          return tMPI_Error(comm, TMPI_ERR_SEND_DEST);
      }
      ev = tMPI_Post_send(cur, comm, send_dst, buf, count, datatype, tag, TRUE);
+     if (ev == NULL)
+     {
+         return TMPI_ERR_ENVELOPES;
+     }
      tMPI_Req_init(rq, ev);
      *request = rq;
  
@@@ -299,6 -319,10 +319,10 @@@ int tMPI_Irecv(void* buf, int count, tM
      }
      ev = tMPI_Post_match_recv(cur, comm, recv_src, buf, count, datatype, tag,
                                TRUE);
+     if (ev == NULL)
+     {
+         return TMPI_ERR_ENVELOPES;
+     }
      tMPI_Req_init(rq, ev);
      *request = rq;
  #ifdef TMPI_PROFILE
index a7de36be2c10a5d059f5c8bc19b519ca8574880f,c92e0fad25cdf00fc9385a988265705ebbe2ccbf..c92e0fad25cdf00fc9385a988265705ebbe2ccbf
@@@ -120,7 -120,7 +120,7 @@@ const char *tmpi_waitfn_names[] 
     stage about empty object files */
  #ifdef TMPI_PROFILE
  
- void tMPI_Profile_init(struct tmpi_profile *prof)
+ int tMPI_Profile_init(struct tmpi_profile *prof)
  {
      int i;
  
      {
          prof->wait_cycles[i] = 0;
      }
-     prof->global_start = tmpi_cycles_read();
+     prof->global_start = tMPI_Cycles_read();
      prof->global_stop  = 0;
      prof->wait_start   = 0;
  #endif
      prof->total_p2p_xfers     = 0;
      prof->total_coll_xfers    = 0;
      tMPI_Profile_started      = 1;
+     return TMPI_SUCCESS;
  }
  
  
@@@ -162,7 -164,7 +164,7 @@@ void tMPI_Profile_destroy(struct tmpi_p
  void tMPI_Profile_stop(struct tmpi_profile *prof)
  {
  #ifdef TMPI_CYCLE_COUNT
-     prof->global_stop = tmpi_cycles_read();
+     prof->global_stop = tMPI_Cycles_read();
  #endif
      tMPI_Profile_started = 0;
  }
index 3d763f76b5883f7af9be1b6c527f25b3c3da2a42,b4066df3959af7ac4b16e87eac209274b9426019..b4066df3959af7ac4b16e87eac209274b9426019
@@@ -107,17 -107,17 +107,17 @@@ struct tmpi_profil
  
  #ifdef TMPI_CYCLE_COUNT
      /* cycle counters */
-     tmpi_cycles_t mpifn_cycles[TMPIFN_Nfunctions]; /* array of cycle counters */
-     tmpi_cycles_t wait_cycles[TMPIWAIT_N];         /* the wait cycles */
+     tMPI_Cycles_t mpifn_cycles[TMPIFN_Nfunctions]; /* array of cycle counters */
+     tMPI_Cycles_t wait_cycles[TMPIWAIT_N];         /* the wait cycles */
  
-     tmpi_cycles_t global_start, global_stop;       /* timing start and stop times */
-     tmpi_cycles_t mpifn_start;                     /* individual timing start times for profiling
+     tMPI_Cycles_t global_start, global_stop;       /* timing start and stop times */
+     tMPI_Cycles_t mpifn_start;                     /* individual timing start times for profiling
                                                        function call times.  This can be here
                                                        because tmpi_profile is thread-specific. */
      enum tmpi_functions fn;                        /* the function being cycle-counted */
  
  
-     tmpi_cycles_t wait_start; /* individual timing start times for profiling
+     tMPI_Cycles_t wait_start; /* individual timing start times for profiling
                                   wait times. */
  
      double totals;            /* totals counter for reporting end results */
  extern int tMPI_Profile_started;
  
  /* initialize the profile counter */
- void tMPI_Profile_init(struct tmpi_profile *prof);
+ int tMPI_Profile_init(struct tmpi_profile *prof);
  
  #if 0
  /* deallocations */
@@@ -143,7 -143,7 +143,7 @@@ void tMPI_Profile_stop(struct tmpi_prof
  /* start */
  #ifdef TMPI_CYCLE_COUNT
  /*void tMPI_Profile_count_start(struct tmpi_thread *th);*/
- #define tMPI_Profile_count_start(th) { th->profile.mpifn_start = tmpi_cycles_read(); }
+ #define tMPI_Profile_count_start(th) { th->profile.mpifn_start = tMPI_Cycles_read(); }
  #else
  #define tMPI_Profile_count_start(th) {}
  #endif
  #ifdef TMPI_CYCLE_COUNT
  #define tMPI_Profile_count_stop(th, fn) \
      { \
-         tmpi_cycles_t stop = tmpi_cycles_read(); \
+         tMPI_Cycles_t stop = tMPI_Cycles_read(); \
          th->profile.mpifn_cycles[fn] += (stop - th->profile.mpifn_start); \
          (th->profile.mpifn_calls[fn])++; \
      }
  /*void tMPI_Profile_wait_start(struct tmpi_thread *th);*/
  #define tMPI_Profile_wait_start(th) \
      { \
-         th->profile.wait_start = tmpi_cycles_read(); \
+         th->profile.wait_start = tMPI_Cycles_read(); \
      }
  
  /* stop waiting cycle count */
                              enum tmpi_wait_functions fn);*/
  #define tMPI_Profile_wait_stop(th, fn) \
      { \
-         tmpi_cycles_t wait_stop = tmpi_cycles_read(); \
+         tMPI_Cycles_t wait_stop = tMPI_Cycles_read(); \
          th->profile.wait_cycles[fn] += (wait_stop - th->profile.wait_start); \
      }
  #else
index 948ae78aa72b3de39b3c18ad28579e89323df6b3,85c25c99620335747b86935fee028f08b1d9cfea..85c25c99620335747b86935fee028f08b1d9cfea
@@@ -94,20 -94,6 +94,6 @@@ static int             thread_id_key_in
  
  
  
- /* TODO: this needs to go away!  (there's another one in winthreads.c)
-    fatal errors are thankfully really rare*/
- void tMPI_Fatal_error(const char *file, int line, const char *message, ...)
- {
-     va_list ap;
-     fprintf(stderr, "tMPI Fatal error in %s, line %d: ", file, line);
-     va_start(ap, message);
-     vfprintf(stderr, message, ap);
-     va_end(ap);
-     fprintf(stderr, "\n");
-     abort();
- }
  
  enum tMPI_Thread_support tMPI_Thread_support(void)
  {
@@@ -146,22 -132,46 +132,46 @@@ static void tMPI_Destroy_thread_id(void
  }
  
  /* initialize the thread id vars if not already initialized */
- static void tMPI_Init_thread_ids(void)
+ static int tMPI_Init_thread_ids(void)
  {
-     pthread_mutex_lock( &thread_id_mutex );
+     int ret;
+     ret = pthread_mutex_lock( &thread_id_mutex );
+     if (ret != 0)
+     {
+         return ret;
+     }
      if (!thread_id_key_initialized)
      {
          /* initialize and set the thread id thread-specific variable */
          struct tMPI_Thread *main_thread;
  
          thread_id_key_initialized = 1;
-         pthread_key_create(&thread_id_key, tMPI_Destroy_thread_id);
-         main_thread                  = (struct tMPI_Thread*)malloc(sizeof(struct tMPI_Thread)*1);
+         ret = pthread_key_create(&thread_id_key, tMPI_Destroy_thread_id);
+         if (ret != 0)
+         {
+             goto err;
+         }
+         main_thread = (struct tMPI_Thread*)malloc(sizeof(struct tMPI_Thread)*1);
+         if (main_thread == NULL)
+         {
+             ret = ENOMEM;
+             goto err;
+         }
          main_thread->th              = pthread_self();
          main_thread->started_by_tmpi = 0;
-         pthread_setspecific(thread_id_key, main_thread);
+         ret = pthread_setspecific(thread_id_key, main_thread);
+         if (ret != 0)
+         {
+             goto err;
+         }
      }
+     ret = pthread_mutex_unlock( &thread_id_mutex );
+     return ret;
+ err:
      pthread_mutex_unlock( &thread_id_mutex );
+     return ret;
  }
  
  /* structure to hold the arguments for the thread_starter function */
@@@ -178,8 -188,13 +188,13 @@@ static void *tMPI_Thread_starter(void *
      struct tMPI_Thread_starter *starter = (struct tMPI_Thread_starter *)arg;
      void *(*start_routine)(void*);
      void *parg;
+     int   ret;
  
-     pthread_setspecific(thread_id_key, starter->thread);
+     ret = pthread_setspecific(thread_id_key, starter->thread);
+     if (ret != 0)
+     {
+         return NULL;
+     }
      start_routine = starter->start_routine;
      parg          = starter->arg;
  
@@@ -195,34 -210,31 +210,31 @@@ int tMPI_Thread_create(tMPI_Thread_t *t
  
      if (thread == NULL)
      {
-         tMPI_Fatal_error(TMPI_FARGS, "Invalid thread pointer.");
          return EINVAL;
      }
      tMPI_Init_thread_ids();
  
-     *thread                    = (struct tMPI_Thread*)malloc(sizeof(struct tMPI_Thread)*1);
+     *thread = (struct tMPI_Thread*)malloc(sizeof(struct tMPI_Thread)*1);
+     if (*thread == NULL)
+     {
+         return ENOMEM;
+     }
      (*thread)->started_by_tmpi = 1;
      starter                    = (struct tMPI_Thread_starter*)
          malloc(sizeof(struct tMPI_Thread_starter)*1);
+     if (starter == NULL)
+     {
+         return ENOMEM;
+     }
      /* fill the starter structure */
      starter->thread        = *thread;
      starter->start_routine = start_routine;
      starter->arg           = arg;
  
-     /*ret=pthread_create(&((*thread)->th),NULL,start_routine,arg);*/
      ret = pthread_create(&((*thread)->th), NULL, tMPI_Thread_starter,
                           (void*)starter);
  
-     if (ret != 0)
-     {
-         /* Cannot use tMPI_error() since messages use threads for locking */
-         tMPI_Fatal_error(TMPI_FARGS, "Failed to create POSIX thread:%s, rc=%d",
-                          strerror(errno), ret);
-         /* Use system memory allocation routines */
-         return -1;
-     }
-     return 0;
+     return ret;
  }
  
  
@@@ -232,23 -244,27 +244,27 @@@ int tMPI_Thread_join(tMPI_Thread_t thre
      int       ret;
      pthread_t th = thread->th;
  
      ret = pthread_join( th, value_ptr );
-     free(thread);
      if (ret != 0)
      {
-         tMPI_Fatal_error(TMPI_FARGS, "Failed to join POSIX thread. rc=%d", ret);
+         return ret;
      }
-     return ret;
+     free(thread);
+     return 0;
  }
  
  
  tMPI_Thread_t tMPI_Thread_self(void)
  {
      tMPI_Thread_t th;
+     int           ret;
      /* make sure the key var is set */
-     tMPI_Init_thread_ids();
+     ret = tMPI_Init_thread_ids();
+     if (ret != 0)
+     {
+         return NULL;
+     }
  
      th = pthread_getspecific(thread_id_key);
  
      if (th == NULL)
      {
          /* if not, create an ID, set it and return it */
-         th                  = (struct tMPI_Thread*)malloc(sizeof(struct tMPI_Thread)*1);
-         th->started_by_tmpi = 0;
+         th = (struct tMPI_Thread*)malloc(sizeof(struct tMPI_Thread)*1);
+         if (th == NULL)
+         {
+             return NULL;
+         }
          th->th              = pthread_self();
+         th->started_by_tmpi = 0;
+         /* we ignore errors here because they're not important -
+            the next iteration will do the same thing. */
          pthread_setspecific(thread_id_key, th);
      }
      return th;
@@@ -323,32 -345,60 +345,60 @@@ int tMPI_Thread_mutex_init(tMPI_Thread_
          return EINVAL;
      }
  
-     mtx->mutex = (struct tMPI_Mutex*)tMPI_Malloc(sizeof(struct tMPI_Mutex)*1);
-     ret        = pthread_mutex_init(&(mtx->mutex->mtx), NULL);
+     mtx->mutex = (struct tMPI_Mutex*)malloc(sizeof(struct tMPI_Mutex)*1);
+     if (mtx->mutex == NULL)
+     {
+         return ENOMEM;
+     }
+     ret = pthread_mutex_init(&(mtx->mutex->mtx), NULL);
      if (ret != 0)
      {
-         tMPI_Fatal_error(TMPI_FARGS, "Error initializing POSIX mutex. rc=%d");
-         /* Use system memory allocation routines */
          return ret;
      }
  
+ #ifndef TMPI_NO_ATOMICS
      tMPI_Atomic_set(&(mtx->initialized), 1);
+ #else
+     mtx->initialized.value = 1;
+ #endif
      return 0;
  }
  
- static int tMPI_Thread_mutex_init_once(tMPI_Thread_mutex_t *mtx)
+ static inline int tMPI_Thread_mutex_init_once(tMPI_Thread_mutex_t *mtx)
  {
      int ret = 0;
  
-     /* we're relying on the memory barrier semantics of mutex_lock/unlock
-        for the check preceding this function call to have worked */
-     pthread_mutex_lock( &(mutex_init) );
-     if (mtx->mutex == NULL)
+ #ifndef TMPI_NO_ATOMICS
+     /* check whether the mutex is initialized */
+     if (tMPI_Atomic_get( &(mtx->initialized)  ) == 0)
+ #endif
      {
-         mtx->mutex = (struct tMPI_Mutex*)tMPI_Malloc(sizeof(struct tMPI_Mutex)*1);
-         ret        = pthread_mutex_init( &(mtx->mutex->mtx), NULL);
+         /* we're relying on the memory barrier semantics of mutex_lock/unlock
+            for the check preceding this function call to have worked */
+         ret = pthread_mutex_lock( &(mutex_init) );
+         if (ret != 0)
+         {
+             return ret;
+         }
+         if (mtx->mutex == NULL)
+         {
+             mtx->mutex = (struct tMPI_Mutex*)malloc(sizeof(struct tMPI_Mutex));
+             if (mtx->mutex == NULL)
+             {
+                 ret = ENOMEM;
+                 goto err;
+             }
+             ret = pthread_mutex_init( &(mtx->mutex->mtx), NULL);
+             if (ret != 0)
+             {
+                 goto err;
+             }
+         }
      }
+     ret = pthread_mutex_unlock( &(mutex_init) );
+     return ret;
+ err:
      pthread_mutex_unlock( &(mutex_init) );
      return ret;
  }
@@@ -364,13 -414,11 +414,11 @@@ int tMPI_Thread_mutex_destroy(tMPI_Thre
      }
  
      ret = pthread_mutex_destroy( &(mtx->mutex->mtx) );
-     free(mtx->mutex);
      if (ret != 0)
      {
-         tMPI_Fatal_error(TMPI_FARGS, "Error destroying POSIX mutex. rc=%d", ret);
-         /* Use system memory allocation routines */
+         return ret;
      }
+     free(mtx->mutex);
      return ret;
  }
  
@@@ -381,17 -429,13 +429,13 @@@ int tMPI_Thread_mutex_lock(tMPI_Thread_
      int ret;
  
      /* check whether the mutex is initialized */
-     if (tMPI_Atomic_get( &(mtx->initialized)  ) == 0)
+     ret = tMPI_Thread_mutex_init_once(mtx);
+     if (ret != 0)
      {
-         ret = tMPI_Thread_mutex_init_once(mtx);
-         if (ret)
-         {
-             return ret;
-         }
+         return ret;
      }
  
      ret = pthread_mutex_lock(&(mtx->mutex->mtx));
      return ret;
  }
  
@@@ -403,17 -447,13 +447,13 @@@ int tMPI_Thread_mutex_trylock(tMPI_Thre
      int ret;
  
      /* check whether the mutex is initialized */
-     if (tMPI_Atomic_get( &(mtx->initialized)  ) == 0)
+     ret = tMPI_Thread_mutex_init_once(mtx);
+     if (ret != 0)
      {
-         ret = tMPI_Thread_mutex_init_once(mtx);
-         if (ret)
-         {
-             return ret;
-         }
+         return ret;
      }
  
      ret = pthread_mutex_trylock(&(mtx->mutex->mtx));
      return ret;
  }
  
@@@ -424,17 -464,13 +464,13 @@@ int tMPI_Thread_mutex_unlock(tMPI_Threa
      int ret;
  
      /* check whether the mutex is initialized */
-     if (tMPI_Atomic_get( &(mtx->initialized)  ) == 0)
+     ret = tMPI_Thread_mutex_init_once(mtx);
+     if (ret != 0)
      {
-         ret = tMPI_Thread_mutex_init_once(mtx);
-         if (ret)
-         {
-             return ret;
-         }
+         return ret;
      }
  
      ret = pthread_mutex_unlock(&(mtx->mutex->mtx));
      return ret;
  }
  
@@@ -446,19 -482,20 +482,20 @@@ int tMPI_Thread_key_create(tMPI_Thread_
  
      if (key == NULL)
      {
-         tMPI_Fatal_error(TMPI_FARGS, "Invalid key pointer.");
          return EINVAL;
      }
  
  
-     key->key = (struct tMPI_Thread_key*)tMPI_Malloc(sizeof(struct
-                                                            tMPI_Thread_key)*1);
+     key->key = (struct tMPI_Thread_key*)malloc(sizeof(struct
+                                                       tMPI_Thread_key)*1);
+     if (key->key == NULL)
+     {
+         return ENOMEM;
+     }
      ret = pthread_key_create(&((key)->key->pkey), destructor);
      if (ret != 0)
      {
-         tMPI_Fatal_error(TMPI_FARGS, "Failed to create thread key, rc=%d.", ret);
-         fflush(stderr);
-         return -1;
+         return ret;
      }
  
      tMPI_Atomic_set(&(key->initialized), 1);
@@@ -471,15 -508,13 +508,13 @@@ int tMPI_Thread_key_delete(tMPI_Thread_
      int ret;
  
      ret = pthread_key_delete((key.key->pkey));
-     free(key.key);
      if (ret != 0)
      {
-         tMPI_Fatal_error(TMPI_FARGS, "Failed to delete thread key, rc=%d.", ret);
-         fflush(stderr);
+         return ret;
      }
+     free(key.key);
  
-     return ret;
+     return 0;
  }
  
  
@@@ -514,7 -549,8 +549,8 @@@ int tMPI_Thread_once(tMPI_Thread_once_
      }
  
      /* really ugly hack - and it's slow... */
-     if ( (ret = pthread_mutex_lock( &once_init )) )
+     ret = pthread_mutex_lock( &once_init );
+     if (ret != 0)
      {
          return ret;
      }
          (*init_routine)();
          tMPI_Atomic_set(&(once_control->once), 1);
      }
-     pthread_mutex_unlock( &once_init );
+     ret = pthread_mutex_unlock( &once_init );
  
-     return 0;
+     return ret;
  }
  
  
@@@ -540,17 -576,22 +576,22 @@@ int tMPI_Thread_cond_init(tMPI_Thread_c
          return EINVAL;
      }
  
-     cond->condp = (struct tMPI_Thread_cond*)
-         tMPI_Malloc(sizeof(struct tMPI_Thread_cond)*1);
-     ret = pthread_cond_init(&(cond->condp->cond), NULL);
+     cond->condp = (struct tMPI_Thread_cond*)malloc(
+                 sizeof(struct tMPI_Thread_cond));
+     if (cond->condp == NULL)
+     {
+         return ENOMEM;
+     }
  
+     ret = pthread_cond_init(&(cond->condp->cond), NULL);
      if (ret != 0)
      {
-         tMPI_Fatal_error(TMPI_FARGS, "Error initializing POSIX condition variable. rc=%d", ret);
-         fflush(stderr);
+         return ret;
      }
      tMPI_Atomic_set(&(cond->initialized), 1);
-     return ret;
+     tMPI_Atomic_memory_barrier();
+     return 0;
  }
  
  
@@@ -560,13 -601,30 +601,30 @@@ static int tMPI_Thread_cond_init_once(t
  
      /* we're relying on the memory barrier semantics of mutex_lock/unlock
         for the check preceding this function call to have worked */
-     pthread_mutex_lock( &(cond_init) );
+     ret = pthread_mutex_lock( &(cond_init) );
+     if (ret != 0)
+     {
+         return ret;
+     }
      if (cond->condp == NULL)
      {
          cond->condp = (struct tMPI_Thread_cond*)
-             tMPI_Malloc(sizeof(struct tMPI_Thread_cond)*1);
+             malloc(sizeof(struct tMPI_Thread_cond)*1);
+         if (cond->condp == NULL)
+         {
+             ret = ENOMEM;
+             goto err;
+         }
          ret = pthread_cond_init( &(cond->condp->cond), NULL);
+         if (ret != 0)
+         {
+             goto err;
+         }
      }
+     ret = pthread_mutex_unlock( &(cond_init) );
+     return ret;
+ err:
+     /* try to unlock anyway */
      pthread_mutex_unlock( &(cond_init) );
      return ret;
  }
@@@ -583,16 -641,13 +641,13 @@@ int tMPI_Thread_cond_destroy(tMPI_Threa
      }
  
      ret = pthread_cond_destroy(&(cond->condp->cond));
-     free(cond->condp);
      if (ret != 0)
      {
-         tMPI_Fatal_error(TMPI_FARGS,
-                          "Error destroying POSIX condition variable. rc=%d",
-                          ret);
-         fflush(stderr);
+         return ret;
      }
-     return ret;
+     free(cond->condp);
+     return 0;
  }
  
  
@@@ -603,7 -658,11 +658,11 @@@ int tMPI_Thread_cond_wait(tMPI_Thread_c
      /* check whether the condition is initialized */
      if (tMPI_Atomic_get( &(cond->initialized)  ) == 0)
      {
-         tMPI_Thread_cond_init_once(cond);
+         ret = tMPI_Thread_cond_init_once(cond);
+         if (ret != 0)
+         {
+             return ret;
+         }
      }
      /* the mutex must have been initialized because it should be locked here */
  
@@@ -622,7 -681,11 +681,11 @@@ int tMPI_Thread_cond_signal(tMPI_Thread
      /* check whether the condition is initialized */
      if (tMPI_Atomic_get( &(cond->initialized)  ) == 0)
      {
-         tMPI_Thread_cond_init_once(cond);
+         ret = tMPI_Thread_cond_init_once(cond);
+         if (ret != 0)
+         {
+             return ret;
+         }
      }
  
      ret = pthread_cond_signal( &(cond->condp->cond) );
@@@ -639,7 -702,11 +702,11 @@@ int tMPI_Thread_cond_broadcast(tMPI_Thr
      /* check whether the condition is initialized */
      if (tMPI_Atomic_get( &(cond->initialized)  ) == 0)
      {
-         tMPI_Thread_cond_init_once(cond);
+         ret = tMPI_Thread_cond_init_once(cond);
+         if (ret != 0)
+         {
+             return ret;
+         }
      }
  
      ret = pthread_cond_broadcast( &(cond->condp->cond) );
@@@ -675,23 -742,21 +742,21 @@@ int tMPI_Thread_barrier_init(tMPI_Threa
      }
  
      barrier->barrierp = (struct tMPI_Thread_barrier*)
-         tMPI_Malloc(sizeof(struct tMPI_Thread_barrier)*1);
-     ret = pthread_mutex_init(&(barrier->barrierp->mutex), NULL);
+         malloc(sizeof(struct tMPI_Thread_barrier)*1);
+     if (barrier->barrierp == NULL)
+     {
+         return ENOMEM;
+     }
  
+     ret = pthread_mutex_init(&(barrier->barrierp->mutex), NULL);
      if (ret != 0)
      {
-         tMPI_Fatal_error(TMPI_FARGS, "Error initializing POSIX mutex. rc=%d",
-                          ret);
          return ret;
      }
  
      ret = pthread_cond_init(&(barrier->barrierp->cv), NULL);
      if (ret != 0)
      {
-         tMPI_Fatal_error(TMPI_FARGS,
-                          "Error initializing POSIX condition variable. rc=%d",
-                          ret);
          return ret;
      }
  
@@@ -709,30 -774,39 +774,39 @@@ static int tMPI_Thread_barrier_init_onc
  
      /* we're relying on the memory barrier semantics of mutex_lock/unlock
         for the check preceding this function call to have worked */
-     pthread_mutex_lock( &(barrier_init) );
+     ret = pthread_mutex_lock( &(barrier_init) );
+     if (ret != 0)
+     {
+         return ret;
+     }
      if (barrier->barrierp == NULL)
      {
          barrier->barrierp = (struct tMPI_Thread_barrier*)
-             tMPI_Malloc(sizeof(struct tMPI_Thread_barrier)*1);
+             malloc(sizeof(struct tMPI_Thread_barrier)*1);
+         if (barrier->barrierp == NULL)
+         {
+             ret = ENOMEM;
+             goto err;
+         }
          ret = pthread_mutex_init(&(barrier->barrierp->mutex), NULL);
  
          if (ret != 0)
          {
-             tMPI_Fatal_error(TMPI_FARGS, "Error initializing POSIX mutex. rc=%d",
-                              ret);
-             return ret;
+             goto err;
          }
  
          ret = pthread_cond_init(&(barrier->barrierp->cv), NULL);
  
          if (ret != 0)
          {
-             tMPI_Fatal_error(TMPI_FARGS,
-                              "Error initializing POSIX condition variable. rc=%d",
-                              ret);
-             return ret;
+             goto err;
          }
      }
+     ret = pthread_mutex_unlock( &(barrier_init) );
+     return ret;
+ err:
      pthread_mutex_unlock( &(barrier_init) );
      return ret;
  }
  
  int tMPI_Thread_barrier_destroy(tMPI_Thread_barrier_t *barrier)
  {
+     int ret;
      if (barrier == NULL)
      {
          return EINVAL;
      }
  
-     pthread_mutex_destroy(&(barrier->barrierp->mutex));
-     pthread_cond_destroy(&(barrier->barrierp->cv));
+     ret = pthread_mutex_destroy(&(barrier->barrierp->mutex));
+     if (ret != 0)
+     {
+         return ret;
+     }
+     ret = pthread_cond_destroy(&(barrier->barrierp->cv));
+     if (ret != 0)
+     {
+         return ret;
+     }
  
      free(barrier->barrierp);
  
  }
  
  
- int tMPI_Thread_barrier_wait(tMPI_Thread_barrier_t *   barrier)
+ int tMPI_Thread_barrier_wait(tMPI_Thread_barrier_t * barrier)
  {
-     int    cycle;
-     int    rc;
+     int cycle;
+     int ret;
  
      /* check whether the barrier is initialized */
      if (tMPI_Atomic_get( &(barrier->initialized)  ) == 0)
      }
  
  
-     rc = pthread_mutex_lock(&barrier->barrierp->mutex);
-     if (rc != 0)
+     ret = pthread_mutex_lock(&barrier->barrierp->mutex);
+     if (ret != 0)
      {
-         return EBUSY;
+         return ret;
      }
  
      cycle = barrier->cycle;
      {
          barrier->cycle = !barrier->cycle;
          barrier->count = barrier->threshold;
-         rc             = pthread_cond_broadcast(&barrier->barrierp->cv);
+         ret            = pthread_cond_broadcast(&barrier->barrierp->cv);
  
-         if (rc == 0)
+         if (ret == 0)
          {
-             rc = -1;
+             goto err;
          }
      }
      else
      {
          while (cycle == barrier->cycle)
          {
-             rc = pthread_cond_wait(&barrier->barrierp->cv,
-                                    &barrier->barrierp->mutex);
-             if (rc != 0)
+             ret = pthread_cond_wait(&barrier->barrierp->cv,
+                                     &barrier->barrierp->mutex);
+             if (ret != 0)
              {
-                 break;
+                 goto err;
              }
          }
      }
  
+     ret = pthread_mutex_unlock(&barrier->barrierp->mutex);
+     return ret;
+ err:
      pthread_mutex_unlock(&barrier->barrierp->mutex);
-     return rc;
+     return ret;
  }
  
  #else
index dab23f0becdfe0069b0227d48056fe3c17113ffa,807f85a3959255e769b1bd306cf59c7dbace76aa..807f85a3959255e769b1bd306cf59c7dbace76aa
@@@ -149,8 -149,7 +149,7 @@@ int tMPI_Scatter(void* sendbuf, int sen
                          &(tMPI_Get_thread(comm, myrank)->cbl_multi));
              if (cev->met[myrank].cb->size < total_send_size)
              {
-                 fprintf(stderr, "ERROR: cb size too small\n");
-                 exit(1);
+                 return tMPI_Error(comm, TMPI_ERR_COPY_BUFFER_SIZE);
              }
              /* copy to the new buf */
              memcpy(cev->met[myrank].cb->buf, sendbuf, total_send_size);
@@@ -286,8 -285,7 +285,7 @@@ int tMPI_Scatterv(void* sendbuf, int *s
                          &(tMPI_Get_thread(comm, myrank)->cbl_multi));
              if (cev->met[myrank].cb->size < total_send_size)
              {
-                 fprintf(stderr, "ERROR: cb size too small\n");
-                 exit(1);
+                 return tMPI_Error(comm, TMPI_ERR_COPY_BUFFER_SIZE);
              }
              /* copy to the new buf */
              memcpy(cev->met[myrank].cb->buf, sendbuf, total_send_size);
index e6170d497ae80e5a0c51ac541f58c37acde0d9ef,986a516b27caacee090d17b738b0b6ecfd67f7e0..986a516b27caacee090d17b738b0b6ecfd67f7e0
  
  /* whether to warn if there are mallocs at performance-critical sections
     (due to preallocations being too small) */
+ #ifdef TMPI_WARNINGS
+ #define TMPI_WARN_MALLOC
+ #else
  /*#define TMPI_WARN_MALLOC*/
+ #endif
  
  
  /* the number of envelopes to allocate per thread-to-thread path */
index 428aebe256e27483a809e2184a5cc7b9b66de2ad,88bcb84f8bad7f979341e3cf7f97e03376b8a8b5..88bcb84f8bad7f979341e3cf7f97e03376b8a8b5
  #include <cstring>
  #include <cstdlib>
  #include <stdexcept>
+ #include <string>
  #include "thread_mpi/system_error.h"
  
  tMPI::system_error::system_error(error_code ec)
-     : runtime_error(std::strerror(ec)), ec_(ec)
+     : runtime_error(std::string(std::strerror(ec))), ec_(ec)
  {
  }
  
index 4fa6169513aa9fdd0f078c6aab38ccdda6a1c932,4d15b931de4c32bc5aa86bc670e51c67fc2d8846..4d15b931de4c32bc5aa86bc670e51c67fc2d8846
@@@ -98,18 -98,18 +98,18 @@@ struct tmpi_global *tmpi_global = NULL
  
  
  /* start N threads with argc, argv (used by tMPI_Init)*/
- void tMPI_Start_threads(tmpi_bool main_returns, int N,
-                         tMPI_Affinity_strategy aff_strategy,
-                         int *argc, char ***argv,
-                         void (*start_fn)(void*), void *start_arg,
-                         int (*start_fn_main)(int, char**));
+ int tMPI_Start_threads(tmpi_bool main_returns, int N,
+                        tMPI_Affinity_strategy aff_strategy,
+                        int *argc, char ***argv,
+                        void (*start_fn)(void*), void *start_arg,
+                        int (*start_fn_main)(int, char**));
  
  /* starter function for threads; takes a void pointer to a
        struct tmpi_starter_, which calls main() if tmpi_start_.fn == NULL */
  static void* tMPI_Thread_starter(void *arg);
  
  /* allocate and initialize the data associated with a thread structure */
- static void tMPI_Thread_init(struct tmpi_thread *th);
+ static int tMPI_Thread_init(struct tmpi_thread *th);
  /* deallocate the data associated with a thread structure */
  static void tMPI_Thread_destroy(struct tmpi_thread *th);
  
@@@ -123,6 -123,7 +123,7 @@@ void tMPI_Trace_print(const char *fmt, 
      struct tmpi_thread       * th  = NULL;
      static tMPI_Thread_mutex_t mtx = TMPI_THREAD_MUTEX_INITIALIZER;
  
+     /* don't check for errors during trace */
      tMPI_Thread_mutex_lock(&mtx);
      if (threads)
      {
  #endif
  
  
- #if 0
- struct tmpi_thread *tMPI_Get_current(void)
- {
-     if (!threads)
-     {
-         return NULL;
-     }
-     return (struct tmpi_thread*)tMPI_thread_getspecific(id_key);
- }
- unsigned int tMPI_Threadnr(struct tmpi_thread *thr)
- {
-     return thr-threads;
- }
- #endif
- #if 0
- unsigned int tMPI_This_threadnr(void)
- {
-     return tMPI_Get_current()-threads;
- }
- struct tmpi_thread *tMPI_Get_thread(tMPI_Comm comm, int rank)
- {
-     /* check destination */
-     if ( (rank < 0) || (rank > comm->grp.N) )
-     {
-         tMPI_Error(comm, TMPI_ERR_GROUP_RANK);
-         return NULL;
-     }
-     return comm->grp.peers[rank];
- }
- #endif
  tmpi_bool tMPI_Is_master(void)
  {
      /* if there are no other threads, we're the main thread */
@@@ -243,30 -209,55 +209,55 @@@ int tMPI_Get_N(int *argc, char ***argv
      return ret;
  }
  
- static void tMPI_Thread_init(struct tmpi_thread *th)
+ static int tMPI_Thread_init(struct tmpi_thread *th)
  {
+     int ret;
      int N_envelopes      = (Nthreads+1)*N_EV_ALLOC;
      int N_send_envelopes = N_EV_ALLOC;
      int N_reqs           = (Nthreads+1)*N_EV_ALLOC;
      int i;
  
      /* we set our thread id, as a thread-specific piece of global data. */
-     tMPI_Thread_setspecific(id_key, th);
+     ret = tMPI_Thread_setspecific(id_key, th);
+     if (ret != 0)
+     {
+         return ret;
+     }
  
      /* allocate comm.self */
-     th->self_comm               = tMPI_Comm_alloc(TMPI_COMM_WORLD, 1);
+     ret = tMPI_Comm_alloc( &(th->self_comm), TMPI_COMM_WORLD, 1);
+     if (ret != TMPI_SUCCESS)
+     {
+         return ret;
+     }
      th->self_comm->grp.peers[0] = th;
  
      /* allocate envelopes */
-     tMPI_Free_env_list_init( &(th->envelopes), N_envelopes );
+     ret = tMPI_Free_env_list_init( &(th->envelopes), N_envelopes );
+     if (ret != TMPI_SUCCESS)
+     {
+         return ret;
+     }
      /* recv list */
-     tMPI_Recv_env_list_init( &(th->evr));
+     ret = tMPI_Recv_env_list_init( &(th->evr));
+     if (ret != TMPI_SUCCESS)
+     {
+         return ret;
+     }
      /* send lists */
      th->evs = (struct send_envelope_list*)tMPI_Malloc(
                  sizeof(struct send_envelope_list)*Nthreads);
+     if (th->evs == NULL)
+     {
+         return TMPI_ERR_NO_MEM;
+     }
      for (i = 0; i < Nthreads; i++)
      {
-         tMPI_Send_env_list_init( &(th->evs[i]), N_send_envelopes);
+         ret = tMPI_Send_env_list_init( &(th->evs[i]), N_send_envelopes);
+         if (ret != TMPI_SUCCESS)
+         {
+             return ret;
+         }
      }
  
      tMPI_Atomic_set( &(th->ev_outgoing_received), 0);
      tMPI_Event_init( &(th->p2p_event) );
  
      /* allocate requests */
-     tMPI_Req_list_init(&(th->rql), N_reqs);
+     ret = tMPI_Req_list_init(&(th->rql), N_reqs);
+     if (ret != TMPI_SUCCESS)
+     {
+         return ret;
+     }
  
  #ifdef USE_COLLECTIVE_COPY_BUFFER
      /* allocate copy_buffer list */
-     tMPI_Copy_buffer_list_init(&(th->cbl_multi), (Nthreads+1)*(N_COLL_ENV+1),
-                                Nthreads*COPY_BUFFER_SIZE);
+     ret = tMPI_Copy_buffer_list_init(&(th->cbl_multi),
+                                      (Nthreads+1)*(N_COLL_ENV+1),
+                                      Nthreads*COPY_BUFFER_SIZE);
+     if (ret != TMPI_SUCCESS)
+     {
+         return ret;
+     }
  #endif
  
  #ifdef TMPI_PROFILE
-     tMPI_Profile_init(&(th->profile));
+     ret = tMPI_Profile_init(&(th->profile));
+     if (ret != TMPI_SUCCESS)
+     {
+         return ret;
+     }
  #endif
      /* now wait for all other threads to come on line, before we
         start the MPI program */
-     tMPI_Thread_barrier_wait( &(tmpi_global->barrier) );
+     ret = tMPI_Thread_barrier_wait( &(tmpi_global->barrier) );
+     if (ret != 0)
+     {
+         return ret;;
+     }
+     return ret;
  }
  
  
@@@ -315,17 -325,32 +325,32 @@@ static void tMPI_Thread_destroy(struct 
      }
  }
  
- static void tMPI_Global_init(struct tmpi_global *g, int Nthreads)
+ static int tMPI_Global_init(struct tmpi_global *g, int Nthreads)
  {
+     int ret;
      g->usertypes        = NULL;
      g->N_usertypes      = 0;
      g->Nalloc_usertypes = 0;
-     tMPI_Thread_mutex_init(&(g->timer_mutex));
+     ret                 = tMPI_Thread_mutex_init(&(g->timer_mutex));
+     if (ret != 0)
+     {
+         return tMPI_Error(TMPI_COMM_WORLD, TMPI_ERR_IO);
+     }
      tMPI_Spinlock_init(&(g->datatype_lock));
  
-     tMPI_Thread_barrier_init( &(g->barrier), Nthreads);
+     ret = tMPI_Thread_barrier_init( &(g->barrier), Nthreads);
+     if (ret != 0)
+     {
+         return tMPI_Error(TMPI_COMM_WORLD, TMPI_ERR_IO);
+     }
+     ret = tMPI_Thread_mutex_init(&(g->comm_link_lock));
+     if (ret != 0)
+     {
+         return tMPI_Error(TMPI_COMM_WORLD, TMPI_ERR_IO);
+     }
  
-     tMPI_Thread_mutex_init(&(g->comm_link_lock));
  
  #if !(defined( _WIN32 ) || defined( _WIN64 ) )
      /* the time at initialization. */
      /* the time at initialization. */
      g->timer_init = GetTickCount();
  #endif
+     return TMPI_SUCCESS;
  }
  
  static void tMPI_Global_destroy(struct tmpi_global *g)
  
  static void* tMPI_Thread_starter(void *arg)
  {
+     int                 ret;
      struct tmpi_thread *th = (struct tmpi_thread*)arg;
  
  #ifdef TMPI_TRACE
      tMPI_Trace_print("Created thread nr. %d", (int)(th-threads));
  #endif
  
-     tMPI_Thread_init(th);
+     ret = tMPI_Thread_init(th);
+     if (ret != TMPI_SUCCESS)
+     {
+         return NULL;
+     }
  
      /* start_fn, start_arg, argc and argv were set by the calling function */
      if (!th->start_fn)
          }
      }
  
-     return 0;
+     return NULL;
  }
  
  
void tMPI_Start_threads(tmpi_bool main_returns, int N,
-                         tMPI_Affinity_strategy aff_strategy,
-                         int *argc, char ***argv,
-                         void (*start_fn)(void*), void *start_arg,
-                         int (*start_fn_main)(int, char**))
int tMPI_Start_threads(tmpi_bool main_returns, int N,
+                        tMPI_Affinity_strategy aff_strategy,
+                        int *argc, char ***argv,
+                        void (*start_fn)(void*), void *start_arg,
+                        int (*start_fn_main)(int, char**))
  {
+     int ret;
  #ifdef TMPI_TRACE
      tMPI_Trace_print("tMPI_Start_threads(%d, %d, %d, %d, %d, %p, %p, %p, %p)",
                       main_returns, N, aff_strategy, argc, argv, start_fn,
          /* allocate global data */
          tmpi_global = (struct tmpi_global*)
              tMPI_Malloc(sizeof(struct tmpi_global));
-         tMPI_Global_init(tmpi_global, N);
+         if (tmpi_global == 0)
+         {
+             return TMPI_ERR_NO_MEM;
+         }
+         ret = tMPI_Global_init(tmpi_global, N);
+         if (ret != TMPI_SUCCESS)
+         {
+             return ret;
+         }
  
          /* allocate world and thread data */
-         threads          = (struct tmpi_thread*)tMPI_Malloc(sizeof(struct tmpi_thread)*N);
-         TMPI_COMM_WORLD  = tMPI_Comm_alloc(NULL, N);
+         threads = (struct tmpi_thread*)
+             tMPI_Malloc(sizeof(struct tmpi_thread)*N);
+         if (threads == NULL)
+         {
+             return TMPI_ERR_NO_MEM;
+         }
+         ret = tMPI_Comm_alloc(&TMPI_COMM_WORLD, NULL, N);
+         if (ret != TMPI_SUCCESS)
+         {
+             return ret;
+         }
          TMPI_GROUP_EMPTY = tMPI_Group_alloc();
  
          if (tMPI_Thread_key_create(&id_key, NULL))
          {
-             tMPI_Error(TMPI_COMM_WORLD, TMPI_ERR_INIT);
+             return tMPI_Error(TMPI_COMM_WORLD, TMPI_ERR_INIT);
          }
          for (i = 0; i < N; i++)
          {
  
          for (i = 1; i < N; i++) /* zero is the main thread */
          {
-             int ret;
              ret = tMPI_Thread_create(&(threads[i].thread_id),
                                       tMPI_Thread_starter,
                                       (void*)&(threads[i]) );
              {
                  tMPI_Thread_setaffinity_single(threads[i].thread_id, i);
              }
-             if (ret)
+             if (ret != TMPI_SUCCESS)
              {
-                 tMPI_Error(TMPI_COMM_WORLD, TMPI_ERR_INIT);
+                 return tMPI_Error(TMPI_COMM_WORLD, TMPI_ERR_INIT);
              }
          }
          /* the main thread also runs start_fn if we don't want
          if (!main_returns)
          {
              tMPI_Thread_starter((void*)&(threads[0]));
          }
          else
          {
-             tMPI_Thread_init(&(threads[0]));
+             ret = tMPI_Thread_init(&(threads[0]));
+             if (ret != 0)
+             {
+                 return ret;
+             }
          }
      }
+     return TMPI_SUCCESS;
  }
  
  
  int tMPI_Init(int *argc, char ***argv,
                int (*start_function)(int, char**))
  {
+     int ret;
  #ifdef TMPI_TRACE
      tMPI_Trace_print("tMPI_Init(%p, %p, %p)", argc, argv, start_function);
  #endif
      {
          int N = 0;
          tMPI_Get_N(argc, argv, "-nt", &N);
-         tMPI_Start_threads(TRUE, N, TMPI_AFFINITY_ALL_CORES, argc, argv,
-                            NULL, NULL, start_function);
+         ret = tMPI_Start_threads(TRUE, N, TMPI_AFFINITY_ALL_CORES, argc, argv,
+                                  NULL, NULL, start_function) != 0;
+         if (ret != 0)
+         {
+             return ret;
+         }
      }
      else
      {
@@@ -516,6 -574,7 +574,7 @@@ int tMPI_Init_fn(int main_thread_return
                   tMPI_Affinity_strategy aff_strategy,
                   void (*start_function)(void*), void *arg)
  {
+     int ret;
  #ifdef TMPI_TRACE
      tMPI_Trace_print("tMPI_Init_fn(%d, %p, %p)", N, start_function, arg);
  #endif
  
      if (TMPI_COMM_WORLD == 0 && N >= 1) /* we're the main process */
      {
-         tMPI_Start_threads(main_thread_returns, N, aff_strategy,
-                            0, 0, start_function, arg, NULL);
+         ret = tMPI_Start_threads(main_thread_returns, N, aff_strategy,
+                                  0, 0, start_function, arg, NULL);
+         if (ret != 0)
+         {
+             return ret;
+         }
      }
      return TMPI_SUCCESS;
  }
@@@ -551,6 -614,7 +614,7 @@@ int tMPI_Initialized(int *flag
  int tMPI_Finalize(void)
  {
      int i;
+     int ret;
  #ifdef TMPI_TRACE
      tMPI_Trace_print("tMPI_Finalize()");
  #endif
          struct tmpi_thread *cur = tMPI_Get_current();
  
          tMPI_Profile_stop( &(cur->profile) );
-         tMPI_Thread_barrier_wait( &(tmpi_global->barrier) );
+         ret = tMPI_Thread_barrier_wait( &(tmpi_global->barrier) );
+         if (ret != 0)
+         {
+             return tMPI_Error(TMPI_COMM_WORLD, TMPI_ERR_IO);
+         }
  
          if (tMPI_Is_master())
          {
          }
      }
  #endif
-     tMPI_Thread_barrier_wait( &(tmpi_global->barrier) );
+     ret = tMPI_Thread_barrier_wait( &(tmpi_global->barrier) );
+     if (ret != 0)
+     {
+         return tMPI_Error(TMPI_COMM_WORLD, TMPI_ERR_IO);
+     }
  
      if (tMPI_Is_master())
      {
          {
              if (tMPI_Thread_join(threads[i].thread_id, NULL))
              {
-                 tMPI_Error(TMPI_COMM_WORLD, TMPI_ERR_FINALIZE);
+                 return tMPI_Error(TMPI_COMM_WORLD, TMPI_ERR_FINALIZE);
              }
              tMPI_Thread_destroy(&(threads[i]));
          }
          {
              tMPI_Comm cur;
  
-             tMPI_Thread_mutex_lock(&(tmpi_global->comm_link_lock));
+             ret = tMPI_Thread_mutex_lock(&(tmpi_global->comm_link_lock));
+             if (ret != 0)
+             {
+                 return tMPI_Error(TMPI_COMM_WORLD, TMPI_ERR_IO);
+             }
              cur = TMPI_COMM_WORLD->next;
              while (cur && (cur != TMPI_COMM_WORLD) )
              {
                  tMPI_Comm next = cur->next;
-                 tMPI_Comm_destroy(cur, FALSE);
+                 ret = tMPI_Comm_destroy(cur, FALSE);
+                 if (ret != 0)
+                 {
+                     tMPI_Thread_mutex_unlock(&(tmpi_global->comm_link_lock));
+                     return ret;
+                 }
                  cur = next;
              }
-             tMPI_Comm_destroy(TMPI_COMM_WORLD, FALSE);
-             tMPI_Thread_mutex_unlock(&(tmpi_global->comm_link_lock));
+             ret = tMPI_Comm_destroy(TMPI_COMM_WORLD, FALSE);
+             if (ret != 0)
+             {
+                 tMPI_Thread_mutex_unlock(&(tmpi_global->comm_link_lock));
+                 return ret;
+             }
+             ret = tMPI_Thread_mutex_unlock(&(tmpi_global->comm_link_lock));
+             if (ret != 0)
+             {
+                 return tMPI_Error(TMPI_COMM_WORLD, TMPI_ERR_IO);
+             }
          }
  
          tMPI_Group_free(&TMPI_GROUP_EMPTY);
@@@ -670,7 -763,8 +763,8 @@@ int tMPI_Abort(tMPI_Comm comm, int erro
          }
          else
          {
-             fprintf(stderr, "tMPI_Abort called on main thread with errorcode=%d\n",
+             fprintf(stderr,
+                     "tMPI_Abort called on main thread with errorcode=%d\n",
                      errorcode);
          }
          fflush(stderr);
@@@ -805,12 -899,6 +899,6 @@@ double tMPI_Wtick(void
  #endif
  }
  
  int tMPI_Get_count(tMPI_Status *status, tMPI_Datatype datatype, int *count)
  {
  #ifdef TMPI_TRACE
index cc1a2c2c21a35bba2ab6724d772a8d3f089806c1,df105f0a31b9e6f983df33d6654da9a91f4cf590..df105f0a31b9e6f983df33d6654da9a91f4cf590
@@@ -66,7 -66,7 +66,7 @@@ void *tMPI_Malloc(size_t size
  
      if (!ret)
      {
-         tMPI_Error(TMPI_COMM_WORLD, TMPI_ERR_MALLOC);
+         tMPI_Error(TMPI_COMM_WORLD, TMPI_ERR_NO_MEM);
      }
      return ret;
  }
@@@ -76,7 -76,7 +76,7 @@@ void *tMPI_Realloc(void *p, size_t size
      void *ret = (void*)realloc(p, size);
      if (!ret)
      {
-         tMPI_Error(TMPI_COMM_WORLD, TMPI_ERR_MALLOC);
+         tMPI_Error(TMPI_COMM_WORLD, TMPI_ERR_NO_MEM);
      }
      return ret;
  }
index c944a434444c23ff01f8e2545b5546ba3eb179b6,f78dd73553c3a340f69490c0a00f4a012234d527..f78dd73553c3a340f69490c0a00f4a012234d527
@@@ -357,11 -357,10 +357,10 @@@ int tMPI_Init_NUMA(void
  
      /* allocate array of processor info blocks */
  
-     pMPI_ProcessorInfo = tMPI_Malloc( sizeof(MPI_NUMA_PROCESSOR_INFO) *
-                                       dwTotalProcessors );
+     pMPI_ProcessorInfo = malloc( sizeof(MPI_NUMA_PROCESSOR_INFO) *
+                                  dwTotalProcessors );
      if (pMPI_ProcessorInfo == NULL)
      {
-         tMPI_Fatal_error(TMPI_FARGS, "tMPI_Malloc failed for processor information");
          goto cleanup;
      }
  
  
              if (!func_GetNumaProcessorNodeEx(pProcessorNumber, pNodeNumber))
              {
-                 tMPI_Fatal_error(TMPI_FARGS,
-                                  "Processor enumeration, GetNumaProcessorNodeEx failed, error code=%d",
-                                  GetLastError());
                  goto cleanup;
              }
  
              if (!func_GetNumaNodeProcessorMaskEx(*pNodeNumber, pGroupAffinity))
              {
-                 tMPI_Fatal_error(TMPI_FARGS,
-                                  "Processor enumeration, GetNumaNodeProcessorMaskEx failed, error code=%d",
-                                  GetLastError());
                  goto cleanup;
              }
  
  
              if (i > dwTotalProcessors)
              {
-                 tMPI_Fatal_error(TMPI_FARGS, "Processor enumeration exceeds allocated memory!");
                  goto cleanup;
              }
          }
@@@ -459,23 -451,29 +451,29 @@@ cleanup
      return 0;
  }
  
- static void tMPI_Thread_id_list_init(void)
+ static int tMPI_Thread_id_list_init(void)
  {
+     int ret = 0;
      EnterCriticalSection( &thread_id_list_lock );
  
      N_thread_id_list      = 0;
      Nalloc_thread_id_list = 4; /* number of initial allocation*/
-     thread_id_list        = (thread_id_list_t*)tMPI_Malloc(
-                 sizeof(thread_id_list_t)*
-                 Nalloc_thread_id_list);
+     thread_id_list        = (thread_id_list_t*)malloc(sizeof(thread_id_list_t)*
+                                                       Nalloc_thread_id_list);
+     if (thread_id_list == NULL)
+     {
+         ret = ENOMEM;
+     }
  
      LeaveCriticalSection( &thread_id_list_lock );
+     return ret;
  }
  
  
  /* add an entry to the thread ID list, assuming it's locked */
- static void tMPI_Thread_id_list_add_locked(DWORD               thread_id,
-                                            struct tMPI_Thread *th)
+ static int tMPI_Thread_id_list_add_locked(DWORD               thread_id,
+                                           struct tMPI_Thread *th)
  {
      if (Nalloc_thread_id_list < N_thread_id_list + 1)
      {
  
          /* double the size */
          Nalloc_thread_id_list *= 2;
-         new_list               = (thread_id_list_t*)tMPI_Malloc(
-                     sizeof(thread_id_list_t)*
-                     Nalloc_thread_id_list);
+         /* and allocate the new list */
+         new_list = (thread_id_list_t*)malloc(sizeof(thread_id_list_t)*
+                                              Nalloc_thread_id_list);
+         if (new_list == NULL)
+         {
+             return ENOMEM;
+         }
          /* and copy over all elements */
          for (i = 0; i < N_thread_id_list; i++)
          {
      thread_id_list[ N_thread_id_list ].th        = th;
      N_thread_id_list++;
  
+     return 0;
  }
  
  
  /* add an entry to the thread ID list */
- static void tMPI_Thread_id_list_add(DWORD thread_id, struct tMPI_Thread *th)
+ static int tMPI_Thread_id_list_add(DWORD thread_id, struct tMPI_Thread *th)
  {
+     int ret = 0;
      EnterCriticalSection( &thread_id_list_lock );
-     tMPI_Thread_id_list_add_locked(thread_id, th);
+     ret = tMPI_Thread_id_list_add_locked(thread_id, th);
      LeaveCriticalSection( &thread_id_list_lock );
+     return ret;
  }
  
- /* Remove an entry from the thread_id list, assuming it's locked */
+ /* Remove an entry from the thread_id list, assuming it's locked.
+    Does nothing if an entry is not found.*/
  static void tMPI_Thread_id_list_remove_locked(DWORD thread_id)
  {
      int       i;
@@@ -575,17 -580,17 +580,17 @@@ static struct tMPI_Thread *tMPI_Thread_
  
      EnterCriticalSection( &thread_id_list_lock );
      ret = tMPI_Thread_id_list_find_locked(thread_id);
      LeaveCriticalSection( &thread_id_list_lock );
      return ret;
  }
  
  /* try to add the running thread to the list. Returns the tMPI_Thread struct
-    associated with this thread.*/
+    associated with this thread, or NULL in case of an error.*/
  static struct tMPI_Thread *tMPI_Thread_id_list_add_self(void)
  {
      DWORD               thread_id;
      struct tMPI_Thread *th = NULL;
+     int                 ret;
  
      EnterCriticalSection( &thread_id_list_lock );
  
      if (th == NULL)
      {
          /* if not, create an ID, set it and return it */
-         th = (struct tMPI_Thread*)tMPI_Malloc(sizeof(struct tMPI_Thread)*1);
+         th = (struct tMPI_Thread*)malloc(sizeof(struct tMPI_Thread)*1);
  
          /* to create a handle that can be used outside of the current
             thread, the handle from GetCurrentThread() must first
  
          /* This causes a small memory leak that is hard to fix. */
          th->started_by_tmpi = 0;
-         tMPI_Thread_id_list_add_locked(thread_id, th);
+         ret                 = tMPI_Thread_id_list_add_locked(thread_id, th);
+         if (ret != 0)
+         {
+             free(th);
+             th = NULL;
+         }
      }
      LeaveCriticalSection( &thread_id_list_lock );
      return th;
  }
  
  
- static void tMPI_Init_initers(void)
+ static int tMPI_Init_initers(void)
  {
      int state;
+     int ret = 0;
      /* we can pre-check because it's atomic */
      if (tMPI_Atomic_get(&init_inited) == 0)
      {
              InitializeCriticalSection(&barrier_init);
              InitializeCriticalSection(&thread_id_list_lock);
  
-             /* fatal errors are handled by the routine by calling
-                tMPI_Fatal_error() */
-             tMPI_Init_NUMA();
+             ret = tMPI_Init_NUMA();
+             if (ret != 0)
+             {
+                 goto err;
+             }
  
-             tMPI_Thread_id_list_init();
+             ret = tMPI_Thread_id_list_init();
+             if (ret != 0)
+             {
+                 goto err;
+             }
  
              tMPI_Atomic_memory_barrier_rel();
              tMPI_Atomic_set(&init_inited, 1);
  
          tMPI_Spinlock_unlock( &init_init );
      }
- }
- /* TODO: this needs to go away!  (there's another one in pthreads.c)
-    fatal errors are thankfully really rare*/
- void tMPI_Fatal_error(const char *file, int line, const char *message, ...)
- {
-     va_list ap;
-     fprintf(stderr, "tMPI Fatal error in %s, line %d: ", file, line);
-     va_start(ap, message);
-     vfprintf(stderr, message, ap);
-     va_end(ap);
-     fprintf(stderr, "\n");
-     abort();
+     return ret;
+ err:
+     tMPI_Spinlock_unlock( &init_init );
+     return ret;
  }
  
  
@@@ -709,23 -715,37 +715,37 @@@ int tMPI_Thread_create(tMPI_Thread_t *t
  {
      DWORD thread_id;
      struct tMPI_Thread_starter_param *prm;
+     int   ret;
+     ret = tMPI_Init_initers();
+     if (ret != 0)
+     {
+         return ret;
+     }
  
-     tMPI_Init_initers();
+     if (thread == NULL)
+     {
+         return EINVAL;
+     }
  
      /* a small memory leak to be sure that it doesn't get deallocated
         once this function ends, before the newly created thread uses it. */
      prm = (struct tMPI_Thread_starter_param*)
-         tMPI_Malloc(sizeof(struct tMPI_Thread_starter_param));
+         malloc(sizeof(struct tMPI_Thread_starter_param));
+     if (prm == NULL)
+     {
+         return ENOMEM;
+     }
      prm->start_routine = start_routine;
      prm->param         = arg;
  
-     *thread = (struct tMPI_Thread*)tMPI_Malloc(sizeof(struct tMPI_Thread)*1);
-     if (thread == NULL)
+     *thread = (struct tMPI_Thread*)malloc(sizeof(struct tMPI_Thread)*1);
+     if (*thread == NULL)
      {
-         tMPI_Fatal_error(TMPI_FARGS, "Invalid thread pointer.");
-         return EINVAL;
+         free(prm);
+         return ENOMEM;
      }
      /* this must be locked before the thread is created to prevent a race
         condition if the thread immediately wants to create its own entry */
      EnterCriticalSection( &thread_id_list_lock );
                                                prm,
                                                0,
                                                &thread_id);
+     if ((*thread)->th == NULL)
+     {
+         ret = -1;
+         goto err;
+     }
      (*thread)->id = thread_id;
  
      if ((*thread)->th == NULL)
      {
-         tMPI_Free(thread);
-         tMPI_Fatal_error(TMPI_FARGS, "Failed to create thread, error code=%d",
-                          GetLastError());
-         return -1;
+         ret = -1;
+         goto err;
+     }
+     ret = tMPI_Thread_id_list_add_locked(thread_id, (*thread));
+     if (ret != 0)
+     {
+         goto err;
      }
-     tMPI_Thread_id_list_add_locked(thread_id, (*thread));
      LeaveCriticalSection( &thread_id_list_lock );
  
+ #if 0
      /* inherit the thread priority from the parent thread. */
      /* TODO: is there value in setting this, vs. just allowing it to default
         from the process?  currently, this limits the effectiveness of changing
         the priority in eg: TaskManager. */
      SetThreadPriority(((*thread)->th), GetThreadPriority(GetCurrentThread()));
+ #endif
  
      return 0;
+ err:
+     free(prm);
+     free(thread);
+     LeaveCriticalSection( &thread_id_list_lock );
+     return ret;
  }
  
  
@@@ -769,11 -803,8 +803,8 @@@ int tMPI_Thread_join(tMPI_Thread_t thre
      DWORD ret, retval;
  
      ret = WaitForSingleObject(thread->th, INFINITE);
      if (ret != 0)
      {
-         tMPI_Fatal_error(TMPI_FARGS, "Failed to join thread. error code=%d",
-                          GetLastError());
          return -1;
      }
  
      {
          if (!GetExitCodeThread(thread, &retval))
          {
-             /* TODO: somehow assign value_ptr */
-             tMPI_Fatal_error(TMPI_FARGS,
-                              "Failed to get thread exit code: error=%d",
-                              GetLastError());
              return -1;
          }
      }
      CloseHandle(thread->th);
      tMPI_Thread_id_list_remove(thread->id);
-     tMPI_Free(thread);
+     free(thread);
  
      return 0;
  }
  
  void tMPI_Thread_exit(void *value_ptr)
  {
-     /* TODO: fix exit code */
      /* TODO: call destructors for thread-local storage */
      ExitThread( 0 );
  }
@@@ -810,8 -836,6 +836,6 @@@ int tMPI_Thread_cancel(tMPI_Thread_t th
  {
      if (!TerminateThread( thread, -1) )
      {
-         tMPI_Fatal_error(TMPI_FARGS, "Failed thread_cancel, error code=%d",
-                          GetLastError());
          return -1;
      }
      tMPI_Thread_id_list_remove(thread->id);
  tMPI_Thread_t tMPI_Thread_self(void)
  {
      tMPI_Thread_t th;
-     tMPI_Init_initers();
+     int           ret;
  
-     th = tMPI_Thread_id_list_add_self();
+     ret = tMPI_Init_initers();
+     if (ret != 0)
+     {
+         return NULL;
+     }
  
+     th = tMPI_Thread_id_list_add_self();
      return th;
  }
  
@@@ -914,7 -943,11 +943,11 @@@ int tMPI_Thread_mutex_init(tMPI_Thread_
          return EINVAL;
      }
  
-     mtx->mutex = (struct tMPI_Mutex*)tMPI_Malloc(sizeof(struct tMPI_Mutex)*1);
+     mtx->mutex = (struct tMPI_Mutex*)malloc(sizeof(struct tMPI_Mutex)*1);
+     if (mtx->mutex == NULL)
+     {
+         return ENOMEM;
+     }
      InitializeCriticalSection(&(mtx->mutex->cs));
  
      return 0;
@@@ -929,7 -962,7 +962,7 @@@ int tMPI_Thread_mutex_destroy(tMPI_Thre
      }
  
      DeleteCriticalSection(&(mtx->mutex->cs));
-     tMPI_Free(mtx->mutex);
+     free(mtx->mutex);
  
      return 0;
  }
@@@ -949,7 -982,11 +982,11 @@@ static int tMPI_Thread_mutex_init_once(
       */
  
      /* initialize the initializers */
-     tMPI_Init_initers();
+     ret = tMPI_Init_initers();
+     if (ret != 0)
+     {
+         return ret;
+     }
      /* Lock the common one-time init mutex so we can check carefully */
      EnterCriticalSection( &mutex_init );
  
@@@ -1017,22 -1054,21 +1054,21 @@@ int tMPI_Thread_key_create(tMPI_Thread_
  {
      if (key == NULL)
      {
-         tMPI_Fatal_error(TMPI_FARGS, "Invalid key pointer.");
          return EINVAL;
      }
  
  
      /* TODO: make list of destructors for thread-local storage */
-     key->key = (struct tMPI_Thread_key*)tMPI_Malloc(sizeof(struct
-                                                            tMPI_Thread_key)*1);
+     key->key = (struct tMPI_Thread_key*)malloc(sizeof(struct tMPI_Thread_key));
+     if (key->key == NULL)
+     {
+         return ENOMEM;
+     }
  
      (key)->key->wkey = TlsAlloc();
  
      if ( (key)->key->wkey == TLS_OUT_OF_INDEXES)
      {
-         tMPI_Fatal_error(TMPI_FARGS,
-                          "Failed to create thread key, error code=%d.",
-                          GetLastError());
          return -1;
      }
  
  int tMPI_Thread_key_delete(tMPI_Thread_key_t key)
  {
      TlsFree(key.key->wkey);
-     tMPI_Free(key.key);
+     free(key.key);
  
      return 0;
  }
@@@ -1098,12 -1134,18 +1134,18 @@@ int tMPI_Thread_once(tMPI_Thread_once_
  
      if (!bStatus)
      {
-         tMPI_Fatal_error(TMPI_FARGS, "Failed to run thread_once routine");
          return -1;
      }
  #else
+     int ret;
      /* really ugly hack - and it's slow... */
-     tMPI_Init_initers();
+     ret = tMPI_Init_initers();
+     if (ret != 0)
+     {
+         return ret;
+     }
      EnterCriticalSection(&once_init);
      if (tMPI_Atomic_get(&(once_control->once)) == 0)
      {
@@@ -1127,7 -1169,11 +1169,11 @@@ int tMPI_Thread_cond_init(tMPI_Thread_c
      }
  
      cond->condp = (struct tMPI_Thread_cond*)
-         tMPI_Malloc(sizeof(struct tMPI_Thread_cond)*1);
+         malloc(sizeof(struct tMPI_Thread_cond));
+     if (cond->condp == NULL)
+     {
+         return ENOMEM;
+     }
  #if 0
      /* use this code once Vista is the minimum version required */
      InitializeConditionVariable( &(cond->cv) );
@@@ -1150,7 -1196,7 +1196,7 @@@ int tMPI_Thread_cond_destroy(tMPI_Threa
      /* Windows doesn't have this function */
  #else
      DeleteCriticalSection(&(cond->condp->wtr_lock));
-     tMPI_Free(cond->condp);
+     free(cond->condp);
  #endif
      return 0;
  }
@@@ -1180,7 -1226,11 +1226,11 @@@ static int tMPI_Thread_cond_init_once(t
       */
  
      /* initialize the initializers */
-     tMPI_Init_initers();
+     ret = tMPI_Init_initers();
+     if (ret != 0)
+     {
+         return ret;
+     }
      /* Lock the common one-time init mutex so we can check carefully */
      EnterCriticalSection( &cond_init );
  
@@@ -1204,11 -1254,16 +1254,16 @@@ int tMPI_Thread_cond_wait(tMPI_Thread_c
      BOOL wait_done   = FALSE;
      BOOL last_waiter = FALSE;
      int  my_cycle;
+     int  ret;
  
      /* check whether the condition is initialized */
      if (tMPI_Atomic_get( &(cond->initialized)  ) == 0)
      {
-         tMPI_Thread_cond_init_once(cond);
+         ret = tMPI_Thread_cond_init_once(cond);
+         if (ret != 0)
+         {
+             return ret;
+         }
      }
      /* the mutex must have been initialized because it should be locked here */
  
  
      if (!ret)
      {
-         tMPI_Fatal_error(TMPI_FARGS, "Failed wait for condition, error code=%d",
-                          GetLastError());
          return -1;
      }
  #else
          /* do the actual waiting */
          if (WaitForSingleObject( cond->condp->ev, INFINITE ) == WAIT_FAILED)
          {
-             tMPI_Fatal_error(TMPI_FARGS, "Failed event reset, error code=%d",
-                              GetLastError());
              return -1;
          }
  
      {
          if (!ResetEvent( cond->condp->ev ))
          {
-             tMPI_Fatal_error(TMPI_FARGS, "Failed event reset, error code=%d",
-                              GetLastError());
              return -1;
          }
      }
  
  int tMPI_Thread_cond_signal(tMPI_Thread_cond_t *cond)
  {
+     int ret;
      /* check whether the condition is initialized */
      if (tMPI_Atomic_get( &(cond->initialized)  ) == 0)
      {
-         tMPI_Thread_cond_init_once(cond);
+         ret = tMPI_Thread_cond_init_once(cond);
+         if (ret != 0)
+         {
+             return ret;
+         }
      }
      /* The condition variable is now guaranteed to be valid. */
  #if 0
          if (!SetEvent(cond->condp->ev)) /* actually release the
                                             waiting threads */
          {
-             tMPI_Fatal_error(TMPI_FARGS, "Failed SetEvent, error code=%d",
-                              GetLastError());
              return -1;
          }
      }
  
  int tMPI_Thread_cond_broadcast(tMPI_Thread_cond_t *cond)
  {
+     int ret;
      /* check whether the condition is initialized */
      if (tMPI_Atomic_get( &(cond->initialized)  ) == 0)
      {
-         tMPI_Thread_cond_init_once(cond);
+         ret = tMPI_Thread_cond_init_once(cond);
+         if (ret != 0)
+         {
+             return ret;
+         }
      }
      /* The condition variable is now guaranteed to be valid. */
  #if 0
          if (!SetEvent(cond->condp->ev)) /* actually release the
                                             waiting threads */
          {
-             tMPI_Fatal_error(TMPI_FARGS, "Failed SetEvent, error code=%d",
-                              GetLastError());
              return -1;
          }
      }
  
  int tMPI_Thread_barrier_init(tMPI_Thread_barrier_t *barrier, int n)
  {
+     int ret;
      if (barrier == NULL)
      {
          return EINVAL;
      }
  
      barrier->barrierp = (struct tMPI_Thread_barrier*)
-         tMPI_Malloc(sizeof(struct tMPI_Thread_barrier)*1);
+         malloc(sizeof(struct tMPI_Thread_barrier)*1);
+     if (barrier->barrierp == NULL)
+     {
+         return ENOMEM;
+     }
  
  #if 0
      /* use this once Vista is the oldest supported windows version: */
      InitializeCriticalSection(&(barrier->barrierp->cs));
      InitializeConditionVariable(&(barrier->barrierp->cv));
  #else
-     tMPI_Thread_mutex_init(&(barrier->barrierp->cs));
-     tMPI_Thread_cond_init(&(barrier->barrierp->cv));
+     ret = tMPI_Thread_mutex_init(&(barrier->barrierp->cs));
+     if (ret != 0)
+     {
+         return ret;
+     }
+     ret = tMPI_Thread_cond_init(&(barrier->barrierp->cv));
+     if (ret != 0)
+     {
+         return ret;
+     }
  #endif
  
      barrier->threshold = n;
  
  int tMPI_Thread_barrier_destroy(tMPI_Thread_barrier_t *barrier)
  {
+     int ret;
      if (barrier == NULL)
      {
          return EINVAL;
  #if 0
      DeleteCriticalSection(&(barrier->barrierp->cs));
  #else
-     tMPI_Thread_mutex_destroy(&(barrier->barrierp->cs));
+     ret = tMPI_Thread_mutex_destroy(&(barrier->barrierp->cs));
+     if (ret != 0)
+     {
+         return ret;
+     }
  #endif
  
-     tMPI_Thread_cond_destroy(&(barrier->barrierp->cv));
+     ret = tMPI_Thread_cond_destroy(&(barrier->barrierp->cv));
+     if (ret != 0)
+     {
+         return ret;
+     }
  
-     tMPI_Free(barrier->barrierp);
+     free(barrier->barrierp);
  
      return 0;
  }
@@@ -1424,7 -1504,11 +1504,11 @@@ static int tMPI_Thread_barrier_init_onc
  
  
      /* initialize the initializers */
-     tMPI_Init_initers();
+     ret = tMPI_Init_initers();
+     if (ret != 0)
+     {
+         return ret;
+     }
  
      /* Lock the common one-time init mutex so we can check carefully */
      EnterCriticalSection( &barrier_init );
  
  int tMPI_Thread_barrier_wait(tMPI_Thread_barrier_t *barrier)
  {
-     int     cycle;
-     BOOL    rc  = FALSE;
-     int     ret = 0;
+     int  cycle;
+     BOOL rc  = FALSE;
+     int  ret = 0;
      /*tMPI_Thread_pthread_barrier_t *p;*/
  
      /* check whether the barrier is initialized */
      if (tMPI_Atomic_get( &(barrier->initialized)  ) == 0)
      {
-         tMPI_Thread_barrier_init_once(barrier, barrier->threshold);
+         ret = tMPI_Thread_barrier_init_once(barrier, barrier->threshold);
+         if (ret != 0)
+         {
+             return ret;
+         }
      }
  #if 0
      EnterCriticalSection( &(barrier->barrierp->cs)  );
  #else
-     tMPI_Thread_mutex_lock( &(barrier->barrierp->cs) );
+     ret = tMPI_Thread_mutex_lock( &(barrier->barrierp->cs) );
+     if (ret != 0)
+     {
+         return ret;
+     }
  #endif
  
  
  #if 0
          WakeAllConditionVariable( &(barrier->barrierp->cv) );
  #else
-         tMPI_Thread_cond_broadcast( &(barrier->barrierp->cv) );
+         ret = tMPI_Thread_cond_broadcast( &(barrier->barrierp->cv) );
+         if (ret != 0)
+         {
+             return ret;
+         }
  #endif
      }
      else
index 7d283f7df732a2a19f153528d089b45c1b0df15d,0000000000000000000000000000000000000000..9cf04d86def7e006086eeacc5627dcce29f3a3b7
mode 100644,000000..100644
--- /dev/null
@@@ -1,3516 -1,0 +1,3515 @@@
-     { 54, F_DHDL_CON          },
 +/* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
 + *
 + *
 + *                This source code is part of
 + *
 + *                 G   R   O   M   A   C   S
 + *
 + *          GROningen MAchine for Chemical Simulations
 + *
 + *                        VERSION 3.2.0
 + * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
 + * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
 + * Copyright (c) 2001-2004, The GROMACS development team,
 + * check out http://www.gromacs.org for more information.
 +
 + * This program is free software; you can redistribute it and/or
 + * modify it under the terms of the GNU General Public License
 + * as published by the Free Software Foundation; either version 2
 + * of the License, or (at your option) any later version.
 + *
 + * If you want to redistribute modifications, please consider that
 + * scientific software is very special. Version control is crucial -
 + * bugs must be traceable. We will be happy to consider code for
 + * inclusion in the official distribution, but derived work must not
 + * be called official GROMACS. Details are found in the README & COPYING
 + * files - if they are missing, get the official version at www.gromacs.org.
 + *
 + * To help us fund GROMACS development, we humbly ask that you cite
 + * the papers on the package - you can find them in the top README file.
 + *
 + * For more info, check our website at http://www.gromacs.org
 + *
 + * And Hey:
 + * GROningen Mixture of Alchemy and Childrens' Stories
 + */
 +#ifdef HAVE_CONFIG_H
 +#include <config.h>
 +#endif
 +
 +/* This file is completely threadsafe - keep it that way! */
 +#ifdef GMX_THREAD_MPI
 +#include <thread_mpi.h>
 +#endif
 +
 +
 +#include <ctype.h>
 +#include "sysstuff.h"
 +#include "smalloc.h"
 +#include "string2.h"
 +#include "gmx_fatal.h"
 +#include "macros.h"
 +#include "names.h"
 +#include "symtab.h"
 +#include "futil.h"
 +#include "filenm.h"
 +#include "gmxfio.h"
 +#include "topsort.h"
 +#include "tpxio.h"
 +#include "txtdump.h"
 +#include "confio.h"
 +#include "atomprop.h"
 +#include "copyrite.h"
 +#include "vec.h"
 +#include "mtop_util.h"
 +
 +#define TPX_TAG_RELEASE  "release" /* default value of tpx_tag below */
 +
 +/* This is the tag string which is stored in the tpx file.
 + * Change this if you want to change the tpx format in a feature branch.
 + * This ensures that there will not be different tpx formats around which
 + * cannot be distinguished.
 + */
 +static const char *tpx_tag = TPX_TAG_RELEASE;
 +
 +/* This number should be increased whenever the file format changes! */
 +static const int tpx_version = 92;
 +
 +/* This number should only be increased when you edit the TOPOLOGY section
 + * or the HEADER of the tpx format.
 + * This way we can maintain forward compatibility too for all analysis tools
 + * and/or external programs that only need to know the atom/residue names,
 + * charges, and bond connectivity.
 + *
 + * It first appeared in tpx version 26, when I also moved the inputrecord
 + * to the end of the tpx file, so we can just skip it if we only
 + * want the topology.
 + */
 +static const int tpx_generation = 25;
 +
 +/* This number should be the most recent backwards incompatible version,
 + * i.e., if this number is 9, we cannot read tpx version 9 with this code.
 + */
 +static const int tpx_incompatible_version = 9;
 +
 +
 +
 +/* Struct used to maintain tpx compatibility when function types are added;
 + * one entry of the ftupd table below. */
 +typedef struct {
 +    int fvnr;  /* file version number in which the function type first appeared */
 +    int ftype; /* function type (one of the F_* constants) */
 +} t_ftupd;
 +
 +/*
 + * The entries should be ordered in:
 + * 1. ascending file version number
 + * 2. ascending function type number
 + */
 +/*static const t_ftupd ftupd[] = {
 +   { 20, F_CUBICBONDS        },
 +   { 20, F_CONNBONDS         },
 +   { 20, F_HARMONIC          },
 +   { 20, F_EQM,              },
 +   { 22, F_DISRESVIOL        },
 +   { 22, F_ORIRES            },
 +   { 22, F_ORIRESDEV         },
 +   { 26, F_FOURDIHS          },
 +   { 26, F_PIDIHS            },
 +   { 26, F_DIHRES            },
 +   { 26, F_DIHRESVIOL        },
 +   { 30, F_CROSS_BOND_BONDS  },
 +   { 30, F_CROSS_BOND_ANGLES },
 +   { 30, F_UREY_BRADLEY      },
 +   { 30, F_POLARIZATION      },
 +   { 54, F_DHDL_CON          },
 +   };*/
 +/*
 + * Function types paired with the file version in which each one first
 + * appeared (see t_ftupd).
 + * The entries should be ordered in:
 + * 1. ascending function type number
 + * 2. ascending file version number
 + */
 +/* NOTE(review): the commented-out table above is an older copy of this list, ordered by file version first; it appears to be unused - confirm and remove it. */
 +static const t_ftupd ftupd[] = {
 +    { 20, F_CUBICBONDS        },
 +    { 20, F_CONNBONDS         },
 +    { 20, F_HARMONIC          },
 +    { 34, F_FENEBONDS         },
 +    { 43, F_TABBONDS          },
 +    { 43, F_TABBONDSNC        },
 +    { 70, F_RESTRBONDS        },
 +    { 76, F_LINEAR_ANGLES     },
 +    { 30, F_CROSS_BOND_BONDS  },
 +    { 30, F_CROSS_BOND_ANGLES },
 +    { 30, F_UREY_BRADLEY      },
 +    { 34, F_QUARTIC_ANGLES    },
 +    { 43, F_TABANGLES         },
 +    { 26, F_FOURDIHS          },
 +    { 26, F_PIDIHS            },
 +    { 43, F_TABDIHS           },
 +    { 65, F_CMAP              },
 +    { 60, F_GB12              },
 +    { 61, F_GB13              },
 +    { 61, F_GB14              },
 +    { 72, F_GBPOL             },
 +    { 72, F_NPSOLVATION       },
 +    { 41, F_LJC14_Q           },
 +    { 41, F_LJC_PAIRS_NB      },
 +    { 32, F_BHAM_LR           },
 +    { 32, F_RF_EXCL           },
 +    { 32, F_COUL_RECIP        },
 +    { 46, F_DPD               },
 +    { 30, F_POLARIZATION      },
 +    { 36, F_THOLE_POL         },
 +    { 80, F_FBPOSRES          },
 +    { 22, F_DISRESVIOL        },
 +    { 22, F_ORIRES            },
 +    { 22, F_ORIRESDEV         },
 +    { 26, F_DIHRES            },
 +    { 26, F_DIHRESVIOL        },
 +    { 49, F_VSITE4FDN         },
 +    { 50, F_VSITEN            },
 +    { 46, F_COM_PULL          },
 +    { 20, F_EQM               },
 +    { 46, F_ECONSERVED        },
 +    { 69, F_VTEMP_NOLONGERUSED},
 +    { 66, F_PDISPCORR         },
-     { 54, F_DHDL_CON          }
++    { 54, F_DVDL_CONSTR       },
 +    { 76, F_ANHARM_POL        },
 +    { 79, F_DVDL_COUL         },
 +    { 79, F_DVDL_VDW,         },
 +    { 79, F_DVDL_BONDED,      },
 +    { 79, F_DVDL_RESTRAINT    },
 +    { 79, F_DVDL_TEMPERATURE  },
 +};
 +#define NFTUPD asize(ftupd)
 +
 +/* Needed for backward compatibility */
 +#define MAXNODES 256
 +/*
 + * Writes or locates a section heading; does nothing unless the file type
 + * reported by gmx_fio_getftp() is efTPA.
 + *
 + * fio   - file handle to operate on
 + * key   - index into itemstr[] and comment_str[] selecting the section
 + * bRead - FALSE: write itemstr[key] followed by comment_str[key];
 + *         TRUE: read strings until one compares equal to itemstr[key]
 + *         under gmx_strcasecmp()
 + * src, line - call site, only used in the debug message printed when reading
 + *
 + * Normally invoked through the do_section() macro, which passes __FILE__
 + * and __LINE__ for src and line.
 + */
 +static void _do_section(t_fileio *fio, int key, gmx_bool bRead, const char *src,
 +                        int line)
 +{
 +    char     buf[STRLEN];
 +    gmx_bool bDbg;
 +
 +    if (gmx_fio_getftp(fio) == efTPA)
 +    {
 +        if (!bRead)
 +        {
 +            gmx_fio_write_string(fio, itemstr[key]);
 +            bDbg       = gmx_fio_getdebug(fio);
 +            gmx_fio_setdebug(fio, FALSE); /* debugging is off while the comment string is written */
 +            gmx_fio_write_string(fio, comment_str[key]);
 +            gmx_fio_setdebug(fio, bDbg); /* restore the previous debug setting */
 +        }
 +        else
 +        {
 +            if (gmx_fio_getdebug(fio))
 +            {
 +                fprintf(stderr, "Looking for section %s (%s, %d)",
 +                        itemstr[key], src, line);
 +            }
 +
 +            do
 +            {
 +                gmx_fio_do_string(fio, buf);
 +            }
 +            while ((gmx_strcasecmp(buf, itemstr[key]) != 0)); /* keep reading until the heading matches */
 +
 +            if (gmx_strcasecmp(buf, itemstr[key]) != 0) /* NOTE(review): cannot be true here, the loop above only exits on a match */
 +            {
 +                gmx_fatal(FARGS, "\nCould not find section heading %s", itemstr[key]);
 +            }
 +            else if (gmx_fio_getdebug(fio))
 +            {
 +                fprintf(stderr, " and found it\n");
 +            }
 +        }
 +    }
 +}
 +
 +#define do_section(fio, key, bRead) _do_section(fio, key, bRead, __FILE__, __LINE__)
 +
 +/**************************************************************
 + *
 + * Now the higher level routines that do io of the structures and arrays
 + *
 + **************************************************************
 + *
 + * do_pullgrp: passes one pull group through fio, field by field, in this
 + * order: nat, ind (nat ints), nweight, weight (nweight reals), pbcatom,
 + * vec, init, rate, k and, for file_version >= 56, kB.
 + *
 + * fio          - file handle
 + * pgrp         - pull group to read into or write from
 + * bRead        - when set, the ind and weight arrays are allocated with
 + *                snew() before they are transferred; in this function
 + *                bRead controls nothing else
 + * file_version - tpx version of the file; files older than version 56 do
 + *                not store kB, which is then set equal to k
 + */
 +static void do_pullgrp(t_fileio *fio, t_pullgrp *pgrp, gmx_bool bRead,
 +                       int file_version)
 +{
 +    gmx_bool bDum = TRUE;
 +    int      i;
 +
 +    gmx_fio_do_int(fio, pgrp->nat);
 +    if (bRead)
 +    {
 +        snew(pgrp->ind, pgrp->nat);
 +    }
 +    bDum = gmx_fio_ndo_int(fio, pgrp->ind, pgrp->nat);
 +    gmx_fio_do_int(fio, pgrp->nweight);
 +    if (bRead)
 +    {
 +        snew(pgrp->weight, pgrp->nweight);
 +    }
 +    bDum = gmx_fio_ndo_real(fio, pgrp->weight, pgrp->nweight);
 +    gmx_fio_do_int(fio, pgrp->pbcatom);
 +    gmx_fio_do_rvec(fio, pgrp->vec);
 +    gmx_fio_do_rvec(fio, pgrp->init);
 +    gmx_fio_do_real(fio, pgrp->rate);
 +    gmx_fio_do_real(fio, pgrp->k);
 +    if (file_version >= 56)
 +    {
 +        gmx_fio_do_real(fio, pgrp->kB);
 +    }
 +    else
 +    {
 +        pgrp->kB = pgrp->k; /* older files have no separate kB */
 +    }
 +}
 +/*
 + * Passes the expanded-ensemble settings in expand through fio.
 + *
 + * fio          - file handle
 + * expand       - expanded-ensemble settings to read into or write from
 + * fepvals      - supplies n_lambda; its lambda_start_n / lambda_stop_n
 + *                window is always reset here to the full range 0..n_lambda
 + * bRead        - when set, init_lambda_weights is allocated with snew()
 + *                before it is transferred
 + * file_version - tpx version of the file; nothing is transferred for
 + *                versions below 79
 + *
 + * init_lambda_weights and bInit_weights are only transferred when
 + * n_lambda > 0; the remaining fields are transferred unconditionally
 + * (for file_version >= 79) in the order listed below.
 + */
 +static void do_expandedvals(t_fileio *fio, t_expanded *expand, t_lambda *fepvals, gmx_bool bRead, int file_version)
 +{
 +    /* i is used in the ndo_double macro*/
 +    int      i;
 +    real     fv;
 +    gmx_bool bDum = TRUE;
 +    real     rdum;
 +    int      n_lambda = fepvals->n_lambda;
 +
 +    /* reset the lambda calculation window */
 +    fepvals->lambda_start_n = 0;
 +    fepvals->lambda_stop_n  = n_lambda;
 +    if (file_version >= 79)
 +    {
 +        if (n_lambda > 0)
 +        {
 +            if (bRead)
 +            {
 +                snew(expand->init_lambda_weights, n_lambda);
 +            }
 +            bDum = gmx_fio_ndo_real(fio, expand->init_lambda_weights, n_lambda);
 +            gmx_fio_do_gmx_bool(fio, expand->bInit_weights);
 +        }
 +
 +        gmx_fio_do_int(fio, expand->nstexpanded);
 +        gmx_fio_do_int(fio, expand->elmcmove);
 +        gmx_fio_do_int(fio, expand->elamstats);
 +        gmx_fio_do_int(fio, expand->lmc_repeats);
 +        gmx_fio_do_int(fio, expand->gibbsdeltalam);
 +        gmx_fio_do_int(fio, expand->lmc_forced_nstart);
 +        gmx_fio_do_int(fio, expand->lmc_seed);
 +        gmx_fio_do_real(fio, expand->mc_temp);
 +        gmx_fio_do_int(fio, expand->bSymmetrizedTMatrix);
 +        gmx_fio_do_int(fio, expand->nstTij);
 +        gmx_fio_do_int(fio, expand->minvarmin);
 +        gmx_fio_do_int(fio, expand->c_range);
 +        gmx_fio_do_real(fio, expand->wl_scale);
 +        gmx_fio_do_real(fio, expand->wl_ratio);
 +        gmx_fio_do_real(fio, expand->init_wl_delta);
 +        gmx_fio_do_gmx_bool(fio, expand->bWLoneovert);
 +        gmx_fio_do_int(fio, expand->elmceq);
 +        gmx_fio_do_int(fio, expand->equil_steps);
 +        gmx_fio_do_int(fio, expand->equil_samples);
 +        gmx_fio_do_int(fio, expand->equil_n_at_lam);
 +        gmx_fio_do_real(fio, expand->equil_wl_delta);
 +        gmx_fio_do_real(fio, expand->equil_ratio);
 +    }
 +}
 +/*
 + * Passes the simulated-tempering settings in simtemp through fio.
 + *
 + * fio          - file handle
 + * simtemp      - settings to read into or write from
 + * n_lambda     - number of entries in simtemp->temperatures; the array is
 + *                only transferred when this is positive
 + * bRead        - when set, simtemp->temperatures is allocated with snew()
 + *                before it is transferred
 + * file_version - tpx version of the file; nothing is transferred for
 + *                versions below 79
 + */
 +static void do_simtempvals(t_fileio *fio, t_simtemp *simtemp, int n_lambda, gmx_bool bRead,
 +                           int file_version)
 +{
 +    gmx_bool bDum = TRUE;
 +
 +    if (file_version >= 79)
 +    {
 +        gmx_fio_do_int(fio, simtemp->eSimTempScale);
 +        gmx_fio_do_real(fio, simtemp->simtemp_high);
 +        gmx_fio_do_real(fio, simtemp->simtemp_low);
 +        if (n_lambda > 0)
 +        {
 +            if (bRead)
 +            {
 +                snew(simtemp->temperatures, n_lambda);
 +            }
 +            bDum = gmx_fio_ndo_real(fio, simtemp->temperatures, n_lambda);
 +        }
 +    }
 +}
 +/*
 + * Passes the free-energy settings in fepvals through fio and fills in
 + * defaults for every field that the given file_version does not store.
 + *
 + * fio          - file handle
 + * fepvals      - free-energy settings to read into or write from
 + * bRead        - when set, all_lambda and its per-component arrays are
 + *                allocated with snew() before use, and sc_sigma_min is
 + *                derived (sc_sigma for file_version >= 71, otherwise 0)
 + * file_version - tpx version of the file; the thresholds tested here are
 + *                13, 15, 38, 59, 64, 71, 73, 79, 83..89 and 92
 + *
 + * The last step sets the lambda window lambda_start_n / lambda_stop_n.
 + * When lambda_neighbors is stored and is >= 0, init_fep_state is >= 0 and
 + * init_lambda is < 0, the window is init_fep_state +/- lambda_neighbors,
 + * clamped to 0..n_lambda; in every other case it is the full range
 + * 0..n_lambda.
 + */
 +static void do_fepvals(t_fileio *fio, t_lambda *fepvals, gmx_bool bRead, int file_version)
 +{
 +    /* i is defined in the ndo_double macro; use g to iterate. */
 +    int      i, g;
 +    real     fv;
 +    gmx_bool bDum = TRUE;
 +    real     rdum;
 +
 +    /* free energy values */
 +
 +    if (file_version >= 79)
 +    {
 +        gmx_fio_do_int(fio, fepvals->init_fep_state);
 +        gmx_fio_do_double(fio, fepvals->init_lambda);
 +        gmx_fio_do_double(fio, fepvals->delta_lambda);
 +    }
 +    else if (file_version >= 59)
 +    {
 +        gmx_fio_do_double(fio, fepvals->init_lambda);
 +        gmx_fio_do_double(fio, fepvals->delta_lambda);
 +    }
 +    else
 +    {
 +        gmx_fio_do_real(fio, rdum); /* before version 59 both values go through a real temporary */
 +        fepvals->init_lambda = rdum;
 +        gmx_fio_do_real(fio, rdum);
 +        fepvals->delta_lambda = rdum;
 +    }
 +    if (file_version >= 79)
 +    {
 +        gmx_fio_do_int(fio, fepvals->n_lambda);
 +        if (bRead)
 +        {
 +            snew(fepvals->all_lambda, efptNR);
 +        }
 +        for (g = 0; g < efptNR; g++)
 +        {
 +            if (fepvals->n_lambda > 0)
 +            {
 +                if (bRead)
 +                {
 +                    snew(fepvals->all_lambda[g], fepvals->n_lambda);
 +                }
 +                bDum = gmx_fio_ndo_double(fio, fepvals->all_lambda[g], fepvals->n_lambda);
 +                bDum = gmx_fio_ndo_int(fio, fepvals->separate_dvdl, efptNR); /* NOTE(review): the whole separate_dvdl array is transferred once per component g; this is part of the stored layout, do not move it */
 +            }
 +            else if (fepvals->init_lambda >= 0)
 +            {
 +                fepvals->separate_dvdl[efptFEP] = TRUE;
 +            }
 +        }
 +    }
 +    else if (file_version >= 64)
 +    {
 +        gmx_fio_do_int(fio, fepvals->n_lambda);
 +        if (bRead)
 +        {
 +            int g;
 +
 +            snew(fepvals->all_lambda, efptNR);
 +            /* still allocate the all_lambda array's contents. */
 +            for (g = 0; g < efptNR; g++)
 +            {
 +                if (fepvals->n_lambda > 0)
 +                {
 +                    snew(fepvals->all_lambda[g], fepvals->n_lambda);
 +                }
 +            }
 +        }
 +        bDum = gmx_fio_ndo_double(fio, fepvals->all_lambda[efptFEP],
 +                                  fepvals->n_lambda); /* versions 64..78 store only the efptFEP component */
 +        if (fepvals->init_lambda >= 0)
 +        {
 +            int g, h;
 +
 +            fepvals->separate_dvdl[efptFEP] = TRUE;
 +
 +            if (bRead)
 +            {
 +                /* copy the contents of the efptFEP lambda component to all
 +                   the other components */
 +                for (g = 0; g < efptNR; g++)
 +                {
 +                    for (h = 0; h < fepvals->n_lambda; h++)
 +                    {
 +                        if (g != efptFEP)
 +                        {
 +                            fepvals->all_lambda[g][h] =
 +                                fepvals->all_lambda[efptFEP][h];
 +                        }
 +                    }
 +                }
 +            }
 +        }
 +    }
 +    else
 +    {
 +        fepvals->n_lambda     = 0;
 +        fepvals->all_lambda   = NULL;
 +        if (fepvals->init_lambda >= 0)
 +        {
 +            fepvals->separate_dvdl[efptFEP] = TRUE;
 +        }
 +    }
 +    if (file_version >= 13)
 +    {
 +        gmx_fio_do_real(fio, fepvals->sc_alpha);
 +    }
 +    else
 +    {
 +        fepvals->sc_alpha = 0;
 +    }
 +    if (file_version >= 38)
 +    {
 +        gmx_fio_do_int(fio, fepvals->sc_power);
 +    }
 +    else
 +    {
 +        fepvals->sc_power = 2;
 +    }
 +    if (file_version >= 79)
 +    {
 +        gmx_fio_do_real(fio, fepvals->sc_r_power);
 +    }
 +    else
 +    {
 +        fepvals->sc_r_power = 6.0;
 +    }
 +    if (file_version >= 15)
 +    {
 +        gmx_fio_do_real(fio, fepvals->sc_sigma);
 +    }
 +    else
 +    {
 +        fepvals->sc_sigma = 0.3;
 +    }
 +    if (bRead) /* sc_sigma_min is never stored; it is derived when reading */
 +    {
 +        if (file_version >= 71)
 +        {
 +            fepvals->sc_sigma_min = fepvals->sc_sigma;
 +        }
 +        else
 +        {
 +            fepvals->sc_sigma_min = 0;
 +        }
 +    }
 +    if (file_version >= 79)
 +    {
 +        gmx_fio_do_int(fio, fepvals->bScCoul);
 +    }
 +    else
 +    {
 +        fepvals->bScCoul = TRUE;
 +    }
 +    if (file_version >= 64)
 +    {
 +        gmx_fio_do_int(fio, fepvals->nstdhdl);
 +    }
 +    else
 +    {
 +        fepvals->nstdhdl = 1;
 +    }
 +
 +    if (file_version >= 73)
 +    {
 +        gmx_fio_do_int(fio, fepvals->separate_dhdl_file);
 +        gmx_fio_do_int(fio, fepvals->dhdl_derivatives);
 +    }
 +    else
 +    {
 +        fepvals->separate_dhdl_file = esepdhdlfileYES;
 +        fepvals->dhdl_derivatives   = edhdlderivativesYES;
 +    }
 +    if (file_version >= 71)
 +    {
 +        gmx_fio_do_int(fio, fepvals->dh_hist_size);
 +        gmx_fio_do_double(fio, fepvals->dh_hist_spacing);
 +    }
 +    else
 +    {
 +        fepvals->dh_hist_size    = 0;
 +        fepvals->dh_hist_spacing = 0.1;
 +    }
 +    if (file_version >= 79)
 +    {
 +        gmx_fio_do_int(fio, fepvals->bPrintEnergy);
 +    }
 +    else
 +    {
 +        fepvals->bPrintEnergy = FALSE;
 +    }
 +
 +    /* handle lambda_neighbors */
 +    if ((file_version >= 83 && file_version < 90) || file_version >= 92) /* versions 90 and 91 do not store lambda_neighbors */
 +    {
 +        gmx_fio_do_int(fio, fepvals->lambda_neighbors);
 +        if ( (fepvals->lambda_neighbors >= 0) && (fepvals->init_fep_state >= 0) &&
 +             (fepvals->init_lambda < 0) )
 +        {
 +            fepvals->lambda_start_n = (fepvals->init_fep_state -
 +                                       fepvals->lambda_neighbors);
 +            fepvals->lambda_stop_n = (fepvals->init_fep_state +
 +                                      fepvals->lambda_neighbors + 1);
 +            if (fepvals->lambda_start_n < 0) /* clamp the window to the valid range */
 +            {
 +                fepvals->lambda_start_n = 0;; /* NOTE(review): stray second semicolon, harmless */
 +            }
 +            if (fepvals->lambda_stop_n >= fepvals->n_lambda)
 +            {
 +                fepvals->lambda_stop_n = fepvals->n_lambda;
 +            }
 +        }
 +        else
 +        {
 +            fepvals->lambda_start_n = 0;
 +            fepvals->lambda_stop_n  = fepvals->n_lambda;
 +        }
 +    }
 +    else
 +    {
 +        fepvals->lambda_start_n = 0;
 +        fepvals->lambda_stop_n  = fepvals->n_lambda;
 +    }
 +}
 +/*
 + * Passes the pull settings in pull through fio: ngrp, eGeom, dim, cyl_r1,
 + * cyl_r0, constr_tol, nstxout, nstfout, followed by ngrp+1 pull groups
 + * handled by do_pullgrp().
 + *
 + * fio          - file handle
 + * pull         - pull settings to read into or write from
 + * bRead        - when set, pull->grp is allocated with ngrp+1 entries
 + * file_version - tpx version of the file, forwarded to do_pullgrp()
 + */
 +static void do_pull(t_fileio *fio, t_pull *pull, gmx_bool bRead, int file_version)
 +{
 +    int g;
 +
 +    gmx_fio_do_int(fio, pull->ngrp);
 +    gmx_fio_do_int(fio, pull->eGeom);
 +    gmx_fio_do_ivec(fio, pull->dim);
 +    gmx_fio_do_real(fio, pull->cyl_r1);
 +    gmx_fio_do_real(fio, pull->cyl_r0);
 +    gmx_fio_do_real(fio, pull->constr_tol);
 +    gmx_fio_do_int(fio, pull->nstxout);
 +    gmx_fio_do_int(fio, pull->nstfout);
 +    if (bRead)
 +    {
 +        snew(pull->grp, pull->ngrp+1); /* one entry more than ngrp; presumably entry 0 is the reference group - TODO confirm */
 +    }
 +    for (g = 0; g < pull->ngrp+1; g++)
 +    {
 +        do_pullgrp(fio, &pull->grp[g], bRead, file_version);
 +    }
 +}
 +
 +/*
 + * Passes one rotation group through fio, field by field, in this order:
 + * eType, bMassW, nat, ind (nat ints), x_ref (nat rvecs), vec, pivot, rate,
 + * k, slab_dist, min_gaussian, eps, eFittype, PotAngle_nstep, PotAngle_step.
 + *
 + * fio          - file handle
 + * rotg         - rotation group to read into or write from
 + * bRead        - when set, the ind and x_ref arrays are allocated with
 + *                snew() before they are transferred
 + * file_version - tpx version of the file; not tested in this function
 + */
 +static void do_rotgrp(t_fileio *fio, t_rotgrp *rotg, gmx_bool bRead, int file_version)
 +{
 +    gmx_bool bDum = TRUE;
 +    int      i;
 +
 +    gmx_fio_do_int(fio, rotg->eType);
 +    gmx_fio_do_int(fio, rotg->bMassW);
 +    gmx_fio_do_int(fio, rotg->nat);
 +    if (bRead)
 +    {
 +        snew(rotg->ind, rotg->nat);
 +    }
 +    gmx_fio_ndo_int(fio, rotg->ind, rotg->nat);
 +    if (bRead)
 +    {
 +        snew(rotg->x_ref, rotg->nat);
 +    }
 +    gmx_fio_ndo_rvec(fio, rotg->x_ref, rotg->nat);
 +    gmx_fio_do_rvec(fio, rotg->vec);
 +    gmx_fio_do_rvec(fio, rotg->pivot);
 +    gmx_fio_do_real(fio, rotg->rate);
 +    gmx_fio_do_real(fio, rotg->k);
 +    gmx_fio_do_real(fio, rotg->slab_dist);
 +    gmx_fio_do_real(fio, rotg->min_gaussian);
 +    gmx_fio_do_real(fio, rotg->eps);
 +    gmx_fio_do_int(fio, rotg->eFittype);
 +    gmx_fio_do_int(fio, rotg->PotAngle_nstep);
 +    gmx_fio_do_real(fio, rotg->PotAngle_step);
 +}
 +/*
 + * Passes the rotation settings in rot through fio: ngrp, nstrout, nstsout,
 + * followed by ngrp rotation groups handled by do_rotgrp().
 + *
 + * fio          - file handle
 + * rot          - rotation settings to read into or write from
 + * bRead        - when set, rot->grp is allocated with ngrp entries
 + * file_version - tpx version of the file, forwarded to do_rotgrp()
 + */
 +static void do_rot(t_fileio *fio, t_rot *rot, gmx_bool bRead, int file_version)
 +{
 +    int g;
 +
 +    gmx_fio_do_int(fio, rot->ngrp);
 +    gmx_fio_do_int(fio, rot->nstrout);
 +    gmx_fio_do_int(fio, rot->nstsout);
 +    if (bRead)
 +    {
 +        snew(rot->grp, rot->ngrp);
 +    }
 +    for (g = 0; g < rot->ngrp; g++)
 +    {
 +        do_rotgrp(fio, &rot->grp[g], bRead, file_version);
 +    }
 +}
 +
 +
 +static void do_inputrec(t_fileio *fio, t_inputrec *ir, gmx_bool bRead,
 +                        int file_version, real *fudgeQQ)
 +{
 +    int      i, j, k, *tmp, idum = 0;
 +    gmx_bool bDum = TRUE;
 +    real     rdum, bd_temp;
 +    rvec     vdum;
 +    gmx_bool bSimAnn;
 +    real     zerotemptime, finish_t, init_temp, finish_temp;
 +
 +    if (file_version != tpx_version)
 +    {
 +        /* Give a warning about features that are not accessible */
 +        fprintf(stderr, "Note: file tpx version %d, software tpx version %d\n",
 +                file_version, tpx_version);
 +    }
 +
 +    if (bRead)
 +    {
 +        init_inputrec(ir);
 +    }
 +
 +    if (file_version == 0)
 +    {
 +        return;
 +    }
 +
 +    /* Basic inputrec stuff */
 +    gmx_fio_do_int(fio, ir->eI);
 +    if (file_version >= 62)
 +    {
 +        gmx_fio_do_gmx_large_int(fio, ir->nsteps);
 +    }
 +    else
 +    {
 +        gmx_fio_do_int(fio, idum);
 +        ir->nsteps = idum;
 +    }
 +    if (file_version > 25)
 +    {
 +        if (file_version >= 62)
 +        {
 +            gmx_fio_do_gmx_large_int(fio, ir->init_step);
 +        }
 +        else
 +        {
 +            gmx_fio_do_int(fio, idum);
 +            ir->init_step = idum;
 +        }
 +    }
 +    else
 +    {
 +        ir->init_step = 0;
 +    }
 +
 +    if (file_version >= 58)
 +    {
 +        gmx_fio_do_int(fio, ir->simulation_part);
 +    }
 +    else
 +    {
 +        ir->simulation_part = 1;
 +    }
 +
 +    if (file_version >= 67)
 +    {
 +        gmx_fio_do_int(fio, ir->nstcalcenergy);
 +    }
 +    else
 +    {
 +        ir->nstcalcenergy = 1;
 +    }
 +    if (file_version < 53)
 +    {
 +        /* The pbc info has been moved out of do_inputrec,
 +         * since we always want it, also without reading the inputrec.
 +         */
 +        gmx_fio_do_int(fio, ir->ePBC);
 +        if ((file_version <= 15) && (ir->ePBC == 2))
 +        {
 +            ir->ePBC = epbcNONE;
 +        }
 +        if (file_version >= 45)
 +        {
 +            gmx_fio_do_int(fio, ir->bPeriodicMols);
 +        }
 +        else
 +        {
 +            if (ir->ePBC == 2)
 +            {
 +                ir->ePBC          = epbcXYZ;
 +                ir->bPeriodicMols = TRUE;
 +            }
 +            else
 +            {
 +                ir->bPeriodicMols = FALSE;
 +            }
 +        }
 +    }
 +    if (file_version >= 81)
 +    {
 +        gmx_fio_do_int(fio, ir->cutoff_scheme);
 +    }
 +    else
 +    {
 +        ir->cutoff_scheme = ecutsGROUP;
 +    }
 +    gmx_fio_do_int(fio, ir->ns_type);
 +    gmx_fio_do_int(fio, ir->nstlist);
 +    gmx_fio_do_int(fio, ir->ndelta);
 +    if (file_version < 41)
 +    {
 +        gmx_fio_do_int(fio, idum);
 +        gmx_fio_do_int(fio, idum);
 +    }
 +    if (file_version >= 45)
 +    {
 +        gmx_fio_do_real(fio, ir->rtpi);
 +    }
 +    else
 +    {
 +        ir->rtpi = 0.05;
 +    }
 +    gmx_fio_do_int(fio, ir->nstcomm);
 +    if (file_version > 34)
 +    {
 +        gmx_fio_do_int(fio, ir->comm_mode);
 +    }
 +    else if (ir->nstcomm < 0)
 +    {
 +        ir->comm_mode = ecmANGULAR;
 +    }
 +    else
 +    {
 +        ir->comm_mode = ecmLINEAR;
 +    }
 +    ir->nstcomm = abs(ir->nstcomm);
 +
 +    if (file_version > 25)
 +    {
 +        gmx_fio_do_int(fio, ir->nstcheckpoint);
 +    }
 +    else
 +    {
 +        ir->nstcheckpoint = 0;
 +    }
 +
 +    gmx_fio_do_int(fio, ir->nstcgsteep);
 +
 +    if (file_version >= 30)
 +    {
 +        gmx_fio_do_int(fio, ir->nbfgscorr);
 +    }
 +    else if (bRead)
 +    {
 +        ir->nbfgscorr = 10;
 +    }
 +
 +    gmx_fio_do_int(fio, ir->nstlog);
 +    gmx_fio_do_int(fio, ir->nstxout);
 +    gmx_fio_do_int(fio, ir->nstvout);
 +    gmx_fio_do_int(fio, ir->nstfout);
 +    gmx_fio_do_int(fio, ir->nstenergy);
 +    gmx_fio_do_int(fio, ir->nstxtcout);
 +    if (file_version >= 59)
 +    {
 +        gmx_fio_do_double(fio, ir->init_t);
 +        gmx_fio_do_double(fio, ir->delta_t);
 +    }
 +    else
 +    {
 +        gmx_fio_do_real(fio, rdum);
 +        ir->init_t = rdum;
 +        gmx_fio_do_real(fio, rdum);
 +        ir->delta_t = rdum;
 +    }
 +    gmx_fio_do_real(fio, ir->xtcprec);
 +    if (file_version < 19)
 +    {
 +        gmx_fio_do_int(fio, idum);
 +        gmx_fio_do_int(fio, idum);
 +    }
 +    if (file_version < 18)
 +    {
 +        gmx_fio_do_int(fio, idum);
 +    }
 +    if (file_version >= 81)
 +    {
 +        gmx_fio_do_real(fio, ir->verletbuf_drift);
 +    }
 +    else
 +    {
 +        ir->verletbuf_drift = 0;
 +    }
 +    gmx_fio_do_real(fio, ir->rlist);
 +    if (file_version >= 67)
 +    {
 +        gmx_fio_do_real(fio, ir->rlistlong);
 +    }
 +    if (file_version >= 82 && file_version != 90)
 +    {
 +        gmx_fio_do_int(fio, ir->nstcalclr);
 +    }
 +    else
 +    {
 +        /* Calculate at NS steps */
 +        ir->nstcalclr = ir->nstlist;
 +    }
 +    gmx_fio_do_int(fio, ir->coulombtype);
 +    if (file_version < 32 && ir->coulombtype == eelRF)
 +    {
 +        ir->coulombtype = eelRF_NEC;
 +    }
 +    if (file_version >= 81)
 +    {
 +        gmx_fio_do_int(fio, ir->coulomb_modifier);
 +    }
 +    else
 +    {
 +        ir->coulomb_modifier = (ir->cutoff_scheme == ecutsVERLET ? eintmodPOTSHIFT : eintmodNONE);
 +    }
 +    gmx_fio_do_real(fio, ir->rcoulomb_switch);
 +    gmx_fio_do_real(fio, ir->rcoulomb);
 +    gmx_fio_do_int(fio, ir->vdwtype);
 +    if (file_version >= 81)
 +    {
 +        gmx_fio_do_int(fio, ir->vdw_modifier);
 +    }
 +    else
 +    {
 +        ir->vdw_modifier = (ir->cutoff_scheme == ecutsVERLET ? eintmodPOTSHIFT : eintmodNONE);
 +    }
 +    gmx_fio_do_real(fio, ir->rvdw_switch);
 +    gmx_fio_do_real(fio, ir->rvdw);
 +    if (file_version < 67)
 +    {
 +        ir->rlistlong = max_cutoff(ir->rlist, max_cutoff(ir->rvdw, ir->rcoulomb));
 +    }
 +    gmx_fio_do_int(fio, ir->eDispCorr);
 +    gmx_fio_do_real(fio, ir->epsilon_r);
 +    if (file_version >= 37)
 +    {
 +        gmx_fio_do_real(fio, ir->epsilon_rf);
 +    }
 +    else
 +    {
 +        if (EEL_RF(ir->coulombtype))
 +        {
 +            ir->epsilon_rf = ir->epsilon_r;
 +            ir->epsilon_r  = 1.0;
 +        }
 +        else
 +        {
 +            ir->epsilon_rf = 1.0;
 +        }
 +    }
 +    if (file_version >= 29)
 +    {
 +        gmx_fio_do_real(fio, ir->tabext);
 +    }
 +    else
 +    {
 +        ir->tabext = 1.0;
 +    }
 +
 +    if (file_version > 25)
 +    {
 +        gmx_fio_do_int(fio, ir->gb_algorithm);
 +        gmx_fio_do_int(fio, ir->nstgbradii);
 +        gmx_fio_do_real(fio, ir->rgbradii);
 +        gmx_fio_do_real(fio, ir->gb_saltconc);
 +        gmx_fio_do_int(fio, ir->implicit_solvent);
 +    }
 +    else
 +    {
 +        ir->gb_algorithm     = egbSTILL;
 +        ir->nstgbradii       = 1;
 +        ir->rgbradii         = 1.0;
 +        ir->gb_saltconc      = 0;
 +        ir->implicit_solvent = eisNO;
 +    }
 +    if (file_version >= 55)
 +    {
 +        gmx_fio_do_real(fio, ir->gb_epsilon_solvent);
 +        gmx_fio_do_real(fio, ir->gb_obc_alpha);
 +        gmx_fio_do_real(fio, ir->gb_obc_beta);
 +        gmx_fio_do_real(fio, ir->gb_obc_gamma);
 +        if (file_version >= 60)
 +        {
 +            gmx_fio_do_real(fio, ir->gb_dielectric_offset);
 +            gmx_fio_do_int(fio, ir->sa_algorithm);
 +        }
 +        else
 +        {
 +            ir->gb_dielectric_offset = 0.009;
 +            ir->sa_algorithm         = esaAPPROX;
 +        }
 +        gmx_fio_do_real(fio, ir->sa_surface_tension);
 +
 +        /* Override sa_surface_tension if it is not changed in the mdp-file */
 +        if (ir->sa_surface_tension < 0)
 +        {
 +            if (ir->gb_algorithm == egbSTILL)
 +            {
 +                ir->sa_surface_tension = 0.0049 * 100 * CAL2JOULE;
 +            }
 +            else if (ir->gb_algorithm == egbHCT || ir->gb_algorithm == egbOBC)
 +            {
 +                ir->sa_surface_tension = 0.0054 * 100 * CAL2JOULE;
 +            }
 +        }
 +
 +    }
 +    else
 +    {
 +        /* Better use sensible values than insane (0.0) ones... */
 +        ir->gb_epsilon_solvent = 80;
 +        ir->gb_obc_alpha       = 1.0;
 +        ir->gb_obc_beta        = 0.8;
 +        ir->gb_obc_gamma       = 4.85;
 +        ir->sa_surface_tension = 2.092;
 +    }
 +
 +
 +    if (file_version >= 81)
 +    {
 +        gmx_fio_do_real(fio, ir->fourier_spacing);
 +    }
 +    else
 +    {
 +        ir->fourier_spacing = 0.0;
 +    }
 +    gmx_fio_do_int(fio, ir->nkx);
 +    gmx_fio_do_int(fio, ir->nky);
 +    gmx_fio_do_int(fio, ir->nkz);
 +    gmx_fio_do_int(fio, ir->pme_order);
 +    gmx_fio_do_real(fio, ir->ewald_rtol);
 +
 +    if (file_version >= 24)
 +    {
 +        gmx_fio_do_int(fio, ir->ewald_geometry);
 +    }
 +    else
 +    {
 +        ir->ewald_geometry = eewg3D;
 +    }
 +
 +    if (file_version <= 17)
 +    {
 +        ir->epsilon_surface = 0;
 +        if (file_version == 17)
 +        {
 +            gmx_fio_do_int(fio, idum);
 +        }
 +    }
 +    else
 +    {
 +        gmx_fio_do_real(fio, ir->epsilon_surface);
 +    }
 +
 +    gmx_fio_do_gmx_bool(fio, ir->bOptFFT);
 +
 +    gmx_fio_do_gmx_bool(fio, ir->bContinuation);
 +    gmx_fio_do_int(fio, ir->etc);
 +    /* before version 18, ir->etc was a gmx_bool (ir->btc),
 +     * but the values 0 and 1 still mean no and
 +     * berendsen temperature coupling, respectively.
 +     */
 +    if (file_version >= 79)
 +    {
 +        gmx_fio_do_gmx_bool(fio, ir->bPrintNHChains);
 +    }
 +    if (file_version >= 71)
 +    {
 +        gmx_fio_do_int(fio, ir->nsttcouple);
 +    }
 +    else
 +    {
 +        ir->nsttcouple = ir->nstcalcenergy;
 +    }
 +    if (file_version <= 15)
 +    {
 +        gmx_fio_do_int(fio, idum);
 +    }
 +    if (file_version <= 17)
 +    {
 +        gmx_fio_do_int(fio, ir->epct);
 +        if (file_version <= 15)
 +        {
 +            if (ir->epct == 5)
 +            {
 +                ir->epct = epctSURFACETENSION;
 +            }
 +            gmx_fio_do_int(fio, idum);
 +        }
 +        ir->epct -= 1;
 +        /* we have removed the NO alternative at the beginning */
 +        if (ir->epct == -1)
 +        {
 +            ir->epc  = epcNO;
 +            ir->epct = epctISOTROPIC;
 +        }
 +        else
 +        {
 +            ir->epc = epcBERENDSEN;
 +        }
 +    }
 +    else
 +    {
 +        gmx_fio_do_int(fio, ir->epc);
 +        gmx_fio_do_int(fio, ir->epct);
 +    }
 +    if (file_version >= 71)
 +    {
 +        gmx_fio_do_int(fio, ir->nstpcouple);
 +    }
 +    else
 +    {
 +        ir->nstpcouple = ir->nstcalcenergy;
 +    }
 +    gmx_fio_do_real(fio, ir->tau_p);
 +    if (file_version <= 15)
 +    {
 +        gmx_fio_do_rvec(fio, vdum);
 +        clear_mat(ir->ref_p);
 +        for (i = 0; i < DIM; i++)
 +        {
 +            ir->ref_p[i][i] = vdum[i];
 +        }
 +    }
 +    else
 +    {
 +        gmx_fio_do_rvec(fio, ir->ref_p[XX]);
 +        gmx_fio_do_rvec(fio, ir->ref_p[YY]);
 +        gmx_fio_do_rvec(fio, ir->ref_p[ZZ]);
 +    }
 +    if (file_version <= 15)
 +    {
 +        gmx_fio_do_rvec(fio, vdum);
 +        clear_mat(ir->compress);
 +        for (i = 0; i < DIM; i++)
 +        {
 +            ir->compress[i][i] = vdum[i];
 +        }
 +    }
 +    else
 +    {
 +        gmx_fio_do_rvec(fio, ir->compress[XX]);
 +        gmx_fio_do_rvec(fio, ir->compress[YY]);
 +        gmx_fio_do_rvec(fio, ir->compress[ZZ]);
 +    }
 +    if (file_version >= 47)
 +    {
 +        gmx_fio_do_int(fio, ir->refcoord_scaling);
 +        gmx_fio_do_rvec(fio, ir->posres_com);
 +        gmx_fio_do_rvec(fio, ir->posres_comB);
 +    }
 +    else
 +    {
 +        ir->refcoord_scaling = erscNO;
 +        clear_rvec(ir->posres_com);
 +        clear_rvec(ir->posres_comB);
 +    }
 +    if ((file_version > 25) && (file_version < 79))
 +    {
 +        gmx_fio_do_int(fio, ir->andersen_seed);
 +    }
 +    else
 +    {
 +        ir->andersen_seed = 0;
 +    }
 +    if (file_version < 26)
 +    {
 +        gmx_fio_do_gmx_bool(fio, bSimAnn);
 +        gmx_fio_do_real(fio, zerotemptime);
 +    }
 +
 +    if (file_version < 37)
 +    {
 +        gmx_fio_do_real(fio, rdum);
 +    }
 +
 +    gmx_fio_do_real(fio, ir->shake_tol);
 +    if (file_version < 54)
 +    {
 +        gmx_fio_do_real(fio, *fudgeQQ);
 +    }
 +
 +    gmx_fio_do_int(fio, ir->efep);
 +    if (file_version <= 14 && ir->efep != efepNO)
 +    {
 +        ir->efep = efepYES;
 +    }
 +    do_fepvals(fio, ir->fepvals, bRead, file_version);
 +
 +    if (file_version >= 79)
 +    {
 +        gmx_fio_do_gmx_bool(fio, ir->bSimTemp);
 +        if (ir->bSimTemp)
 +        {
 +            ir->bSimTemp = TRUE;
 +        }
 +    }
 +    else
 +    {
 +        ir->bSimTemp = FALSE;
 +    }
 +    if (ir->bSimTemp)
 +    {
 +        do_simtempvals(fio, ir->simtempvals, ir->fepvals->n_lambda, bRead, file_version);
 +    }
 +
 +    if (file_version >= 79)
 +    {
 +        gmx_fio_do_gmx_bool(fio, ir->bExpanded);
 +        if (ir->bExpanded)
 +        {
 +            ir->bExpanded = TRUE;
 +        }
 +        else
 +        {
 +            ir->bExpanded = FALSE;
 +        }
 +    }
 +    if (ir->bExpanded)
 +    {
 +        do_expandedvals(fio, ir->expandedvals, ir->fepvals, bRead, file_version);
 +    }
 +    if (file_version >= 57)
 +    {
 +        gmx_fio_do_int(fio, ir->eDisre);
 +    }
 +    gmx_fio_do_int(fio, ir->eDisreWeighting);
 +    if (file_version < 22)
 +    {
 +        if (ir->eDisreWeighting == 0)
 +        {
 +            ir->eDisreWeighting = edrwEqual;
 +        }
 +        else
 +        {
 +            ir->eDisreWeighting = edrwConservative;
 +        }
 +    }
 +    gmx_fio_do_gmx_bool(fio, ir->bDisreMixed);
 +    gmx_fio_do_real(fio, ir->dr_fc);
 +    gmx_fio_do_real(fio, ir->dr_tau);
 +    gmx_fio_do_int(fio, ir->nstdisreout);
 +    if (file_version >= 22)
 +    {
 +        gmx_fio_do_real(fio, ir->orires_fc);
 +        gmx_fio_do_real(fio, ir->orires_tau);
 +        gmx_fio_do_int(fio, ir->nstorireout);
 +    }
 +    else
 +    {
 +        ir->orires_fc   = 0;
 +        ir->orires_tau  = 0;
 +        ir->nstorireout = 0;
 +    }
 +    if (file_version >= 26 && file_version < 79)
 +    {
 +        gmx_fio_do_real(fio, ir->dihre_fc);
 +        if (file_version < 56)
 +        {
 +            gmx_fio_do_real(fio, rdum);
 +            gmx_fio_do_int(fio, idum);
 +        }
 +    }
 +    else
 +    {
 +        ir->dihre_fc = 0;
 +    }
 +
 +    gmx_fio_do_real(fio, ir->em_stepsize);
 +    gmx_fio_do_real(fio, ir->em_tol);
 +    if (file_version >= 22)
 +    {
 +        gmx_fio_do_gmx_bool(fio, ir->bShakeSOR);
 +    }
 +    else if (bRead)
 +    {
 +        ir->bShakeSOR = TRUE;
 +    }
 +    if (file_version >= 11)
 +    {
 +        gmx_fio_do_int(fio, ir->niter);
 +    }
 +    else if (bRead)
 +    {
 +        ir->niter = 25;
 +        fprintf(stderr, "Note: niter not in run input file, setting it to %d\n",
 +                ir->niter);
 +    }
 +    if (file_version >= 21)
 +    {
 +        gmx_fio_do_real(fio, ir->fc_stepsize);
 +    }
 +    else
 +    {
 +        ir->fc_stepsize = 0;
 +    }
 +    gmx_fio_do_int(fio, ir->eConstrAlg);
 +    gmx_fio_do_int(fio, ir->nProjOrder);
 +    gmx_fio_do_real(fio, ir->LincsWarnAngle);
 +    if (file_version <= 14)
 +    {
 +        gmx_fio_do_int(fio, idum);
 +    }
 +    if (file_version >= 26)
 +    {
 +        gmx_fio_do_int(fio, ir->nLincsIter);
 +    }
 +    else if (bRead)
 +    {
 +        ir->nLincsIter = 1;
 +        fprintf(stderr, "Note: nLincsIter not in run input file, setting it to %d\n",
 +                ir->nLincsIter);
 +    }
 +    if (file_version < 33)
 +    {
 +        gmx_fio_do_real(fio, bd_temp);
 +    }
 +    gmx_fio_do_real(fio, ir->bd_fric);
 +    gmx_fio_do_int(fio, ir->ld_seed);
 +    if (file_version >= 33)
 +    {
 +        for (i = 0; i < DIM; i++)
 +        {
 +            gmx_fio_do_rvec(fio, ir->deform[i]);
 +        }
 +    }
 +    else
 +    {
 +        for (i = 0; i < DIM; i++)
 +        {
 +            clear_rvec(ir->deform[i]);
 +        }
 +    }
 +    if (file_version >= 14)
 +    {
 +        gmx_fio_do_real(fio, ir->cos_accel);
 +    }
 +    else if (bRead)
 +    {
 +        ir->cos_accel = 0;
 +    }
 +    gmx_fio_do_int(fio, ir->userint1);
 +    gmx_fio_do_int(fio, ir->userint2);
 +    gmx_fio_do_int(fio, ir->userint3);
 +    gmx_fio_do_int(fio, ir->userint4);
 +    gmx_fio_do_real(fio, ir->userreal1);
 +    gmx_fio_do_real(fio, ir->userreal2);
 +    gmx_fio_do_real(fio, ir->userreal3);
 +    gmx_fio_do_real(fio, ir->userreal4);
 +
 +    /* AdResS stuff */
 +    if (file_version >= 77)
 +    {
 +        gmx_fio_do_gmx_bool(fio, ir->bAdress);
 +        if (ir->bAdress)
 +        {
 +            if (bRead)
 +            {
 +                snew(ir->adress, 1);
 +            }
 +            gmx_fio_do_int(fio, ir->adress->type);
 +            gmx_fio_do_real(fio, ir->adress->const_wf);
 +            gmx_fio_do_real(fio, ir->adress->ex_width);
 +            gmx_fio_do_real(fio, ir->adress->hy_width);
 +            gmx_fio_do_int(fio, ir->adress->icor);
 +            gmx_fio_do_int(fio, ir->adress->site);
 +            gmx_fio_do_rvec(fio, ir->adress->refs);
 +            gmx_fio_do_int(fio, ir->adress->n_tf_grps);
 +            gmx_fio_do_real(fio, ir->adress->ex_forcecap);
 +            gmx_fio_do_int(fio, ir->adress->n_energy_grps);
 +            gmx_fio_do_int(fio, ir->adress->do_hybridpairs);
 +
 +            if (bRead)
 +            {
 +                snew(ir->adress->tf_table_index, ir->adress->n_tf_grps);
 +            }
 +            if (ir->adress->n_tf_grps > 0)
 +            {
 +                bDum = gmx_fio_ndo_int(fio, ir->adress->tf_table_index, ir->adress->n_tf_grps);
 +            }
 +            if (bRead)
 +            {
 +                snew(ir->adress->group_explicit, ir->adress->n_energy_grps);
 +            }
 +            if (ir->adress->n_energy_grps > 0)
 +            {
 +                bDum = gmx_fio_ndo_int(fio, ir->adress->group_explicit, ir->adress->n_energy_grps);
 +            }
 +        }
 +    }
 +    else
 +    {
 +        ir->bAdress = FALSE;
 +    }
 +
 +    /* pull stuff */
 +    if (file_version >= 48)
 +    {
 +        gmx_fio_do_int(fio, ir->ePull);
 +        if (ir->ePull != epullNO)
 +        {
 +            if (bRead)
 +            {
 +                snew(ir->pull, 1);
 +            }
 +            do_pull(fio, ir->pull, bRead, file_version);
 +        }
 +    }
 +    else
 +    {
 +        ir->ePull = epullNO;
 +    }
 +
 +    /* Enforced rotation */
 +    if (file_version >= 74)
 +    {
 +        gmx_fio_do_int(fio, ir->bRot);
 +        if (ir->bRot == TRUE)
 +        {
 +            if (bRead)
 +            {
 +                snew(ir->rot, 1);
 +            }
 +            do_rot(fio, ir->rot, bRead, file_version);
 +        }
 +    }
 +    else
 +    {
 +        ir->bRot = FALSE;
 +    }
 +
 +    /* grpopts stuff */
 +    gmx_fio_do_int(fio, ir->opts.ngtc);
 +    if (file_version >= 69)
 +    {
 +        gmx_fio_do_int(fio, ir->opts.nhchainlength);
 +    }
 +    else
 +    {
 +        ir->opts.nhchainlength = 1;
 +    }
 +    gmx_fio_do_int(fio, ir->opts.ngacc);
 +    gmx_fio_do_int(fio, ir->opts.ngfrz);
 +    gmx_fio_do_int(fio, ir->opts.ngener);
 +
 +    if (bRead)
 +    {
 +        snew(ir->opts.nrdf,   ir->opts.ngtc);
 +        snew(ir->opts.ref_t,  ir->opts.ngtc);
 +        snew(ir->opts.annealing, ir->opts.ngtc);
 +        snew(ir->opts.anneal_npoints, ir->opts.ngtc);
 +        snew(ir->opts.anneal_time, ir->opts.ngtc);
 +        snew(ir->opts.anneal_temp, ir->opts.ngtc);
 +        snew(ir->opts.tau_t,  ir->opts.ngtc);
 +        snew(ir->opts.nFreeze, ir->opts.ngfrz);
 +        snew(ir->opts.acc,    ir->opts.ngacc);
 +        snew(ir->opts.egp_flags, ir->opts.ngener*ir->opts.ngener);
 +    }
 +    if (ir->opts.ngtc > 0)
 +    {
 +        if (bRead && file_version < 13)
 +        {
 +            snew(tmp, ir->opts.ngtc);
 +            bDum = gmx_fio_ndo_int(fio, tmp, ir->opts.ngtc);
 +            for (i = 0; i < ir->opts.ngtc; i++)
 +            {
 +                ir->opts.nrdf[i] = tmp[i];
 +            }
 +            sfree(tmp);
 +        }
 +        else
 +        {
 +            bDum = gmx_fio_ndo_real(fio, ir->opts.nrdf, ir->opts.ngtc);
 +        }
 +        bDum = gmx_fio_ndo_real(fio, ir->opts.ref_t, ir->opts.ngtc);
 +        bDum = gmx_fio_ndo_real(fio, ir->opts.tau_t, ir->opts.ngtc);
 +        if (file_version < 33 && ir->eI == eiBD)
 +        {
 +            for (i = 0; i < ir->opts.ngtc; i++)
 +            {
 +                ir->opts.tau_t[i] = bd_temp;
 +            }
 +        }
 +    }
 +    if (ir->opts.ngfrz > 0)
 +    {
 +        bDum = gmx_fio_ndo_ivec(fio, ir->opts.nFreeze, ir->opts.ngfrz);
 +    }
 +    if (ir->opts.ngacc > 0)
 +    {
 +        gmx_fio_ndo_rvec(fio, ir->opts.acc, ir->opts.ngacc);
 +    }
 +    if (file_version >= 12)
 +    {
 +        bDum = gmx_fio_ndo_int(fio, ir->opts.egp_flags,
 +                               ir->opts.ngener*ir->opts.ngener);
 +    }
 +
 +    if (bRead && file_version < 26)
 +    {
 +        for (i = 0; i < ir->opts.ngtc; i++)
 +        {
 +            if (bSimAnn)
 +            {
 +                ir->opts.annealing[i]      = eannSINGLE;
 +                ir->opts.anneal_npoints[i] = 2;
 +                snew(ir->opts.anneal_time[i], 2);
 +                snew(ir->opts.anneal_temp[i], 2);
 +                /* calculate the starting/ending temperatures from reft, zerotemptime, and nsteps */
 +                finish_t                   = ir->init_t + ir->nsteps * ir->delta_t;
 +                init_temp                  = ir->opts.ref_t[i]*(1-ir->init_t/zerotemptime);
 +                finish_temp                = ir->opts.ref_t[i]*(1-finish_t/zerotemptime);
 +                ir->opts.anneal_time[i][0] = ir->init_t;
 +                ir->opts.anneal_time[i][1] = finish_t;
 +                ir->opts.anneal_temp[i][0] = init_temp;
 +                ir->opts.anneal_temp[i][1] = finish_temp;
 +            }
 +            else
 +            {
 +                ir->opts.annealing[i]      = eannNO;
 +                ir->opts.anneal_npoints[i] = 0;
 +            }
 +        }
 +    }
 +    else
 +    {
 +        /* file version 26 or later */
 +        /* First read the lists with annealing and npoints for each group */
 +        bDum = gmx_fio_ndo_int(fio, ir->opts.annealing, ir->opts.ngtc);
 +        bDum = gmx_fio_ndo_int(fio, ir->opts.anneal_npoints, ir->opts.ngtc);
 +        for (j = 0; j < (ir->opts.ngtc); j++)
 +        {
 +            k = ir->opts.anneal_npoints[j];
 +            if (bRead)
 +            {
 +                snew(ir->opts.anneal_time[j], k);
 +                snew(ir->opts.anneal_temp[j], k);
 +            }
 +            bDum = gmx_fio_ndo_real(fio, ir->opts.anneal_time[j], k);
 +            bDum = gmx_fio_ndo_real(fio, ir->opts.anneal_temp[j], k);
 +        }
 +    }
 +    /* Walls */
 +    if (file_version >= 45)
 +    {
 +        gmx_fio_do_int(fio, ir->nwall);
 +        gmx_fio_do_int(fio, ir->wall_type);
 +        if (file_version >= 50)
 +        {
 +            gmx_fio_do_real(fio, ir->wall_r_linpot);
 +        }
 +        else
 +        {
 +            ir->wall_r_linpot = -1;
 +        }
 +        gmx_fio_do_int(fio, ir->wall_atomtype[0]);
 +        gmx_fio_do_int(fio, ir->wall_atomtype[1]);
 +        gmx_fio_do_real(fio, ir->wall_density[0]);
 +        gmx_fio_do_real(fio, ir->wall_density[1]);
 +        gmx_fio_do_real(fio, ir->wall_ewald_zfac);
 +    }
 +    else
 +    {
 +        ir->nwall            = 0;
 +        ir->wall_type        = 0;
 +        ir->wall_atomtype[0] = -1;
 +        ir->wall_atomtype[1] = -1;
 +        ir->wall_density[0]  = 0;
 +        ir->wall_density[1]  = 0;
 +        ir->wall_ewald_zfac  = 3;
 +    }
 +    /* Cosine stuff for electric fields */
 +    for (j = 0; (j < DIM); j++)
 +    {
 +        gmx_fio_do_int(fio, ir->ex[j].n);
 +        gmx_fio_do_int(fio, ir->et[j].n);
 +        if (bRead)
 +        {
 +            snew(ir->ex[j].a,  ir->ex[j].n);
 +            snew(ir->ex[j].phi, ir->ex[j].n);
 +            snew(ir->et[j].a,  ir->et[j].n);
 +            snew(ir->et[j].phi, ir->et[j].n);
 +        }
 +        bDum = gmx_fio_ndo_real(fio, ir->ex[j].a,  ir->ex[j].n);
 +        bDum = gmx_fio_ndo_real(fio, ir->ex[j].phi, ir->ex[j].n);
 +        bDum = gmx_fio_ndo_real(fio, ir->et[j].a,  ir->et[j].n);
 +        bDum = gmx_fio_ndo_real(fio, ir->et[j].phi, ir->et[j].n);
 +    }
 +
 +    /* QMMM stuff */
 +    if (file_version >= 39)
 +    {
 +        gmx_fio_do_gmx_bool(fio, ir->bQMMM);
 +        gmx_fio_do_int(fio, ir->QMMMscheme);
 +        gmx_fio_do_real(fio, ir->scalefactor);
 +        gmx_fio_do_int(fio, ir->opts.ngQM);
 +        if (bRead)
 +        {
 +            snew(ir->opts.QMmethod,    ir->opts.ngQM);
 +            snew(ir->opts.QMbasis,     ir->opts.ngQM);
 +            snew(ir->opts.QMcharge,    ir->opts.ngQM);
 +            snew(ir->opts.QMmult,      ir->opts.ngQM);
 +            snew(ir->opts.bSH,         ir->opts.ngQM);
 +            snew(ir->opts.CASorbitals, ir->opts.ngQM);
 +            snew(ir->opts.CASelectrons, ir->opts.ngQM);
 +            snew(ir->opts.SAon,        ir->opts.ngQM);
 +            snew(ir->opts.SAoff,       ir->opts.ngQM);
 +            snew(ir->opts.SAsteps,     ir->opts.ngQM);
 +            snew(ir->opts.bOPT,        ir->opts.ngQM);
 +            snew(ir->opts.bTS,         ir->opts.ngQM);
 +        }
 +        if (ir->opts.ngQM > 0)
 +        {
 +            bDum = gmx_fio_ndo_int(fio, ir->opts.QMmethod, ir->opts.ngQM);
 +            bDum = gmx_fio_ndo_int(fio, ir->opts.QMbasis, ir->opts.ngQM);
 +            bDum = gmx_fio_ndo_int(fio, ir->opts.QMcharge, ir->opts.ngQM);
 +            bDum = gmx_fio_ndo_int(fio, ir->opts.QMmult, ir->opts.ngQM);
 +            bDum = gmx_fio_ndo_gmx_bool(fio, ir->opts.bSH, ir->opts.ngQM);
 +            bDum = gmx_fio_ndo_int(fio, ir->opts.CASorbitals, ir->opts.ngQM);
 +            bDum = gmx_fio_ndo_int(fio, ir->opts.CASelectrons, ir->opts.ngQM);
 +            bDum = gmx_fio_ndo_real(fio, ir->opts.SAon, ir->opts.ngQM);
 +            bDum = gmx_fio_ndo_real(fio, ir->opts.SAoff, ir->opts.ngQM);
 +            bDum = gmx_fio_ndo_int(fio, ir->opts.SAsteps, ir->opts.ngQM);
 +            bDum = gmx_fio_ndo_gmx_bool(fio, ir->opts.bOPT, ir->opts.ngQM);
 +            bDum = gmx_fio_ndo_gmx_bool(fio, ir->opts.bTS, ir->opts.ngQM);
 +        }
 +        /* end of QMMM stuff */
 +    }
 +}
 +
 +/* Transfer the four harmonic parameters of one interaction through fio:
 + * harmonic.rA, harmonic.krA, harmonic.rB and harmonic.krB, in that order.
 + *
 + * The same calls are used for both reading and writing; the bRead argument
 + * is accepted for symmetry with the other do_* routines but is not used here.
 + * NOTE(review): the call order presumably defines the layout of the values
 + * in the file, so the statements should not be reordered - confirm.
 + */
 +static void do_harm(t_fileio *fio, t_iparams *iparams, gmx_bool bRead)
 +{
 +    gmx_fio_do_real(fio, iparams->harmonic.rA);
 +    gmx_fio_do_real(fio, iparams->harmonic.krA);
 +    gmx_fio_do_real(fio, iparams->harmonic.rB);
 +    gmx_fio_do_real(fio, iparams->harmonic.krB);
 +}
 +
 +void do_iparams(t_fileio *fio, t_functype ftype, t_iparams *iparams,
 +                gmx_bool bRead, int file_version)
 +{
 +    int      idum;
 +    gmx_bool bDum;
 +    real     rdum;
 +
 +    if (!bRead)
 +    {
 +        gmx_fio_set_comment(fio, interaction_function[ftype].name);
 +    }
 +    switch (ftype)
 +    {
 +        case F_ANGLES:
 +        case F_G96ANGLES:
 +        case F_BONDS:
 +        case F_G96BONDS:
 +        case F_HARMONIC:
 +        case F_IDIHS:
 +            do_harm(fio, iparams, bRead);
 +            if ((ftype == F_ANGRES || ftype == F_ANGRESZ) && bRead)
 +            {
 +                /* Correct incorrect storage of parameters */
 +                iparams->pdihs.phiB = iparams->pdihs.phiA;
 +                iparams->pdihs.cpB  = iparams->pdihs.cpA;
 +            }
 +            break;
 +        case F_LINEAR_ANGLES:
 +            gmx_fio_do_real(fio, iparams->linangle.klinA);
 +            gmx_fio_do_real(fio, iparams->linangle.aA);
 +            gmx_fio_do_real(fio, iparams->linangle.klinB);
 +            gmx_fio_do_real(fio, iparams->linangle.aB);
 +            break;
 +        case F_FENEBONDS:
 +            gmx_fio_do_real(fio, iparams->fene.bm);
 +            gmx_fio_do_real(fio, iparams->fene.kb);
 +            break;
 +        case F_RESTRBONDS:
 +            gmx_fio_do_real(fio, iparams->restraint.lowA);
 +            gmx_fio_do_real(fio, iparams->restraint.up1A);
 +            gmx_fio_do_real(fio, iparams->restraint.up2A);
 +            gmx_fio_do_real(fio, iparams->restraint.kA);
 +            gmx_fio_do_real(fio, iparams->restraint.lowB);
 +            gmx_fio_do_real(fio, iparams->restraint.up1B);
 +            gmx_fio_do_real(fio, iparams->restraint.up2B);
 +            gmx_fio_do_real(fio, iparams->restraint.kB);
 +            break;
 +        case F_TABBONDS:
 +        case F_TABBONDSNC:
 +        case F_TABANGLES:
 +        case F_TABDIHS:
 +            gmx_fio_do_real(fio, iparams->tab.kA);
 +            gmx_fio_do_int(fio, iparams->tab.table);
 +            gmx_fio_do_real(fio, iparams->tab.kB);
 +            break;
 +        case F_CROSS_BOND_BONDS:
 +            gmx_fio_do_real(fio, iparams->cross_bb.r1e);
 +            gmx_fio_do_real(fio, iparams->cross_bb.r2e);
 +            gmx_fio_do_real(fio, iparams->cross_bb.krr);
 +            break;
 +        case F_CROSS_BOND_ANGLES:
 +            gmx_fio_do_real(fio, iparams->cross_ba.r1e);
 +            gmx_fio_do_real(fio, iparams->cross_ba.r2e);
 +            gmx_fio_do_real(fio, iparams->cross_ba.r3e);
 +            gmx_fio_do_real(fio, iparams->cross_ba.krt);
 +            break;
 +        case F_UREY_BRADLEY:
 +            gmx_fio_do_real(fio, iparams->u_b.thetaA);
 +            gmx_fio_do_real(fio, iparams->u_b.kthetaA);
 +            gmx_fio_do_real(fio, iparams->u_b.r13A);
 +            gmx_fio_do_real(fio, iparams->u_b.kUBA);
 +            if (file_version >= 79)
 +            {
 +                gmx_fio_do_real(fio, iparams->u_b.thetaB);
 +                gmx_fio_do_real(fio, iparams->u_b.kthetaB);
 +                gmx_fio_do_real(fio, iparams->u_b.r13B);
 +                gmx_fio_do_real(fio, iparams->u_b.kUBB);
 +            }
 +            else
 +            {
 +                iparams->u_b.thetaB  = iparams->u_b.thetaA;
 +                iparams->u_b.kthetaB = iparams->u_b.kthetaA;
 +                iparams->u_b.r13B    = iparams->u_b.r13A;
 +                iparams->u_b.kUBB    = iparams->u_b.kUBA;
 +            }
 +            break;
 +        case F_QUARTIC_ANGLES:
 +            gmx_fio_do_real(fio, iparams->qangle.theta);
 +            bDum = gmx_fio_ndo_real(fio, iparams->qangle.c, 5);
 +            break;
 +        case F_BHAM:
 +            gmx_fio_do_real(fio, iparams->bham.a);
 +            gmx_fio_do_real(fio, iparams->bham.b);
 +            gmx_fio_do_real(fio, iparams->bham.c);
 +            break;
 +        case F_MORSE:
 +            gmx_fio_do_real(fio, iparams->morse.b0A);
 +            gmx_fio_do_real(fio, iparams->morse.cbA);
 +            gmx_fio_do_real(fio, iparams->morse.betaA);
 +            if (file_version >= 79)
 +            {
 +                gmx_fio_do_real(fio, iparams->morse.b0B);
 +                gmx_fio_do_real(fio, iparams->morse.cbB);
 +                gmx_fio_do_real(fio, iparams->morse.betaB);
 +            }
 +            else
 +            {
 +                iparams->morse.b0B   = iparams->morse.b0A;
 +                iparams->morse.cbB   = iparams->morse.cbA;
 +                iparams->morse.betaB = iparams->morse.betaA;
 +            }
 +            break;
 +        case F_CUBICBONDS:
 +            gmx_fio_do_real(fio, iparams->cubic.b0);
 +            gmx_fio_do_real(fio, iparams->cubic.kb);
 +            gmx_fio_do_real(fio, iparams->cubic.kcub);
 +            break;
 +        case F_CONNBONDS:
 +            break;
 +        case F_POLARIZATION:
 +            gmx_fio_do_real(fio, iparams->polarize.alpha);
 +            break;
 +        case F_ANHARM_POL:
 +            gmx_fio_do_real(fio, iparams->anharm_polarize.alpha);
 +            gmx_fio_do_real(fio, iparams->anharm_polarize.drcut);
 +            gmx_fio_do_real(fio, iparams->anharm_polarize.khyp);
 +            break;
 +        case F_WATER_POL:
 +            if (file_version < 31)
 +            {
 +                gmx_fatal(FARGS, "Old tpr files with water_polarization not supported. Make a new.");
 +            }
 +            gmx_fio_do_real(fio, iparams->wpol.al_x);
 +            gmx_fio_do_real(fio, iparams->wpol.al_y);
 +            gmx_fio_do_real(fio, iparams->wpol.al_z);
 +            gmx_fio_do_real(fio, iparams->wpol.rOH);
 +            gmx_fio_do_real(fio, iparams->wpol.rHH);
 +            gmx_fio_do_real(fio, iparams->wpol.rOD);
 +            break;
 +        case F_THOLE_POL:
 +            gmx_fio_do_real(fio, iparams->thole.a);
 +            gmx_fio_do_real(fio, iparams->thole.alpha1);
 +            gmx_fio_do_real(fio, iparams->thole.alpha2);
 +            gmx_fio_do_real(fio, iparams->thole.rfac);
 +            break;
 +        case F_LJ:
 +            gmx_fio_do_real(fio, iparams->lj.c6);
 +            gmx_fio_do_real(fio, iparams->lj.c12);
 +            break;
 +        case F_LJ14:
 +            gmx_fio_do_real(fio, iparams->lj14.c6A);
 +            gmx_fio_do_real(fio, iparams->lj14.c12A);
 +            gmx_fio_do_real(fio, iparams->lj14.c6B);
 +            gmx_fio_do_real(fio, iparams->lj14.c12B);
 +            break;
 +        case F_LJC14_Q:
 +            gmx_fio_do_real(fio, iparams->ljc14.fqq);
 +            gmx_fio_do_real(fio, iparams->ljc14.qi);
 +            gmx_fio_do_real(fio, iparams->ljc14.qj);
 +            gmx_fio_do_real(fio, iparams->ljc14.c6);
 +            gmx_fio_do_real(fio, iparams->ljc14.c12);
 +            break;
 +        case F_LJC_PAIRS_NB:
 +            gmx_fio_do_real(fio, iparams->ljcnb.qi);
 +            gmx_fio_do_real(fio, iparams->ljcnb.qj);
 +            gmx_fio_do_real(fio, iparams->ljcnb.c6);
 +            gmx_fio_do_real(fio, iparams->ljcnb.c12);
 +            break;
 +        case F_PDIHS:
 +        case F_PIDIHS:
 +        case F_ANGRES:
 +        case F_ANGRESZ:
 +            gmx_fio_do_real(fio, iparams->pdihs.phiA);
 +            gmx_fio_do_real(fio, iparams->pdihs.cpA);
 +            if ((ftype == F_ANGRES || ftype == F_ANGRESZ) && file_version < 42)
 +            {
 +                /* Read the incorrectly stored multiplicity */
 +                gmx_fio_do_real(fio, iparams->harmonic.rB);
 +                gmx_fio_do_real(fio, iparams->harmonic.krB);
 +                iparams->pdihs.phiB = iparams->pdihs.phiA;
 +                iparams->pdihs.cpB  = iparams->pdihs.cpA;
 +            }
 +            else
 +            {
 +                gmx_fio_do_real(fio, iparams->pdihs.phiB);
 +                gmx_fio_do_real(fio, iparams->pdihs.cpB);
 +                gmx_fio_do_int(fio, iparams->pdihs.mult);
 +            }
 +            break;
 +        case F_DISRES:
 +            gmx_fio_do_int(fio, iparams->disres.label);
 +            gmx_fio_do_int(fio, iparams->disres.type);
 +            gmx_fio_do_real(fio, iparams->disres.low);
 +            gmx_fio_do_real(fio, iparams->disres.up1);
 +            gmx_fio_do_real(fio, iparams->disres.up2);
 +            gmx_fio_do_real(fio, iparams->disres.kfac);
 +            break;
 +        case F_ORIRES:
 +            gmx_fio_do_int(fio, iparams->orires.ex);
 +            gmx_fio_do_int(fio, iparams->orires.label);
 +            gmx_fio_do_int(fio, iparams->orires.power);
 +            gmx_fio_do_real(fio, iparams->orires.c);
 +            gmx_fio_do_real(fio, iparams->orires.obs);
 +            gmx_fio_do_real(fio, iparams->orires.kfac);
 +            break;
 +        case F_DIHRES:
 +            if (file_version < 72)
 +            {
 +                gmx_fio_do_int(fio, idum);
 +                gmx_fio_do_int(fio, idum);
 +            }
 +            gmx_fio_do_real(fio, iparams->dihres.phiA);
 +            gmx_fio_do_real(fio, iparams->dihres.dphiA);
 +            gmx_fio_do_real(fio, iparams->dihres.kfacA);
 +            if (file_version >= 72)
 +            {
 +                gmx_fio_do_real(fio, iparams->dihres.phiB);
 +                gmx_fio_do_real(fio, iparams->dihres.dphiB);
 +                gmx_fio_do_real(fio, iparams->dihres.kfacB);
 +            }
 +            else
 +            {
 +                iparams->dihres.phiB  = iparams->dihres.phiA;
 +                iparams->dihres.dphiB = iparams->dihres.dphiA;
 +                iparams->dihres.kfacB = iparams->dihres.kfacA;
 +            }
 +            break;
 +        case F_POSRES:
 +            gmx_fio_do_rvec(fio, iparams->posres.pos0A);
 +            gmx_fio_do_rvec(fio, iparams->posres.fcA);
 +            if (bRead && file_version < 27)
 +            {
 +                copy_rvec(iparams->posres.pos0A, iparams->posres.pos0B);
 +                copy_rvec(iparams->posres.fcA, iparams->posres.fcB);
 +            }
 +            else
 +            {
 +                gmx_fio_do_rvec(fio, iparams->posres.pos0B);
 +                gmx_fio_do_rvec(fio, iparams->posres.fcB);
 +            }
 +            break;
 +        case F_FBPOSRES:
 +            gmx_fio_do_int(fio, iparams->fbposres.geom);
 +            gmx_fio_do_rvec(fio, iparams->fbposres.pos0);
 +            gmx_fio_do_real(fio, iparams->fbposres.r);
 +            gmx_fio_do_real(fio, iparams->fbposres.k);
 +            break;
 +        case F_RBDIHS:
 +            bDum = gmx_fio_ndo_real(fio, iparams->rbdihs.rbcA, NR_RBDIHS);
 +            if (file_version >= 25)
 +            {
 +                bDum = gmx_fio_ndo_real(fio, iparams->rbdihs.rbcB, NR_RBDIHS);
 +            }
 +            break;
 +        case F_FOURDIHS:
 +            /* Fourier dihedrals are internally represented
 +             * as Ryckaert-Bellemans since those are faster to compute.
 +             */
 +            bDum = gmx_fio_ndo_real(fio, iparams->rbdihs.rbcA, NR_RBDIHS);
 +            bDum = gmx_fio_ndo_real(fio, iparams->rbdihs.rbcB, NR_RBDIHS);
 +            break;
 +        case F_CONSTR:
 +        case F_CONSTRNC:
 +            gmx_fio_do_real(fio, iparams->constr.dA);
 +            gmx_fio_do_real(fio, iparams->constr.dB);
 +            break;
 +        case F_SETTLE:
 +            gmx_fio_do_real(fio, iparams->settle.doh);
 +            gmx_fio_do_real(fio, iparams->settle.dhh);
 +            break;
 +        case F_VSITE2:
 +            gmx_fio_do_real(fio, iparams->vsite.a);
 +            break;
 +        case F_VSITE3:
 +        case F_VSITE3FD:
 +        case F_VSITE3FAD:
 +            gmx_fio_do_real(fio, iparams->vsite.a);
 +            gmx_fio_do_real(fio, iparams->vsite.b);
 +            break;
 +        case F_VSITE3OUT:
 +        case F_VSITE4FD:
 +        case F_VSITE4FDN:
 +            gmx_fio_do_real(fio, iparams->vsite.a);
 +            gmx_fio_do_real(fio, iparams->vsite.b);
 +            gmx_fio_do_real(fio, iparams->vsite.c);
 +            break;
 +        case F_VSITEN:
 +            gmx_fio_do_int(fio, iparams->vsiten.n);
 +            gmx_fio_do_real(fio, iparams->vsiten.a);
 +            break;
 +        case F_GB12:
 +        case F_GB13:
 +        case F_GB14:
 +            /* We got rid of some parameters in version 68 */
 +            if (bRead && file_version < 68)
 +            {
 +                gmx_fio_do_real(fio, rdum);
 +                gmx_fio_do_real(fio, rdum);
 +                gmx_fio_do_real(fio, rdum);
 +                gmx_fio_do_real(fio, rdum);
 +            }
 +            gmx_fio_do_real(fio, iparams->gb.sar);
 +            gmx_fio_do_real(fio, iparams->gb.st);
 +            gmx_fio_do_real(fio, iparams->gb.pi);
 +            gmx_fio_do_real(fio, iparams->gb.gbr);
 +            gmx_fio_do_real(fio, iparams->gb.bmlt);
 +            break;
 +        case F_CMAP:
 +            gmx_fio_do_int(fio, iparams->cmap.cmapA);
 +            gmx_fio_do_int(fio, iparams->cmap.cmapB);
 +            break;
 +        default:
 +            gmx_fatal(FARGS, "unknown function type %d (%s) in %s line %d",
 +                      ftype, interaction_function[ftype].name, __FILE__, __LINE__);
 +    }
 +    if (!bRead)
 +    {
 +        gmx_fio_unset_comment(fio);
 +    }
 +}
 +
 +static void do_ilist(t_fileio *fio, t_ilist *ilist, gmx_bool bRead, int file_version,
 +                     int ftype)
 +{
 +    int      i, k, idum;
 +    gmx_bool bDum = TRUE;
 +
 +    if (!bRead)
 +    {
 +        gmx_fio_set_comment(fio, interaction_function[ftype].name);
 +    }
 +    if (file_version < 44)
 +    {
 +        for (i = 0; i < MAXNODES; i++)
 +        {
 +            gmx_fio_do_int(fio, idum);
 +        }
 +    }
 +    gmx_fio_do_int(fio, ilist->nr);
 +    if (bRead)
 +    {
 +        snew(ilist->iatoms, ilist->nr);
 +    }
 +    bDum = gmx_fio_ndo_int(fio, ilist->iatoms, ilist->nr);
 +    if (!bRead)
 +    {
 +        gmx_fio_unset_comment(fio);
 +    }
 +}
 +/*
 + * Read or write the force field parameter table of a topology.
 + *
 + * Items are processed in this order: atnr, one dummy int (only for
 + * file_version < 57), ntypes, the functype array, reppow (only for
 + * file_version >= 66, otherwise it is set to 12.0), fudgeQQ (only for
 + * file_version >= 57) and finally the parameters of every type via
 + * do_iparams().
 + *
 + * When bRead is TRUE the functype and iparams arrays are allocated here
 + * and function type numbers are renumbered using the ftupd table.
 + */
 +static void do_ffparams(t_fileio *fio, gmx_ffparams_t *ffparams,
 +                        gmx_bool bRead, int file_version)
 +{
 +    int          idum, i, j;
 +    gmx_bool     bDum = TRUE;
 +    unsigned int k;
 +
 +    gmx_fio_do_int(fio, ffparams->atnr);
 +    if (file_version < 57)
 +    {
 +        gmx_fio_do_int(fio, idum); /* value is not used afterwards */
 +    }
 +    gmx_fio_do_int(fio, ffparams->ntypes);
 +    if (bRead && debug)
 +    {
 +        fprintf(debug, "ffparams->atnr = %d, ntypes = %d\n",
 +                ffparams->atnr, ffparams->ntypes);
 +    }
 +    if (bRead)
 +    {
 +        snew(ffparams->functype, ffparams->ntypes);
 +        snew(ffparams->iparams, ffparams->ntypes);
 +    }
 +    /* Read/write all the function types */
 +    bDum = gmx_fio_ndo_int(fio, ffparams->functype, ffparams->ntypes);
 +    if (bRead && debug)
 +    {
 +        pr_ivec(debug, 0, "functype", ffparams->functype, ffparams->ntypes, TRUE);
 +    }
 +
 +    if (file_version >= 66)
 +    {
 +        gmx_fio_do_double(fio, ffparams->reppow);
 +    }
 +    else
 +    {
 +        ffparams->reppow = 12.0;
 +    }
 +
 +    if (file_version >= 57)
 +    {
 +        gmx_fio_do_real(fio, ffparams->fudgeQQ);
 +    }
 +
 +    /* Check whether all these function types are supported by the code.
 +     * In practice the code is backwards compatible, which means that the
 +     * numbering may have to be altered from old numbering to new numbering
 +     */
 +    for (i = 0; (i < ffparams->ntypes); i++)
 +    {
 +        if (bRead)
 +        {
 +            /* Loop over file versions */
 +            for (k = 0; (k < NFTUPD); k++)
 +            {
 +                /* Compare the read file_version to the update table */
 +                if ((file_version < ftupd[k].fvnr) &&
 +                    (ffparams->functype[i] >= ftupd[k].ftype))
 +                {
 +                    ffparams->functype[i] += 1; /* one shift per matching table entry */
 +                    if (debug)
 +                    {
 +                        fprintf(debug, "Incrementing function type %d to %d (due to %s)\n",
 +                                i, ffparams->functype[i],
 +                                interaction_function[ftupd[k].ftype].longname);
 +                        fflush(debug);
 +                    }
 +                }
 +            }
 +        }
 +
 +        /* Uses the (possibly renumbered) function type to select the layout */
 +        do_iparams(fio, ffparams->functype[i], &ffparams->iparams[i], bRead,
 +                   file_version);
 +        if (bRead && debug)
 +        {
 +            pr_iparams(debug, ffparams->functype[i], &ffparams->iparams[i]);
 +        }
 +    }
 +}
 +/*
 + * Expand an interaction list whose entries are 2 ints wide into entries
 + * that are 4 ints wide, in place.
 + *
 + * Each old entry (iatoms[2*i], iatoms[2*i+1]) becomes
 + * (iatoms[2*i], iatoms[2*i+1], iatoms[2*i+1]+1, iatoms[2*i+1]+2).
 + * The array is grown to twice its size and ilist->nr is doubled.
 + */
 +static void add_settle_atoms(t_ilist *ilist)
 +{
 +    int i;
 +
 +    /* Settle used to only store the first atom: add the other two */
 +    srenew(ilist->iatoms, 2*ilist->nr);
 +    /* Walk backwards so that entries not yet expanded are not overwritten */
 +    for (i = ilist->nr/2-1; i >= 0; i--)
 +    {
 +        ilist->iatoms[4*i+0] = ilist->iatoms[2*i+0];
 +        ilist->iatoms[4*i+1] = ilist->iatoms[2*i+1];
 +        ilist->iatoms[4*i+2] = ilist->iatoms[2*i+1] + 1;
 +        ilist->iatoms[4*i+3] = ilist->iatoms[2*i+1] + 2;
 +    }
 +    ilist->nr = 2*ilist->nr;
 +}
 +/*
 + * Read or write the F_NRE interaction lists in ilist[], in index order.
 + *
 + * When reading, a list j is not taken from the file but set to empty if
 + * the ftupd table has an entry with ftype == j and fvnr > file_version
 + * (presumably because that interaction type did not exist in such files).
 + * All other lists go through do_ilist(). For file_version < 78 a
 + * non-empty F_SETTLE list is additionally expanded by add_settle_atoms().
 + */
 +static void do_ilists(t_fileio *fio, t_ilist *ilist, gmx_bool bRead,
 +                      int file_version)
 +{
 +    int          i, j, renum[F_NRE];
 +    gmx_bool     bDum = TRUE, bClear;
 +    unsigned int k;
 +
 +    for (j = 0; (j < F_NRE); j++)
 +    {
 +        bClear = FALSE;
 +        if (bRead)
 +        {
 +            for (k = 0; k < NFTUPD; k++)
 +            {
 +                if ((file_version < ftupd[k].fvnr) && (j == ftupd[k].ftype))
 +                {
 +                    bClear = TRUE;
 +                }
 +            }
 +        }
 +        if (bClear)
 +        {
 +            ilist[j].nr     = 0;
 +            ilist[j].iatoms = NULL;
 +        }
 +        else
 +        {
 +            do_ilist(fio, &ilist[j], bRead, file_version, j);
 +            if (file_version < 78 && j == F_SETTLE && ilist[j].nr > 0)
 +            {
 +                add_settle_atoms(&ilist[j]);
 +            }
 +        }
 +        /*
 +           if (bRead && gmx_debug_at)
 +           pr_ilist(debug,0,interaction_function[j].longname,
 +               functype,&ilist[j],TRUE);
 +         */
 +    }
 +}
 +/*
 + * Read or write the force field parameters followed by the interaction
 + * lists of a single molecule type.
 + *
 + * After do_ffparams(), fudgeQQ is processed here for file_version >= 54.
 + * do_ffparams() itself handles fudgeQQ for file_version >= 57; within
 + * this part of the file do_idef() is only called for file_version < 57.
 + */
 +static void do_idef(t_fileio *fio, gmx_ffparams_t *ffparams, gmx_moltype_t *molt,
 +                    gmx_bool bRead, int file_version)
 +{
 +    do_ffparams(fio, ffparams, bRead, file_version);
 +
 +    if (file_version >= 54)
 +    {
 +        gmx_fio_do_real(fio, ffparams->fudgeQQ);
 +    }
 +
 +    do_ilists(fio, molt->ilist, bRead, file_version);
 +}
 +/*
 + * Read or write a t_block (an index array with block->nr+1 entries).
 + *
 + * Legacy layout handled here:
 + *  - file_version < 44: MAXNODES dummy ints precede the block;
 + *  - file_version < 51: an extra count (dum_nra) follows nr, and that
 + *    many ints follow the index array; they are processed into a
 + *    temporary buffer that is freed immediately.
 + *
 + * When bRead is TRUE the index array is allocated here.
 + */
 +static void do_block(t_fileio *fio, t_block *block, gmx_bool bRead, int file_version)
 +{
 +    int      i, idum, dum_nra, *dum_a;
 +    gmx_bool bDum = TRUE;
 +
 +    if (file_version < 44)
 +    {
 +        for (i = 0; i < MAXNODES; i++)
 +        {
 +            gmx_fio_do_int(fio, idum);
 +        }
 +    }
 +    gmx_fio_do_int(fio, block->nr);
 +    if (file_version < 51)
 +    {
 +        gmx_fio_do_int(fio, dum_nra);
 +    }
 +    if (bRead)
 +    {
 +        block->nalloc_index = block->nr+1;
 +        snew(block->index, block->nalloc_index);
 +    }
 +    bDum = gmx_fio_ndo_int(fio, block->index, block->nr+1);
 +
 +    /* dum_nra is only assigned, and only inspected, when file_version < 51 */
 +    if (file_version < 51 && dum_nra > 0)
 +    {
 +        snew(dum_a, dum_nra);
 +        bDum = gmx_fio_ndo_int(fio, dum_a, dum_nra);
 +        sfree(dum_a);
 +    }
 +}
 +/*
 + * Read or write a t_blocka: nr, nra, the index array (nr+1 entries) and
 + * the a array (nra entries), in that order.
 + *
 + * For file_version < 44, MAXNODES dummy ints precede the block.
 + * When bRead is TRUE both arrays are allocated here, also when nr or nra
 + * is zero (index then still gets one element).
 + */
 +static void do_blocka(t_fileio *fio, t_blocka *block, gmx_bool bRead,
 +                      int file_version)
 +{
 +    int      i, idum;
 +    gmx_bool bDum = TRUE;
 +
 +    if (file_version < 44)
 +    {
 +        for (i = 0; i < MAXNODES; i++)
 +        {
 +            gmx_fio_do_int(fio, idum);
 +        }
 +    }
 +    gmx_fio_do_int(fio, block->nr);
 +    gmx_fio_do_int(fio, block->nra);
 +    if (bRead)
 +    {
 +        block->nalloc_index = block->nr+1;
 +        snew(block->index, block->nalloc_index);
 +        block->nalloc_a = block->nra;
 +        snew(block->a, block->nalloc_a);
 +    }
 +    bDum = gmx_fio_ndo_int(fio, block->index, block->nr+1);
 +    bDum = gmx_fio_ndo_int(fio, block->a, block->nra);
 +}
 +/*
 + * Read or write one atom record.
 + *
 + * Fields, in order: m, q, mB, qB, type, typeB, ptype, resind and, for
 + * file_version >= 52, atomnumber (set to NOTSET when reading older files).
 + *
 + * For file_version < 57 the record is followed by one group number per
 + * group type. The number stored in the file (myngrp) is 8 for
 + * file_version < 23, 9 for file_version < 39 and ngrp otherwise; missing
 + * entries up to ngrp are set to 0. The values are copied to
 + * groups->grpnr[type][atnr], so atnr is used as the atom index here.
 + * The local buffer holds egcNR entries, so ngrp must not exceed egcNR.
 + */
 +static void do_atom(t_fileio *fio, t_atom *atom, int ngrp, gmx_bool bRead,
 +                    int file_version, gmx_groups_t *groups, int atnr)
 +{
 +    int i, myngrp;
 +
 +    gmx_fio_do_real(fio, atom->m);
 +    gmx_fio_do_real(fio, atom->q);
 +    gmx_fio_do_real(fio, atom->mB);
 +    gmx_fio_do_real(fio, atom->qB);
 +    gmx_fio_do_ushort(fio, atom->type);
 +    gmx_fio_do_ushort(fio, atom->typeB);
 +    gmx_fio_do_int(fio, atom->ptype);
 +    gmx_fio_do_int(fio, atom->resind);
 +    if (file_version >= 52)
 +    {
 +        gmx_fio_do_int(fio, atom->atomnumber);
 +    }
 +    else if (bRead)
 +    {
 +        atom->atomnumber = NOTSET;
 +    }
 +    if (file_version < 23)
 +    {
 +        myngrp = 8;
 +    }
 +    else if (file_version < 39)
 +    {
 +        myngrp = 9;
 +    }
 +    else
 +    {
 +        myngrp = ngrp;
 +    }
 +
 +    if (file_version < 57)
 +    {
 +        unsigned char uchar[egcNR];
 +        gmx_fio_ndo_uchar(fio, uchar, myngrp);
 +        for (i = myngrp; (i < ngrp); i++)
 +        {
 +            uchar[i] = 0;
 +        }
 +        /* Copy the old data format to the groups struct */
 +        for (i = 0; i < ngrp; i++)
 +        {
 +            groups->grpnr[i][atnr] = uchar[i];
 +        }
 +    }
 +}
 +/*
 + * Read or write the ngrp group descriptions in grps[].
 + *
 + * The number of groups present in the file (myngrp) is 8 for
 + * file_version < 23, 9 for file_version < 39 and ngrp otherwise.
 + * For each group present: nr, then the nm_ind array of nr ints
 + * (allocated here when bRead is TRUE).
 + * Groups not present in the file get nr = 1 and a freshly allocated
 + * nm_ind of one element; this happens regardless of bRead.
 + */
 +static void do_grps(t_fileio *fio, int ngrp, t_grps grps[], gmx_bool bRead,
 +                    int file_version)
 +{
 +    int      i, j, myngrp;
 +    gmx_bool bDum = TRUE;
 +
 +    if (file_version < 23)
 +    {
 +        myngrp = 8;
 +    }
 +    else if (file_version < 39)
 +    {
 +        myngrp = 9;
 +    }
 +    else
 +    {
 +        myngrp = ngrp;
 +    }
 +
 +    for (j = 0; (j < ngrp); j++)
 +    {
 +        if (j < myngrp)
 +        {
 +            gmx_fio_do_int(fio, grps[j].nr);
 +            if (bRead)
 +            {
 +                snew(grps[j].nm_ind, grps[j].nr);
 +            }
 +            bDum = gmx_fio_ndo_int(fio, grps[j].nm_ind, grps[j].nr);
 +        }
 +        else
 +        {
 +            grps[j].nr = 1;
 +            snew(grps[j].nm_ind, grps[j].nr);
 +        }
 +    }
 +}
 +/*
 + * Read or write one symbol table reference as a single int.
 + *
 + * Reading: the int is read and converted to a handle with
 + * get_symtab_handle(), which is stored in *nm.
 + * Writing: the int is obtained from *nm with lookup_symtab() and written.
 + */
 +static void do_symstr(t_fileio *fio, char ***nm, gmx_bool bRead, t_symtab *symtab)
 +{
 +    int ls;
 +
 +    if (bRead)
 +    {
 +        gmx_fio_do_int(fio, ls);
 +        *nm = get_symtab_handle(symtab, ls);
 +    }
 +    else
 +    {
 +        ls = lookup_symtab(symtab, *nm);
 +        gmx_fio_do_int(fio, ls);
 +    }
 +}
 +/*
 + * Read or write nstr symbol table references, nm[0] .. nm[nstr-1],
 + * by calling do_symstr() on each element in order.
 + */
 +static void do_strstr(t_fileio *fio, int nstr, char ***nm, gmx_bool bRead,
 +                      t_symtab *symtab)
 +{
 +    int  j;
 +
 +    for (j = 0; (j < nstr); j++)
 +    {
 +        do_symstr(fio, &(nm[j]), bRead, symtab);
 +    }
 +}
 +/*
 + * Read or write n residue info records from/to ri[].
 + *
 + * Each record is the residue name (a symbol table reference) and, for
 + * file_version >= 63, the fields nr and ic. For older versions
 + * nr is set to the 1-based position in the array and ic to ' '.
 + */
 +static void do_resinfo(t_fileio *fio, int n, t_resinfo *ri, gmx_bool bRead,
 +                       t_symtab *symtab, int file_version)
 +{
 +    int  j;
 +
 +    for (j = 0; (j < n); j++)
 +    {
 +        do_symstr(fio, &(ri[j].name), bRead, symtab);
 +        if (file_version >= 63)
 +        {
 +            gmx_fio_do_int(fio, ri[j].nr);
 +            gmx_fio_do_uchar(fio, ri[j].ic);
 +        }
 +        else
 +        {
 +            ri[j].nr = j + 1;
 +            ri[j].ic = ' ';
 +        }
 +    }
 +}
 +/*
 + * Read or write a t_atoms structure.
 + *
 + * Order of items: nr, nres, (ngrpname for file_version < 57), all atom
 + * records via do_atom(), atom names, atom types for the A and B state,
 + * residue info and, for file_version < 57, the group names and groups.
 + *
 + * When bRead is TRUE the atom, name, type and resinfo arrays are
 + * allocated here and pdbinfo is set to NULL. When reading files with
 + * file_version <= 20 no atom types are read; every type name is set to
 + * the symbol "?" instead.
 + *
 + * For file_version < 57 the per-atom group number arrays in groups are
 + * allocated with atoms->nr entries each; they are filled by do_atom().
 + */
 +static void do_atoms(t_fileio *fio, t_atoms *atoms, gmx_bool bRead, t_symtab *symtab,
 +                     int file_version,
 +                     gmx_groups_t *groups)
 +{
 +    int i;
 +
 +    gmx_fio_do_int(fio, atoms->nr);
 +    gmx_fio_do_int(fio, atoms->nres);
 +    if (file_version < 57)
 +    {
 +        gmx_fio_do_int(fio, groups->ngrpname);
 +        for (i = 0; i < egcNR; i++)
 +        {
 +            groups->ngrpnr[i] = atoms->nr;
 +            snew(groups->grpnr[i], groups->ngrpnr[i]);
 +        }
 +    }
 +    if (bRead)
 +    {
 +        snew(atoms->atom, atoms->nr);
 +        snew(atoms->atomname, atoms->nr);
 +        snew(atoms->atomtype, atoms->nr);
 +        snew(atoms->atomtypeB, atoms->nr);
 +        snew(atoms->resinfo, atoms->nres);
 +        if (file_version < 57)
 +        {
 +            snew(groups->grpname, groups->ngrpname);
 +        }
 +        atoms->pdbinfo = NULL;
 +    }
 +    for (i = 0; (i < atoms->nr); i++)
 +    {
 +        do_atom(fio, &atoms->atom[i], egcNR, bRead, file_version, groups, i);
 +    }
 +    do_strstr(fio, atoms->nr, atoms->atomname, bRead, symtab);
 +    if (bRead && (file_version <= 20))
 +    {
 +        for (i = 0; i < atoms->nr; i++)
 +        {
 +            atoms->atomtype[i]  = put_symtab(symtab, "?");
 +            atoms->atomtypeB[i] = put_symtab(symtab, "?");
 +        }
 +    }
 +    else
 +    {
 +        do_strstr(fio, atoms->nr, atoms->atomtype, bRead, symtab);
 +        do_strstr(fio, atoms->nr, atoms->atomtypeB, bRead, symtab);
 +    }
 +    do_resinfo(fio, atoms->nres, atoms->resinfo, bRead, symtab, file_version);
 +
 +    if (file_version < 57)
 +    {
 +        do_strstr(fio, groups->ngrpname, groups->grpname, bRead, symtab);
 +
 +        do_grps(fio, egcNR, groups->grps, bRead, file_version);
 +    }
 +}
 +/*
 + * Read or write a gmx_groups_t structure.
 + *
 + * Order of items: the egcNR groups via do_grps(), ngrpname, the group
 + * names and then, for each of the egcNR group types, the count
 + * ngrpnr[g] followed by that many unsigned chars (nothing follows a
 + * count of zero).
 + *
 + * When bRead is TRUE the name array and the non-empty grpnr arrays are
 + * allocated here; grpnr[g] is set to NULL when its count is zero.
 + */
 +static void do_groups(t_fileio *fio, gmx_groups_t *groups,
 +                      gmx_bool bRead, t_symtab *symtab,
 +                      int file_version)
 +{
 +    int      g, n, i;
 +    gmx_bool bDum = TRUE;
 +
 +    do_grps(fio, egcNR, groups->grps, bRead, file_version);
 +    gmx_fio_do_int(fio, groups->ngrpname);
 +    if (bRead)
 +    {
 +        snew(groups->grpname, groups->ngrpname);
 +    }
 +    do_strstr(fio, groups->ngrpname, groups->grpname, bRead, symtab);
 +    for (g = 0; g < egcNR; g++)
 +    {
 +        gmx_fio_do_int(fio, groups->ngrpnr[g]);
 +        if (groups->ngrpnr[g] == 0)
 +        {
 +            if (bRead)
 +            {
 +                groups->grpnr[g] = NULL;
 +            }
 +        }
 +        else
 +        {
 +            if (bRead)
 +            {
 +                snew(groups->grpnr[g], groups->ngrpnr[g]);
 +            }
 +            bDum = gmx_fio_ndo_uchar(fio, groups->grpnr[g], groups->ngrpnr[g]);
 +        }
 +    }
 +}
 +/*
 + * Read or write the per-atom-type arrays of a t_atomtypes structure.
 + *
 + * For file_version > 25: nr, then the radius, vol and surftens arrays,
 + * then atomnumber (file_version >= 40), then gb_radius and S_hct
 + * (file_version >= 60). When bRead is TRUE all six arrays are allocated
 + * with nr entries, including those that are not present in the file.
 + *
 + * For older versions nothing is read or written; nr is set to 0 and all
 + * array pointers to NULL.
 + *
 + * The symtab argument is not used in this function.
 + */
 +static void do_atomtypes(t_fileio *fio, t_atomtypes *atomtypes, gmx_bool bRead,
 +                         t_symtab *symtab, int file_version)
 +{
 +    int      i, j;
 +    gmx_bool bDum = TRUE;
 +
 +    if (file_version > 25)
 +    {
 +        gmx_fio_do_int(fio, atomtypes->nr);
 +        j = atomtypes->nr;
 +        if (bRead)
 +        {
 +            snew(atomtypes->radius, j);
 +            snew(atomtypes->vol, j);
 +            snew(atomtypes->surftens, j);
 +            snew(atomtypes->atomnumber, j);
 +            snew(atomtypes->gb_radius, j);
 +            snew(atomtypes->S_hct, j);
 +        }
 +        bDum = gmx_fio_ndo_real(fio, atomtypes->radius, j);
 +        bDum = gmx_fio_ndo_real(fio, atomtypes->vol, j);
 +        bDum = gmx_fio_ndo_real(fio, atomtypes->surftens, j);
 +        if (file_version >= 40)
 +        {
 +            bDum = gmx_fio_ndo_int(fio, atomtypes->atomnumber, j);
 +        }
 +        if (file_version >= 60)
 +        {
 +            bDum = gmx_fio_ndo_real(fio, atomtypes->gb_radius, j);
 +            bDum = gmx_fio_ndo_real(fio, atomtypes->S_hct, j);
 +        }
 +    }
 +    else
 +    {
 +        /* File versions prior to 26 cannot do GBSA,
 +         * so they don't use this structure
 +         */
 +        atomtypes->nr         = 0;
 +        atomtypes->radius     = NULL;
 +        atomtypes->vol        = NULL;
 +        atomtypes->surftens   = NULL;
 +        atomtypes->atomnumber = NULL;
 +        atomtypes->gb_radius  = NULL;
 +        atomtypes->S_hct      = NULL;
 +    }
 +}
 +/*
 + * Read or write the symbol table: the string count followed by the
 + * strings themselves.
 + *
 + * Reading: a single symbuf of exactly nr entries is allocated and each
 + * string is read into a local buffer of STRLEN chars and duplicated
 + * with strdup() (the result of strdup() is not checked).
 + * Writing: the chain of symbufs is walked and at most nr strings are
 + * written; gmx_fatal() is called if fewer than nr strings were found.
 + */
 +static void do_symtab(t_fileio *fio, t_symtab *symtab, gmx_bool bRead)
 +{
 +    int       i, nr;
 +    t_symbuf *symbuf;
 +    char      buf[STRLEN];
 +
 +    gmx_fio_do_int(fio, symtab->nr);
 +    nr     = symtab->nr;
 +    if (bRead)
 +    {
 +        snew(symtab->symbuf, 1);
 +        symbuf          = symtab->symbuf;
 +        symbuf->bufsize = nr;
 +        snew(symbuf->buf, nr);
 +        for (i = 0; (i < nr); i++)
 +        {
 +            gmx_fio_do_string(fio, buf);
 +            symbuf->buf[i] = strdup(buf);
 +        }
 +    }
 +    else
 +    {
 +        symbuf = symtab->symbuf;
 +        while (symbuf != NULL)
 +        {
 +            for (i = 0; (i < symbuf->bufsize) && (i < nr); i++)
 +            {
 +                gmx_fio_do_string(fio, symbuf->buf[i]);
 +            }
 +            nr    -= i; /* nr counts the strings that still have to be written */
 +            symbuf = symbuf->next;
 +        }
 +        if (nr != 0)
 +        {
 +            gmx_fatal(FARGS, "nr of symtab strings left: %d", nr);
 +        }
 +    }
 +}
 +/*
 + * Read or write the CMAP grids.
 + *
 + * Order of items: ngrid, grid_spacing, then for every grid
 + * grid_spacing*grid_spacing elements of 4 reals each.
 + * When bRead is TRUE the cmapdata array and each grid's cmap array
 + * (4*grid_spacing*grid_spacing reals) are allocated here.
 + */
 +static void do_cmap(t_fileio *fio, gmx_cmap_t *cmap_grid, gmx_bool bRead)
 +{
 +    int i, j, ngrid, gs, nelem;
 +
 +    gmx_fio_do_int(fio, cmap_grid->ngrid);
 +    gmx_fio_do_int(fio, cmap_grid->grid_spacing);
 +
 +    ngrid = cmap_grid->ngrid;
 +    gs    = cmap_grid->grid_spacing;
 +    nelem = gs * gs;
 +
 +    if (bRead)
 +    {
 +        snew(cmap_grid->cmapdata, ngrid);
 +
 +        for (i = 0; i < cmap_grid->ngrid; i++)
 +        {
 +            snew(cmap_grid->cmapdata[i].cmap, 4*nelem);
 +        }
 +    }
 +
 +    for (i = 0; i < cmap_grid->ngrid; i++)
 +    {
 +        for (j = 0; j < nelem; j++)
 +        {
 +            gmx_fio_do_real(fio, cmap_grid->cmapdata[i].cmap[j*4]);
 +            gmx_fio_do_real(fio, cmap_grid->cmapdata[i].cmap[j*4+1]);
 +            gmx_fio_do_real(fio, cmap_grid->cmapdata[i].cmap[j*4+2]);
 +            gmx_fio_do_real(fio, cmap_grid->cmapdata[i].cmap[j*4+3]);
 +        }
 +    }
 +}
 +
 +/*
 + * Assign chain numbers and chain id characters to the residues in atoms,
 + * one chain per molecule in mols.
 + *
 + * Molecule m covers atoms mols->index[m] .. mols->index[m+1]-1 and gets
 + * chain number m. Molecules with at least CHAIN_MIN_ATOMS atoms receive
 + * the letters 'A', 'B', ... 'Z' in order of appearance; smaller molecules,
 + * and large ones once the letters are used up, receive ' '.
 + * If exactly one letter was handed out, all chain ids are reset to ' '.
 + */
 +void tpx_make_chain_identifiers(t_atoms *atoms, t_block *mols)
 +{
 +    int  m, a, a0, a1, r;
 +    char c, chainid;
 +    int  chainnum;
 +
 +    /* We always assign a new chain number, but save the chain id characters
 +     * for larger molecules.
 +     */
 +#define CHAIN_MIN_ATOMS 15
 +
 +    chainnum = 0;
 +    chainid  = 'A';
 +    for (m = 0; m < mols->nr; m++)
 +    {
 +        a0 = mols->index[m];
 +        a1 = mols->index[m+1];
 +        if ((a1-a0 >= CHAIN_MIN_ATOMS) && (chainid <= 'Z'))
 +        {
 +            c = chainid;
 +            chainid++;
 +        }
 +        else
 +        {
 +            c = ' ';
 +        }
 +        for (a = a0; a < a1; a++)
 +        {
 +            atoms->resinfo[atoms->atom[a].resind].chainnum = chainnum;
 +            atoms->resinfo[atoms->atom[a].resind].chainid  = c;
 +        }
 +        chainnum++;
 +    }
 +
 +    /* Blank out the chain id if there was only one chain */
 +    if (chainid == 'B') /* 'B' means only 'A' was assigned */
 +    {
 +        for (r = 0; r < atoms->nres; r++)
 +        {
 +            atoms->resinfo[r].chainid = ' ';
 +        }
 +    }
 +}
 +/*
 + * Read or write one molecule type.
 + *
 + * Order of items: the name (file_version >= 57 only), the atoms via
 + * do_atoms(), the interaction lists and the cgs block
 + * (both file_version >= 57 only) and finally the exclusions, which are
 + * processed for every file version.
 + */
 +static void do_moltype(t_fileio *fio, gmx_moltype_t *molt, gmx_bool bRead,
 +                       t_symtab *symtab, int file_version,
 +                       gmx_groups_t *groups)
 +{
 +    int i;
 +
 +    if (file_version >= 57)
 +    {
 +        do_symstr(fio, &(molt->name), bRead, symtab);
 +    }
 +
 +    do_atoms(fio, &molt->atoms, bRead, symtab, file_version, groups);
 +
 +    if (bRead && gmx_debug_at)
 +    {
 +        pr_atoms(debug, 0, "atoms", &molt->atoms, TRUE);
 +    }
 +
 +    if (file_version >= 57)
 +    {
 +        do_ilists(fio, molt->ilist, bRead, file_version);
 +
 +        do_block(fio, &molt->cgs, bRead, file_version);
 +        if (bRead && gmx_debug_at)
 +        {
 +            pr_block(debug, 0, "cgs", &molt->cgs, TRUE);
 +        }
 +    }
 +
 +    /* This used to be in the atoms struct */
 +    do_blocka(fio, &molt->excls, bRead, file_version);
 +}
 +/*
 + * Read or write one molecule block.
 + *
 + * Order of items: type, nmol, natoms_mol, nposres_xA followed by that
 + * many rvecs, nposres_xB followed by that many rvecs. The coordinate
 + * arrays are only processed (and, when bRead is TRUE, allocated) when
 + * their count is larger than zero.
 + *
 + * The file_version argument is not used in this function.
 + */
 +static void do_molblock(t_fileio *fio, gmx_molblock_t *molb, gmx_bool bRead,
 +                        int file_version)
 +{
 +    int i;
 +
 +    gmx_fio_do_int(fio, molb->type);
 +    gmx_fio_do_int(fio, molb->nmol);
 +    gmx_fio_do_int(fio, molb->natoms_mol);
 +    /* Position restraint coordinates */
 +    gmx_fio_do_int(fio, molb->nposres_xA);
 +    if (molb->nposres_xA > 0)
 +    {
 +        if (bRead)
 +        {
 +            snew(molb->posres_xA, molb->nposres_xA);
 +        }
 +        gmx_fio_ndo_rvec(fio, molb->posres_xA, molb->nposres_xA);
 +    }
 +    gmx_fio_do_int(fio, molb->nposres_xB);
 +    if (molb->nposres_xB > 0)
 +    {
 +        if (bRead)
 +        {
 +            snew(molb->posres_xB, molb->nposres_xB);
 +        }
 +        gmx_fio_ndo_rvec(fio, molb->posres_xB, molb->nposres_xB);
 +    }
 +
 +}
 +/*
 + * Build the molecule index block for a topology from its molecule blocks.
 + *
 + * The result has one entry per molecule instance (the sum of nmol over
 + * all molblocks). index[m] is the running sum of natoms_mol over the
 + * molecules before m, so index[0] is 0 and index[nr] is the total
 + * number of atoms covered by the molblocks.
 + *
 + * The index array is allocated here with snew() and returned by value
 + * inside the t_block; this function does not free it.
 + */
 +static t_block mtop_mols(gmx_mtop_t *mtop)
 +{
 +    int     mb, m, a, mol;
 +    t_block mols;
 +
 +    mols.nr = 0;
 +    for (mb = 0; mb < mtop->nmolblock; mb++)
 +    {
 +        mols.nr += mtop->molblock[mb].nmol;
 +    }
 +    mols.nalloc_index = mols.nr + 1;
 +    snew(mols.index, mols.nalloc_index);
 +
 +    a             = 0;
 +    m             = 0;
 +    mols.index[m] = a;
 +    for (mb = 0; mb < mtop->nmolblock; mb++)
 +    {
 +        for (mol = 0; mol < mtop->molblock[mb].nmol; mol++)
 +        {
 +            a += mtop->molblock[mb].natoms_mol;
 +            m++;
 +            mols.index[m] = a;
 +        }
 +    }
 +
 +    return mols;
 +}
 +/*
 + * Copy position restraint reference coordinates from the interaction
 + * parameters into the coordinate arrays of the first molecule block.
 + *
 + * Only moltype[0] and molblock[0] are considered. Nothing is done when
 + * both the F_POSRES and the F_FBPOSRES list of moltype[0] are empty.
 + * Otherwise posres_xA is allocated up to the end of the molecule (per
 + * mtop->mols) that contains the highest restrained atom index, and the
 + * reference positions are stored at the index of their atom.
 + * posres_xB is only allocated and filled when at least one F_POSRES
 + * entry has a B-state position that differs from its A-state position.
 + * The F_FBPOSRES entries are only used when the F_POSRES list is empty.
 + */
 +static void add_posres_molblock(gmx_mtop_t *mtop)
 +{
 +    t_ilist        *il, *ilfb;
 +    int             am, i, mol, a;
 +    gmx_bool        bFE;
 +    gmx_molblock_t *molb;
 +    t_iparams      *ip;
 +
 +    /* posres reference positions are stored in ip->posres (if present) and
 +       in ip->fbposres (if present). If normal and flat-bottomed posres are present,
 +       posres.pos0A are identical to fbposres.pos0. */
 +    il   = &mtop->moltype[0].ilist[F_POSRES];
 +    ilfb = &mtop->moltype[0].ilist[F_FBPOSRES];
 +    if (il->nr == 0 && ilfb->nr == 0)
 +    {
 +        return;
 +    }
 +    am  = 0;
 +    bFE = FALSE;
 +    /* Entries are pairs: iatoms[i] indexes iparams, iatoms[i+1] is the atom */
 +    for (i = 0; i < il->nr; i += 2)
 +    {
 +        ip = &mtop->ffparams.iparams[il->iatoms[i]];
 +        am = max(am, il->iatoms[i+1]);
 +        if (ip->posres.pos0B[XX] != ip->posres.pos0A[XX] ||
 +            ip->posres.pos0B[YY] != ip->posres.pos0A[YY] ||
 +            ip->posres.pos0B[ZZ] != ip->posres.pos0A[ZZ])
 +        {
 +            bFE = TRUE;
 +        }
 +    }
 +    /* This loop is required if we have only flat-bottomed posres:
 +       - set am
 +       - bFE == FALSE (no B-state for flat-bottomed posres) */
 +    if (il->nr == 0)
 +    {
 +        for (i = 0; i < ilfb->nr; i += 2)
 +        {
 +            ip = &mtop->ffparams.iparams[ilfb->iatoms[i]]; /* ip is not used in this loop */
 +            am = max(am, ilfb->iatoms[i+1]);
 +        }
 +    }
 +    /* Make the posres coordinate block end at a molecule end */
 +    mol = 0;
 +    while (am >= mtop->mols.index[mol+1])
 +    {
 +        mol++;
 +    }
 +    molb             = &mtop->molblock[0];
 +    molb->nposres_xA = mtop->mols.index[mol+1];
 +    snew(molb->posres_xA, molb->nposres_xA);
 +    if (bFE)
 +    {
 +        molb->nposres_xB = molb->nposres_xA;
 +        snew(molb->posres_xB, molb->nposres_xB);
 +    }
 +    else
 +    {
 +        molb->nposres_xB = 0;
 +    }
 +    for (i = 0; i < il->nr; i += 2)
 +    {
 +        ip                     = &mtop->ffparams.iparams[il->iatoms[i]];
 +        a                      = il->iatoms[i+1];
 +        molb->posres_xA[a][XX] = ip->posres.pos0A[XX];
 +        molb->posres_xA[a][YY] = ip->posres.pos0A[YY];
 +        molb->posres_xA[a][ZZ] = ip->posres.pos0A[ZZ];
 +        if (bFE)
 +        {
 +            molb->posres_xB[a][XX] = ip->posres.pos0B[XX];
 +            molb->posres_xB[a][YY] = ip->posres.pos0B[YY];
 +            molb->posres_xB[a][ZZ] = ip->posres.pos0B[ZZ];
 +        }
 +    }
 +    if (il->nr == 0)
 +    {
 +        /* If only flat-bottomed posres are present, take reference pos from them.
 +           Here: bFE == FALSE      */
 +        for (i = 0; i < ilfb->nr; i += 2)
 +        {
 +            ip                     = &mtop->ffparams.iparams[ilfb->iatoms[i]];
 +            a                      = ilfb->iatoms[i+1];
 +            molb->posres_xA[a][XX] = ip->fbposres.pos0[XX];
 +            molb->posres_xA[a][YY] = ip->fbposres.pos0[YY];
 +            molb->posres_xA[a][ZZ] = ip->fbposres.pos0[ZZ];
 +        }
 +    }
 +}
 +/*
 + * Fill in disres.npair for the distance restraint parameters.
 + *
 + * For every molecule type the F_DISRES list is scanned in steps of 3
 + * ints, the first of which indexes the parameter array. Consecutive
 + * entries with the same disres.label form one run; the length of the
 + * run is stored in disres.npair of the parameters of its last entry.
 + * Parameters of the other entries in a run are not modified.
 + */
 +static void set_disres_npair(gmx_mtop_t *mtop)
 +{
 +    int        mt, i, npair;
 +    t_iparams *ip;
 +    t_ilist   *il;
 +    t_iatom   *a;
 +
 +    ip = mtop->ffparams.iparams;
 +
 +    for (mt = 0; mt < mtop->nmoltype; mt++)
 +    {
 +        il = &mtop->moltype[mt].ilist[F_DISRES];
 +        if (il->nr > 0)
 +        {
 +            a     = il->iatoms;
 +            npair = 0;
 +            for (i = 0; i < il->nr; i += 3)
 +            {
 +                npair++;
 +                /* The first test keeps a[i+3] from being read past the end */
 +                if (i+3 == il->nr || ip[a[i]].disres.label != ip[a[i+3]].disres.label)
 +                {
 +                    ip[a[i]].disres.npair = npair;
 +                    npair                 = 0;
 +                }
 +            }
 +        }
 +    }
 +}
 +/*
 + * Read or write a complete molecular topology.
 + *
 + * Order of items: symbol table, topology name, then
 + *  - file_version >= 57: force field parameters and nmoltype
 + *    (older: nmoltype is set to 1);
 + *  - all molecule types via do_moltype();
 + *  - file_version >= 57: nmolblock, all molecule blocks and natoms
 + *    (older: a single block describing one copy of moltype[0]);
 + *  - atom types;
 + *  - file_version < 57: force field parameters and interaction lists of
 + *    moltype[0] via do_idef(); natoms is taken from moltype[0];
 + *  - file_version >= 65: CMAP grids (older: cleared);
 + *  - file_version >= 57: groups;
 + *  - file_version < 57: cgs block of moltype[0], the mols block, after
 + *    which add_posres_molblock() is called;
 + *  - file_version < 51: one legacy t_blocka that is discarded.
 + *
 + * When bRead is TRUE the topology is initialised with init_mtop() first,
 + * for file_version >= 57 mtop->mols is built with mtop_mols(), and
 + * close_symtab() is called at the end.
 + */
 +static void do_mtop(t_fileio *fio, gmx_mtop_t *mtop, gmx_bool bRead,
 +                    int file_version)
 +{
 +    int      mt, mb, i;
 +    t_blocka dumb;
 +
 +    if (bRead)
 +    {
 +        init_mtop(mtop);
 +    }
 +    do_symtab(fio, &(mtop->symtab), bRead);
 +    if (bRead && debug)
 +    {
 +        pr_symtab(debug, 0, "symtab", &mtop->symtab);
 +    }
 +
 +    do_symstr(fio, &(mtop->name), bRead, &(mtop->symtab));
 +
 +    if (file_version >= 57)
 +    {
 +        do_ffparams(fio, &mtop->ffparams, bRead, file_version);
 +
 +        gmx_fio_do_int(fio, mtop->nmoltype);
 +    }
 +    else
 +    {
 +        mtop->nmoltype = 1;
 +    }
 +    if (bRead)
 +    {
 +        snew(mtop->moltype, mtop->nmoltype);
 +        if (file_version < 57)
 +        {
 +            /* Old files have no molecule type name: reuse the topology name */
 +            mtop->moltype[0].name = mtop->name;
 +        }
 +    }
 +    for (mt = 0; mt < mtop->nmoltype; mt++)
 +    {
 +        do_moltype(fio, &mtop->moltype[mt], bRead, &mtop->symtab, file_version,
 +                   &mtop->groups);
 +    }
 +
 +    if (file_version >= 57)
 +    {
 +        gmx_fio_do_int(fio, mtop->nmolblock);
 +    }
 +    else
 +    {
 +        mtop->nmolblock = 1;
 +    }
 +    if (bRead)
 +    {
 +        snew(mtop->molblock, mtop->nmolblock);
 +    }
 +    if (file_version >= 57)
 +    {
 +        for (mb = 0; mb < mtop->nmolblock; mb++)
 +        {
 +            do_molblock(fio, &mtop->molblock[mb], bRead, file_version);
 +        }
 +        gmx_fio_do_int(fio, mtop->natoms);
 +    }
 +    else
 +    {
 +        mtop->molblock[0].type       = 0;
 +        mtop->molblock[0].nmol       = 1;
 +        mtop->molblock[0].natoms_mol = mtop->moltype[0].atoms.nr;
 +        mtop->molblock[0].nposres_xA = 0;
 +        mtop->molblock[0].nposres_xB = 0;
 +    }
 +
 +    do_atomtypes (fio, &(mtop->atomtypes), bRead, &(mtop->symtab), file_version);
 +    if (bRead && debug)
 +    {
 +        pr_atomtypes(debug, 0, "atomtypes", &mtop->atomtypes, TRUE);
 +    }
 +
 +    if (file_version < 57)
 +    {
 +        /* Debug statements are inside do_idef */
 +        do_idef (fio, &mtop->ffparams, &mtop->moltype[0], bRead, file_version);
 +        mtop->natoms = mtop->moltype[0].atoms.nr;
 +    }
 +
 +    if (file_version >= 65)
 +    {
 +        do_cmap(fio, &mtop->ffparams.cmap_grid, bRead);
 +    }
 +    else
 +    {
 +        mtop->ffparams.cmap_grid.ngrid        = 0;
 +        mtop->ffparams.cmap_grid.grid_spacing = 0;
 +        mtop->ffparams.cmap_grid.cmapdata     = NULL;
 +    }
 +
 +    if (file_version >= 57)
 +    {
 +        do_groups(fio, &mtop->groups, bRead, &(mtop->symtab), file_version);
 +    }
 +
 +    if (file_version < 57)
 +    {
 +        do_block(fio, &mtop->moltype[0].cgs, bRead, file_version);
 +        if (bRead && gmx_debug_at)
 +        {
 +            pr_block(debug, 0, "cgs", &mtop->moltype[0].cgs, TRUE);
 +        }
 +        do_block(fio, &mtop->mols, bRead, file_version);
 +        /* Add the posres coordinates to the molblock */
 +        add_posres_molblock(mtop);
 +    }
 +    if (bRead)
 +    {
 +        if (file_version >= 57)
 +        {
 +            mtop->mols = mtop_mols(mtop);
 +        }
 +        if (gmx_debug_at)
 +        {
 +            pr_block(debug, 0, "mols", &mtop->mols, TRUE);
 +        }
 +    }
 +
 +    if (file_version < 51)
 +    {
 +        /* Here used to be the shake blocks */
 +        do_blocka(fio, &dumb, bRead, file_version);
 +        /* NOTE(review): do_blocka() allocates index with nr+1 elements when
 +         * reading, so with nr == 0 the one-element array is not freed here.
 +         */
 +        if (dumb.nr > 0)
 +        {
 +            sfree(dumb.index);
 +        }
 +        if (dumb.nra > 0)
 +        {
 +            sfree(dumb.a);
 +        }
 +    }
 +
 +    if (bRead)
 +    {
 +        close_symtab(&(mtop->symtab));
 +    }
 +}
 +
 +/* If TopOnlyOK is TRUE then we can read even future versions
 + * of tpx files, provided the file_generation hasn't changed.
 + * If it is FALSE, we need the inputrecord too, and bail out
 + * if the file is newer than the program.
 + *
 + * The version and generation if the topology (see top of this file)
 + * are returned in the two last arguments.
 + *
 + * If possible, we will read the inputrec even when TopOnlyOK is TRUE.
 + */
 +static void do_tpxheader(t_fileio *fio, gmx_bool bRead, t_tpxheader *tpx,
 +                         gmx_bool TopOnlyOK, int *file_version,
 +                         int *file_generation)
 +{
 +    char      buf[STRLEN];
 +    char      file_tag[STRLEN];
 +    gmx_bool  bDouble;
 +    int       precision;
 +    int       fver, fgen;
 +    int       idum = 0;
 +    real      rdum = 0;
 +
 +    gmx_fio_checktype(fio);
 +    gmx_fio_setdebug(fio, bDebugMode());
 +
 +    /* NEW! XDR tpb file */
 +    precision = sizeof(real);
 +    if (bRead)
 +    {
 +        gmx_fio_do_string(fio, buf);
 +        if (strncmp(buf, "VERSION", 7))
 +        {
 +            gmx_fatal(FARGS, "Can not read file %s,\n"
 +                      "             this file is from a Gromacs version which is older than 2.0\n"
 +                      "             Make a new one with grompp or use a gro or pdb file, if possible",
 +                      gmx_fio_getname(fio));
 +        }
 +        gmx_fio_do_int(fio, precision);
 +        bDouble = (precision == sizeof(double));
 +        if ((precision != sizeof(float)) && !bDouble)
 +        {
 +            gmx_fatal(FARGS, "Unknown precision in file %s: real is %d bytes "
 +                      "instead of %d or %d",
 +                      gmx_fio_getname(fio), precision, sizeof(float), sizeof(double));
 +        }
 +        gmx_fio_setprecision(fio, bDouble);
 +        fprintf(stderr, "Reading file %s, %s (%s precision)\n",
 +                gmx_fio_getname(fio), buf, bDouble ? "double" : "single");
 +    }
 +    else
 +    {
 +        gmx_fio_write_string(fio, GromacsVersion());
 +        bDouble = (precision == sizeof(double));
 +        gmx_fio_setprecision(fio, bDouble);
 +        gmx_fio_do_int(fio, precision);
 +        fver = tpx_version;
 +        sprintf(file_tag, "%s", tpx_tag);
 +        fgen = tpx_generation;
 +    }
 +
 +    /* Check versions! */
 +    gmx_fio_do_int(fio, fver);
 +
 +    /* This is for backward compatibility with development versions 77-79
 +     * where the tag was, mistakenly, placed before the generation,
 +     * which would cause a segv instead of a proper error message
 +     * when reading the topology only from tpx with <77 code.
 +     */
 +    if (fver >= 77 && fver <= 79)
 +    {
 +        gmx_fio_do_string(fio, file_tag);
 +    }
 +
 +    if (fver >= 26)
 +    {
 +        gmx_fio_do_int(fio, fgen);
 +    }
 +    else
 +    {
 +        fgen = 0;
 +    }
 +
 +    if (fver >= 81)
 +    {
 +        gmx_fio_do_string(fio, file_tag);
 +    }
 +    if (bRead)
 +    {
 +        if (fver < 77)
 +        {
 +            /* Versions before 77 don't have the tag, set it to release */
 +            sprintf(file_tag, "%s", TPX_TAG_RELEASE);
 +        }
 +
 +        if (strcmp(file_tag, tpx_tag) != 0)
 +        {
 +            fprintf(stderr, "Note: file tpx tag '%s', software tpx tag '%s'\n",
 +                    file_tag, tpx_tag);
 +
 +            /* We only support reading tpx files with the same tag as the code
 +             * or tpx files with the release tag and with lower version number.
 +             */
 +            if (!strcmp(file_tag, TPX_TAG_RELEASE) == 0 && fver < tpx_version)
 +            {
 +                gmx_fatal(FARGS, "tpx tag/version mismatch: reading tpx file (%s) version %d, tag '%s' with program for tpx version %d, tag '%s'",
 +                          gmx_fio_getname(fio), fver, file_tag,
 +                          tpx_version, tpx_tag);
 +            }
 +        }
 +    }
 +
 +    if (file_version != NULL)
 +    {
 +        *file_version = fver;
 +    }
 +    if (file_generation != NULL)
 +    {
 +        *file_generation = fgen;
 +    }
 +
 +
 +    if ((fver <= tpx_incompatible_version) ||
 +        ((fver > tpx_version) && !TopOnlyOK) ||
 +        (fgen > tpx_generation))
 +    {
 +        gmx_fatal(FARGS, "reading tpx file (%s) version %d with version %d program",
 +                  gmx_fio_getname(fio), fver, tpx_version);
 +    }
 +
 +    do_section(fio, eitemHEADER, bRead);
 +    gmx_fio_do_int(fio, tpx->natoms);
 +    if (fver >= 28)
 +    {
 +        gmx_fio_do_int(fio, tpx->ngtc);
 +    }
 +    else
 +    {
 +        tpx->ngtc = 0;
 +    }
 +    if (fver < 62)
 +    {
 +        gmx_fio_do_int(fio, idum);
 +        gmx_fio_do_real(fio, rdum);
 +    }
 +    /*a better decision will eventually (5.0 or later) need to be made
 +       on how to treat the alchemical state of the system, which can now
 +       vary through a simulation, and cannot be completely described
 +       though a single lambda variable, or even a single state
 +       index. Eventually, should probably be a vector. MRS*/
 +    if (fver >= 79)
 +    {
 +        gmx_fio_do_int(fio, tpx->fep_state);
 +    }
 +    gmx_fio_do_real(fio, tpx->lambda);
 +    gmx_fio_do_int(fio, tpx->bIr);
 +    gmx_fio_do_int(fio, tpx->bTop);
 +    gmx_fio_do_int(fio, tpx->bX);
 +    gmx_fio_do_int(fio, tpx->bV);
 +    gmx_fio_do_int(fio, tpx->bF);
 +    gmx_fio_do_int(fio, tpx->bBox);
 +
 +    if ((fgen > tpx_generation))
 +    {
 +        /* This can only happen if TopOnlyOK=TRUE */
 +        tpx->bIr = FALSE;
 +    }
 +}
 +
 +static int do_tpx(t_fileio *fio, gmx_bool bRead,
 +                  t_inputrec *ir, t_state *state, rvec *f, gmx_mtop_t *mtop,
 +                  gmx_bool bXVallocated)
 +{
 +    t_tpxheader     tpx;
 +    t_inputrec      dum_ir;
 +    gmx_mtop_t      dum_top;
 +    gmx_bool        TopOnlyOK, bDum = TRUE;
 +    int             file_version, file_generation;
 +    int             i;
 +    rvec           *xptr, *vptr;
 +    int             ePBC;
 +    gmx_bool        bPeriodicMols;
 +
 +    if (!bRead)
 +    {
 +        tpx.natoms    = state->natoms;
 +        tpx.ngtc      = state->ngtc; /* need to add nnhpres here? */
 +        tpx.fep_state = state->fep_state;
 +        tpx.lambda    = state->lambda[efptFEP];
 +        tpx.bIr       = (ir       != NULL);
 +        tpx.bTop      = (mtop     != NULL);
 +        tpx.bX        = (state->x != NULL);
 +        tpx.bV        = (state->v != NULL);
 +        tpx.bF        = (f        != NULL);
 +        tpx.bBox      = TRUE;
 +    }
 +
 +    TopOnlyOK = (ir == NULL);
 +
 +    do_tpxheader(fio, bRead, &tpx, TopOnlyOK, &file_version, &file_generation);
 +
 +    if (bRead)
 +    {
 +        state->flags  = 0;
 +        /* state->lambda = tpx.lambda;*/ /*remove this eventually? */
 +        /* The init_state calls initialize the Nose-Hoover xi integrals to zero */
 +        if (bXVallocated)
 +        {
 +            xptr = state->x;
 +            vptr = state->v;
 +            init_state(state, 0, tpx.ngtc, 0, 0, 0); /* nose-hoover chains */ /* eventually, need to add nnhpres here? */
 +            state->natoms = tpx.natoms;
 +            state->nalloc = tpx.natoms;
 +            state->x      = xptr;
 +            state->v      = vptr;
 +        }
 +        else
 +        {
 +            init_state(state, tpx.natoms, tpx.ngtc, 0, 0, 0); /* nose-hoover chains */
 +        }
 +    }
 +
 +#define do_test(fio, b, p) if (bRead && (p != NULL) && !b) gmx_fatal(FARGS, "No %s in %s",#p, gmx_fio_getname(fio))
 +
 +    do_test(fio, tpx.bBox, state->box);
 +    do_section(fio, eitemBOX, bRead);
 +    if (tpx.bBox)
 +    {
 +        gmx_fio_ndo_rvec(fio, state->box, DIM);
 +        if (file_version >= 51)
 +        {
 +            gmx_fio_ndo_rvec(fio, state->box_rel, DIM);
 +        }
 +        else
 +        {
 +            /* We initialize box_rel after reading the inputrec */
 +            clear_mat(state->box_rel);
 +        }
 +        if (file_version >= 28)
 +        {
 +            gmx_fio_ndo_rvec(fio, state->boxv, DIM);
 +            if (file_version < 56)
 +            {
 +                matrix mdum;
 +                gmx_fio_ndo_rvec(fio, mdum, DIM);
 +            }
 +        }
 +    }
 +
 +    if (state->ngtc > 0 && file_version >= 28)
 +    {
 +        real *dumv;
 +        /*ndo_double(state->nosehoover_xi,state->ngtc,bDum);*/
 +        /*ndo_double(state->nosehoover_vxi,state->ngtc,bDum);*/
 +        /*ndo_double(state->therm_integral,state->ngtc,bDum);*/
 +        snew(dumv, state->ngtc);
 +        if (file_version < 69)
 +        {
 +            bDum = gmx_fio_ndo_real(fio, dumv, state->ngtc);
 +        }
 +        /* These used to be the Berendsen tcoupl_lambda's */
 +        bDum = gmx_fio_ndo_real(fio, dumv, state->ngtc);
 +        sfree(dumv);
 +    }
 +
 +    /* Prior to tpx version 26, the inputrec was here.
 +     * I moved it to enable partial forward-compatibility
 +     * for analysis/viewer programs.
 +     */
 +    if (file_version < 26)
 +    {
 +        do_test(fio, tpx.bIr, ir);
 +        do_section(fio, eitemIR, bRead);
 +        if (tpx.bIr)
 +        {
 +            if (ir)
 +            {
 +                do_inputrec(fio, ir, bRead, file_version,
 +                            mtop ? &mtop->ffparams.fudgeQQ : NULL);
 +                if (bRead && debug)
 +                {
 +                    pr_inputrec(debug, 0, "inputrec", ir, FALSE);
 +                }
 +            }
 +            else
 +            {
 +                do_inputrec(fio, &dum_ir, bRead, file_version,
 +                            mtop ? &mtop->ffparams.fudgeQQ : NULL);
 +                if (bRead && debug)
 +                {
 +                    pr_inputrec(debug, 0, "inputrec", &dum_ir, FALSE);
 +                }
 +                done_inputrec(&dum_ir);
 +            }
 +
 +        }
 +    }
 +
 +    do_test(fio, tpx.bTop, mtop);
 +    do_section(fio, eitemTOP, bRead);
 +    if (tpx.bTop)
 +    {
 +        int mtop_file_version = file_version;
 +        /*allow reading of Gromacs 4.6 files*/
 +        if (mtop_file_version > 80 && mtop_file_version < 90)
 +        {
 +            mtop_file_version = 79;
 +        }
 +        if (mtop)
 +        {
 +            do_mtop(fio, mtop, bRead, mtop_file_version);
 +        }
 +        else
 +        {
 +            do_mtop(fio, &dum_top, bRead, mtop_file_version);
 +            done_mtop(&dum_top, TRUE);
 +        }
 +    }
 +    do_test(fio, tpx.bX, state->x);
 +    do_section(fio, eitemX, bRead);
 +    if (tpx.bX)
 +    {
 +        if (bRead)
 +        {
 +            state->flags |= (1<<estX);
 +        }
 +        gmx_fio_ndo_rvec(fio, state->x, state->natoms);
 +    }
 +
 +    do_test(fio, tpx.bV, state->v);
 +    do_section(fio, eitemV, bRead);
 +    if (tpx.bV)
 +    {
 +        if (bRead)
 +        {
 +            state->flags |= (1<<estV);
 +        }
 +        gmx_fio_ndo_rvec(fio, state->v, state->natoms);
 +    }
 +
 +    do_test(fio, tpx.bF, f);
 +    do_section(fio, eitemF, bRead);
 +    if (tpx.bF)
 +    {
 +        gmx_fio_ndo_rvec(fio, f, state->natoms);
 +    }
 +
 +    /* Starting with tpx version 26, we have the inputrec
 +     * at the end of the file, so we can ignore it
 +     * if the file is never than the software (but still the
 +     * same generation - see comments at the top of this file.
 +     *
 +     *
 +     */
 +    ePBC          = -1;
 +    bPeriodicMols = FALSE;
 +    if (file_version >= 26)
 +    {
 +        do_test(fio, tpx.bIr, ir);
 +        do_section(fio, eitemIR, bRead);
 +        if (tpx.bIr)
 +        {
 +            if (file_version >= 53)
 +            {
 +                /* Removed the pbc info from do_inputrec, since we always want it */
 +                if (!bRead)
 +                {
 +                    ePBC          = ir->ePBC;
 +                    bPeriodicMols = ir->bPeriodicMols;
 +                }
 +                gmx_fio_do_int(fio, ePBC);
 +                gmx_fio_do_gmx_bool(fio, bPeriodicMols);
 +            }
 +            if (file_generation <= tpx_generation && ir)
 +            {
 +                do_inputrec(fio, ir, bRead, file_version, mtop ? &mtop->ffparams.fudgeQQ : NULL);
 +                if (bRead && debug)
 +                {
 +                    pr_inputrec(debug, 0, "inputrec", ir, FALSE);
 +                }
 +                if (file_version < 51)
 +                {
 +                    set_box_rel(ir, state);
 +                }
 +                if (file_version < 53)
 +                {
 +                    ePBC          = ir->ePBC;
 +                    bPeriodicMols = ir->bPeriodicMols;
 +                }
 +            }
 +            if (bRead && ir && file_version >= 53)
 +            {
 +                /* We need to do this after do_inputrec, since that initializes ir */
 +                ir->ePBC          = ePBC;
 +                ir->bPeriodicMols = bPeriodicMols;
 +            }
 +        }
 +    }
 +
 +    if (bRead)
 +    {
 +        if (tpx.bIr && ir)
 +        {
 +            if (state->ngtc == 0)
 +            {
 +                /* Reading old version without tcoupl state data: set it */
 +                init_gtc_state(state, ir->opts.ngtc, 0, ir->opts.nhchainlength);
 +            }
 +            if (tpx.bTop && mtop)
 +            {
 +                if (file_version < 57)
 +                {
 +                    if (mtop->moltype[0].ilist[F_DISRES].nr > 0)
 +                    {
 +                        ir->eDisre = edrSimple;
 +                    }
 +                    else
 +                    {
 +                        ir->eDisre = edrNone;
 +                    }
 +                }
 +                set_disres_npair(mtop);
 +            }
 +        }
 +
 +        if (tpx.bTop && mtop)
 +        {
 +            gmx_mtop_finalize(mtop);
 +        }
 +
 +        if (file_version >= 57)
 +        {
 +            char *env;
 +            int   ienv;
 +            env = getenv("GMX_NOCHARGEGROUPS");
 +            if (env != NULL)
 +            {
 +                sscanf(env, "%d", &ienv);
 +                fprintf(stderr, "\nFound env.var. GMX_NOCHARGEGROUPS = %d\n",
 +                        ienv);
 +                if (ienv > 0)
 +                {
 +                    fprintf(stderr,
 +                            "Will make single atomic charge groups in non-solvent%s\n",
 +                            ienv > 1 ? " and solvent" : "");
 +                    gmx_mtop_make_atomic_charge_groups(mtop, ienv == 1);
 +                }
 +                fprintf(stderr, "\n");
 +            }
 +        }
 +    }
 +
 +    return ePBC;
 +}
 +
 +/************************************************************
 + *
 + *  The following routines are the exported ones
 + *
 + ************************************************************/
 +
 +/* Open the tpx file fn by passing fn and mode straight to gmx_fio_open
 + * and return the handle that call yields.
 + */
 +t_fileio *open_tpx(const char *fn, const char *mode)
 +{
 +    return gmx_fio_open(fn, mode);
 +}
 +
 +/* Close a handle obtained from open_tpx by forwarding it to gmx_fio_close.
 + * Any result of gmx_fio_close is not propagated to the caller.
 + */
 +void close_tpx(t_fileio *fio)
 +{
 +    gmx_fio_close(fio);
 +}
 +
 +void read_tpxheader(const char *fn, t_tpxheader *tpx, gmx_bool TopOnlyOK,
 +                    int *file_version, int *file_generation)
 +{
 +    t_fileio *fio;
 +
 +    fio = open_tpx(fn, "r");
 +    do_tpxheader(fio, TRUE, tpx, TopOnlyOK, file_version, file_generation);
 +    close_tpx(fio);
 +}
 +
 +void write_tpx_state(const char *fn,
 +                     t_inputrec *ir, t_state *state, gmx_mtop_t *mtop)
 +{
 +    t_fileio *fio;
 +
 +    fio = open_tpx(fn, "w");
 +    do_tpx(fio, FALSE, ir, state, NULL, mtop, FALSE);
 +    close_tpx(fio);
 +}
 +
 +void read_tpx_state(const char *fn,
 +                    t_inputrec *ir, t_state *state, rvec *f, gmx_mtop_t *mtop)
 +{
 +    t_fileio *fio;
 +
 +    fio = open_tpx(fn, "r");
 +    do_tpx(fio, TRUE, ir, state, f, mtop, FALSE);
 +    close_tpx(fio);
 +}
 +
 +int read_tpx(const char *fn,
 +             t_inputrec *ir, matrix box, int *natoms,
 +             rvec *x, rvec *v, rvec *f, gmx_mtop_t *mtop)
 +{
 +    t_fileio *fio;
 +    t_state   state;
 +    int       ePBC;
 +
 +    state.x = x;
 +    state.v = v;
 +    fio     = open_tpx(fn, "r");
 +    ePBC    = do_tpx(fio, TRUE, ir, &state, f, mtop, TRUE);
 +    close_tpx(fio);
 +    *natoms = state.natoms;
 +    if (box)
 +    {
 +        copy_mat(state.box, box);
 +    }
 +    state.x = NULL;
 +    state.v = NULL;
 +    done_state(&state);
 +
 +    return ePBC;
 +}
 +
 +int read_tpx_top(const char *fn,
 +                 t_inputrec *ir, matrix box, int *natoms,
 +                 rvec *x, rvec *v, rvec *f, t_topology *top)
 +{
 +    gmx_mtop_t  mtop;
 +    t_topology *ltop;
 +    int         ePBC;
 +
 +    ePBC = read_tpx(fn, ir, box, natoms, x, v, f, &mtop);
 +
 +    *top = gmx_mtop_t_to_t_topology(&mtop);
 +
 +    return ePBC;
 +}
 +
 +gmx_bool fn2bTPX(const char *file)
 +{
 +    switch (fn2ftp(file))
 +    {
 +        case efTPR:
 +        case efTPB:
 +        case efTPA:
 +            return TRUE;
 +        default:
 +            return FALSE;
 +    }
 +}
 +
 +gmx_bool read_tps_conf(const char *infile, char *title, t_topology *top, int *ePBC,
 +                       rvec **x, rvec **v, matrix box, gmx_bool bMass)
 +{
 +    t_tpxheader      header;
 +    int              natoms, i, version, generation;
 +    gmx_bool         bTop, bXNULL = FALSE;
 +    gmx_mtop_t      *mtop;
 +    t_topology      *topconv;
 +    gmx_atomprop_t   aps;
 +
 +    bTop  = fn2bTPX(infile);
 +    *ePBC = -1;
 +    if (bTop)
 +    {
 +        read_tpxheader(infile, &header, TRUE, &version, &generation);
 +        if (x)
 +        {
 +            snew(*x, header.natoms);
 +        }
 +        if (v)
 +        {
 +            snew(*v, header.natoms);
 +        }
 +        snew(mtop, 1);
 +        *ePBC = read_tpx(infile, NULL, box, &natoms,
 +                         (x == NULL) ? NULL : *x, (v == NULL) ? NULL : *v, NULL, mtop);
 +        *top = gmx_mtop_t_to_t_topology(mtop);
 +        sfree(mtop);
 +        strcpy(title, *top->name);
 +        tpx_make_chain_identifiers(&top->atoms, &top->mols);
 +    }
 +    else
 +    {
 +        get_stx_coordnum(infile, &natoms);
 +        init_t_atoms(&top->atoms, natoms, (fn2ftp(infile) == efPDB));
 +        if (x == NULL)
 +        {
 +            snew(x, 1);
 +            bXNULL = TRUE;
 +        }
 +        snew(*x, natoms);
 +        if (v)
 +        {
 +            snew(*v, natoms);
 +        }
 +        read_stx_conf(infile, title, &top->atoms, *x, (v == NULL) ? NULL : *v, ePBC, box);
 +        if (bXNULL)
 +        {
 +            sfree(*x);
 +            sfree(x);
 +        }
 +        if (bMass)
 +        {
 +            aps = gmx_atomprop_init();
 +            for (i = 0; (i < natoms); i++)
 +            {
 +                if (!gmx_atomprop_query(aps, epropMass,
 +                                        *top->atoms.resinfo[top->atoms.atom[i].resind].name,
 +                                        *top->atoms.atomname[i],
 +                                        &(top->atoms.atom[i].m)))
 +                {
 +                    if (debug)
 +                    {
 +                        fprintf(debug, "Can not find mass for atom %s %d %s, setting to 1\n",
 +                                *top->atoms.resinfo[top->atoms.atom[i].resind].name,
 +                                top->atoms.resinfo[top->atoms.atom[i].resind].nr,
 +                                *top->atoms.atomname[i]);
 +                    }
 +                }
 +            }
 +            gmx_atomprop_destroy(aps);
 +        }
 +        top->idef.ntypes = -1;
 +    }
 +
 +    return bTop;
 +}
index 323309366d8fa49f5ff71197bd9e896804d59313,0000000000000000000000000000000000000000..641e7dc5a160fc8eff8bb481936afe591ab1719a
mode 100644,000000..100644
--- /dev/null
@@@ -1,3867 -1,0 +1,3900 @@@
-             sprintf(warn_buf, "The sum of the two largest charge group radii (%f) is larger than rlist (%f)\n", max(rvdw1+rvdw2, rcoul1+rcoul2), ir->rlist);
 +/* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
 + *
 + *
 + *                This source code is part of
 + *
 + *                 G   R   O   M   A   C   S
 + *
 + *          GROningen MAchine for Chemical Simulations
 + *
 + *                        VERSION 3.2.0
 + * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
 + * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
 + * Copyright (c) 2001-2004, The GROMACS development team,
 + * check out http://www.gromacs.org for more information.
 +
 + * This program is free software; you can redistribute it and/or
 + * modify it under the terms of the GNU General Public License
 + * as published by the Free Software Foundation; either version 2
 + * of the License, or (at your option) any later version.
 + *
 + * If you want to redistribute modifications, please consider that
 + * scientific software is very special. Version control is crucial -
 + * bugs must be traceable. We will be happy to consider code for
 + * inclusion in the official distribution, but derived work must not
 + * be called official GROMACS. Details are found in the README & COPYING
 + * files - if they are missing, get the official version at www.gromacs.org.
 + *
 + * To help us fund GROMACS development, we humbly ask that you cite
 + * the papers on the package - you can find them in the top README file.
 + *
 + * For more info, check our website at http://www.gromacs.org
 + *
 + * And Hey:
 + * Gallium Rubidium Oxygen Manganese Argon Carbon Silicon
 + */
 +#ifdef HAVE_CONFIG_H
 +#include <config.h>
 +#endif
 +
 +#include <ctype.h>
 +#include <stdlib.h>
 +#include <limits.h>
 +#include "sysstuff.h"
 +#include "smalloc.h"
 +#include "typedefs.h"
 +#include "physics.h"
 +#include "names.h"
 +#include "gmx_fatal.h"
 +#include "macros.h"
 +#include "index.h"
 +#include "symtab.h"
 +#include "string2.h"
 +#include "readinp.h"
 +#include "warninp.h"
 +#include "readir.h"
 +#include "toputil.h"
 +#include "index.h"
 +#include "network.h"
 +#include "vec.h"
 +#include "pbc.h"
 +#include "mtop_util.h"
 +#include "chargegroup.h"
 +#include "inputrec.h"
 +
 +#define MAXPTR 254
 +#define NOGID  255
 +#define MAXLAMBDAS 1024
 +
 +/* Resource parameters
 + * Do not change any of these until you read the instruction
 + * in readinp.h. Some cpp's do not take spaces after the backslash
 + * (like the c-shell), which will give you a very weird compiler
 + * message.
 + */
 +
 +/* File-scope text buffers, each STRLEN characters and named after an
 + * individual input option; pull_grp and rot_grp are arrays of strings.
 + * NOTE(review): presumably these hold the raw option text until it is
 + * parsed elsewhere in this file -- the code that fills and reads them is
 + * outside this excerpt, so confirm before relying on it.
 + */
 +static char tcgrps[STRLEN], tau_t[STRLEN], ref_t[STRLEN],
 +            acc[STRLEN], accgrps[STRLEN], freeze[STRLEN], frdim[STRLEN],
 +            energy[STRLEN], user1[STRLEN], user2[STRLEN], vcm[STRLEN], xtc_grps[STRLEN],
 +            couple_moltype[STRLEN], orirefitgrp[STRLEN], egptable[STRLEN], egpexcl[STRLEN],
 +            wall_atomtype[STRLEN], wall_density[STRLEN], deform[STRLEN], QMMM[STRLEN];
 +static char   fep_lambda[efptNR][STRLEN];
 +static char   lambda_weights[STRLEN];
 +static char **pull_grp;
 +static char **rot_grp;
 +static char   anneal[STRLEN], anneal_npoints[STRLEN],
 +              anneal_time[STRLEN], anneal_temp[STRLEN];
 +static char   QMmethod[STRLEN], QMbasis[STRLEN], QMcharge[STRLEN], QMmult[STRLEN],
 +              bSH[STRLEN], CASorbitals[STRLEN], CASelectrons[STRLEN], SAon[STRLEN],
 +              SAoff[STRLEN], SAsteps[STRLEN], bTS[STRLEN], bOPT[STRLEN];
 +static char efield_x[STRLEN], efield_xt[STRLEN], efield_y[STRLEN],
 +            efield_yt[STRLEN], efield_z[STRLEN], efield_zt[STRLEN];
 +
 +/* Policies for particles that are not covered by the selected groups;
 + * each value is described next to it.
 + * NOTE(review): the code that consumes these values is outside this
 + * excerpt.
 + */
 +enum {
 +    egrptpALL,         /* All particles have to be a member of a group.     */
 +    egrptpALL_GENREST, /* A rest group with name is generated for particles *
 +                        * that are not part of any group.                   */
 +    egrptpPART,        /* As egrptpALL_GENREST, but no name is generated    *
 +                        * for the rest group.                               */
 +    egrptpONE          /* Merge all selected groups into one group,         *
 +                        * make a rest group for the remaining particles.    */
 +};
 +
 +
 +void init_ir(t_inputrec *ir, t_gromppopts *opts)
 +{
 +    snew(opts->include, STRLEN);
 +    snew(opts->define, STRLEN);
 +    snew(ir->fepvals, 1);
 +    snew(ir->expandedvals, 1);
 +    snew(ir->simtempvals, 1);
 +}
 +
 +static void GetSimTemps(int ntemps, t_simtemp *simtemp, double *temperature_lambdas)
 +{
 +
 +    int i;
 +
 +    for (i = 0; i < ntemps; i++)
 +    {
 +        /* simple linear scaling -- allows more control */
 +        if (simtemp->eSimTempScale == esimtempLINEAR)
 +        {
 +            simtemp->temperatures[i] = simtemp->simtemp_low + (simtemp->simtemp_high-simtemp->simtemp_low)*temperature_lambdas[i];
 +        }
 +        else if (simtemp->eSimTempScale == esimtempGEOMETRIC)  /* should give roughly equal acceptance for constant heat capacity . . . */
 +        {
 +            simtemp->temperatures[i] = simtemp->simtemp_low * pow(simtemp->simtemp_high/simtemp->simtemp_low, (1.0*i)/(ntemps-1));
 +        }
 +        else if (simtemp->eSimTempScale == esimtempEXPONENTIAL)
 +        {
 +            simtemp->temperatures[i] = simtemp->simtemp_low + (simtemp->simtemp_high-simtemp->simtemp_low)*((exp(temperature_lambdas[i])-1)/(exp(1.0)-1));
 +        }
 +        else
 +        {
 +            char errorstr[128];
 +            sprintf(errorstr, "eSimTempScale=%d not defined", simtemp->eSimTempScale);
 +            gmx_fatal(FARGS, errorstr);
 +        }
 +    }
 +}
 +
 +
 +
 +static void _low_check(gmx_bool b, char *s, warninp_t wi)
 +{
 +    if (b)
 +    {
 +        warning_error(wi, s);
 +    }
 +}
 +
 +static void check_nst(const char *desc_nst, int nst,
 +                      const char *desc_p, int *p,
 +                      warninp_t wi)
 +{
 +    char buf[STRLEN];
 +
 +    if (*p > 0 && *p % nst != 0)
 +    {
 +        /* Round up to the next multiple of nst */
 +        *p = ((*p)/nst + 1)*nst;
 +        sprintf(buf, "%s should be a multiple of %s, changing %s to %d\n",
 +                desc_p, desc_nst, desc_p, *p);
 +        warning(wi, buf);
 +    }
 +}
 +
 +static gmx_bool ir_NVE(const t_inputrec *ir)
 +{
 +    return ((ir->eI == eiMD || EI_VV(ir->eI)) && ir->etc == etcNO);
 +}
 +
 +static int lcd(int n1, int n2)
 +{
 +    int d, i;
 +
 +    d = 1;
 +    for (i = 2; (i <= n1 && i <= n2); i++)
 +    {
 +        if (n1 % i == 0 && n2 % i == 0)
 +        {
 +            d = i;
 +        }
 +    }
 +
 +    return d;
 +}
 +
 +static void process_interaction_modifier(const t_inputrec *ir, int *eintmod)
 +{
 +    if (*eintmod == eintmodPOTSHIFT_VERLET)
 +    {
 +        if (ir->cutoff_scheme == ecutsVERLET)
 +        {
 +            *eintmod = eintmodPOTSHIFT;
 +        }
 +        else
 +        {
 +            *eintmod = eintmodNONE;
 +        }
 +    }
 +}
 +
 +void check_ir(const char *mdparin, t_inputrec *ir, t_gromppopts *opts,
 +              warninp_t wi)
 +/* Check internal consistency */
 +{
 +    /* Strange macro: first one fills the err_buf, and then one can check
 +     * the condition, which will print the message and increase the error
 +     * counter.
 +     */
 +#define CHECK(b) _low_check(b, err_buf, wi)
 +    char        err_buf[256], warn_buf[STRLEN];
 +    int         i, j;
 +    int         ns_type  = 0;
 +    real        dt_coupl = 0;
 +    real        dt_pcoupl;
 +    int         nstcmin;
 +    t_lambda   *fep    = ir->fepvals;
 +    t_expanded *expand = ir->expandedvals;
 +
 +    set_warning_line(wi, mdparin, -1);
 +
 +    /* BASIC CUT-OFF STUFF */
 +    if (ir->rcoulomb < 0)
 +    {
 +        warning_error(wi, "rcoulomb should be >= 0");
 +    }
 +    if (ir->rvdw < 0)
 +    {
 +        warning_error(wi, "rvdw should be >= 0");
 +    }
 +    if (ir->rlist < 0 &&
 +        !(ir->cutoff_scheme == ecutsVERLET && ir->verletbuf_drift > 0))
 +    {
 +        warning_error(wi, "rlist should be >= 0");
 +    }
 +
 +    process_interaction_modifier(ir, &ir->coulomb_modifier);
 +    process_interaction_modifier(ir, &ir->vdw_modifier);
 +
 +    if (ir->cutoff_scheme == ecutsGROUP)
 +    {
 +        /* BASIC CUT-OFF STUFF */
 +        if (ir->rlist == 0 ||
 +            !((EEL_MIGHT_BE_ZERO_AT_CUTOFF(ir->coulombtype) && ir->rcoulomb > ir->rlist) ||
 +              (EVDW_MIGHT_BE_ZERO_AT_CUTOFF(ir->vdwtype)    && ir->rvdw     > ir->rlist)))
 +        {
 +            /* No switched potential and/or no twin-range:
 +             * we can set the long-range cut-off to the maximum of the other cut-offs.
 +             */
 +            ir->rlistlong = max_cutoff(ir->rlist, max_cutoff(ir->rvdw, ir->rcoulomb));
 +        }
 +        else if (ir->rlistlong < 0)
 +        {
 +            ir->rlistlong = max_cutoff(ir->rlist, max_cutoff(ir->rvdw, ir->rcoulomb));
 +            sprintf(warn_buf, "rlistlong was not set, setting it to %g (no buffer)",
 +                    ir->rlistlong);
 +            warning(wi, warn_buf);
 +        }
 +        if (ir->rlistlong == 0 && ir->ePBC != epbcNONE)
 +        {
 +            warning_error(wi, "Can not have an infinite cut-off with PBC");
 +        }
 +        if (ir->rlistlong > 0 && (ir->rlist == 0 || ir->rlistlong < ir->rlist))
 +        {
 +            warning_error(wi, "rlistlong can not be shorter than rlist");
 +        }
 +        if (IR_TWINRANGE(*ir) && ir->nstlist <= 0)
 +        {
 +            warning_error(wi, "Can not have nstlist<=0 with twin-range interactions");
 +        }
 +    }
 +
 +    if (ir->rlistlong == ir->rlist)
 +    {
 +        ir->nstcalclr = 0;
 +    }
 +    else if (ir->rlistlong > ir->rlist && ir->nstcalclr == 0)
 +    {
 +        warning_error(wi, "With different cutoffs for electrostatics and VdW, nstcalclr must be -1 or a positive number");
 +    }
 +
 +    if (ir->cutoff_scheme == ecutsVERLET)
 +    {
 +        real rc_max;
 +
 +        /* Normal Verlet type neighbor-list, currently only limited feature support */
 +        if (inputrec2nboundeddim(ir) < 3)
 +        {
 +            warning_error(wi, "With Verlet lists only full pbc or pbc=xy with walls is supported");
 +        }
 +        if (ir->rcoulomb != ir->rvdw)
 +        {
 +            warning_error(wi, "With Verlet lists rcoulomb!=rvdw is not supported");
 +        }
 +        if (ir->vdwtype != evdwCUT)
 +        {
 +            warning_error(wi, "With Verlet lists only cut-off LJ interactions are supported");
 +        }
 +        if (!(ir->coulombtype == eelCUT ||
 +              (EEL_RF(ir->coulombtype) && ir->coulombtype != eelRF_NEC) ||
 +              EEL_PME(ir->coulombtype) || ir->coulombtype == eelEWALD))
 +        {
 +            warning_error(wi, "With Verlet lists only cut-off, reaction-field, PME and Ewald electrostatics are supported");
 +        }
 +
 +        if (ir->nstlist <= 0)
 +        {
 +            warning_error(wi, "With Verlet lists nstlist should be larger than 0");
 +        }
 +
 +        if (ir->nstlist < 10)
 +        {
 +            warning_note(wi, "With Verlet lists the optimal nstlist is >= 10, with GPUs >= 20. Note that with the Verlet scheme, nstlist has no effect on the accuracy of your simulation.");
 +        }
 +
 +        rc_max = max(ir->rvdw, ir->rcoulomb);
 +
 +        if (ir->verletbuf_drift <= 0)
 +        {
 +            if (ir->verletbuf_drift == 0)
 +            {
 +                warning_error(wi, "Can not have an energy drift of exactly 0");
 +            }
 +
 +            if (ir->rlist < rc_max)
 +            {
 +                warning_error(wi, "With verlet lists rlist can not be smaller than rvdw or rcoulomb");
 +            }
 +
 +            if (ir->rlist == rc_max && ir->nstlist > 1)
 +            {
 +                warning_note(wi, "rlist is equal to rvdw and/or rcoulomb: there is no explicit Verlet buffer. The cluster pair list does have a buffering effect, but choosing a larger rlist might be necessary for good energy conservation.");
 +            }
 +        }
 +        else
 +        {
 +            if (ir->rlist > rc_max)
 +            {
 +                warning_note(wi, "You have set rlist larger than the interaction cut-off, but you also have verlet-buffer-drift > 0. Will set rlist using verlet-buffer-drift.");
 +            }
 +
 +            if (ir->nstlist == 1)
 +            {
 +                /* No buffer required */
 +                ir->rlist = rc_max;
 +            }
 +            else
 +            {
 +                if (EI_DYNAMICS(ir->eI))
 +                {
 +                    if (EI_MD(ir->eI) && ir->etc == etcNO)
 +                    {
 +                        warning_error(wi, "Temperature coupling is required for calculating rlist using the energy drift with verlet-buffer-drift > 0. Either use temperature coupling or set rlist yourself together with verlet-buffer-drift = -1.");
 +                    }
 +
 +                    if (inputrec2nboundeddim(ir) < 3)
 +                    {
 +                        warning_error(wi, "The box volume is required for calculating rlist from the energy drift with verlet-buffer-drift > 0. You are using at least one unbounded dimension, so no volume can be computed. Either use a finite box, or set rlist yourself together with verlet-buffer-drift = -1.");
 +                    }
 +                    /* Set rlist temporarily so we can continue processing */
 +                    ir->rlist = rc_max;
 +                }
 +                else
 +                {
 +                    /* Set the buffer to 5% of the cut-off */
 +                    ir->rlist = 1.05*rc_max;
 +                }
 +            }
 +        }
 +
 +        /* No twin-range calculations with Verlet lists */
 +        ir->rlistlong = ir->rlist;
 +    }
 +
 +    if (ir->nstcalclr == -1)
 +    {
 +        /* if rlist=rlistlong, this will later be changed to nstcalclr=0 */
 +        ir->nstcalclr = ir->nstlist;
 +    }
 +    else if (ir->nstcalclr > 0)
 +    {
 +        if (ir->nstlist > 0 && (ir->nstlist % ir->nstcalclr != 0))
 +        {
 +            warning_error(wi, "nstlist must be evenly divisible by nstcalclr. Use nstcalclr = -1 to automatically follow nstlist");
 +        }
 +    }
 +    else if (ir->nstcalclr < -1)
 +    {
 +        warning_error(wi, "nstcalclr must be a positive number (divisor of nstcalclr), or -1 to follow nstlist.");
 +    }
 +
 +    if (EEL_PME(ir->coulombtype) && ir->rcoulomb > ir->rvdw && ir->nstcalclr > 1)
 +    {
 +        warning_error(wi, "When used with PME, the long-range component of twin-range interactions must be updated every step (nstcalclr)");
 +    }
 +
 +    /* GENERAL INTEGRATOR STUFF */
 +    if (!(ir->eI == eiMD || EI_VV(ir->eI)))
 +    {
 +        ir->etc = etcNO;
 +    }
 +    if (ir->eI == eiVVAK)
 +    {
 +        sprintf(warn_buf, "Integrator method %s is implemented primarily for validation purposes; for molecular dynamics, you should probably be using %s or %s", ei_names[eiVVAK], ei_names[eiMD], ei_names[eiVV]);
 +        warning_note(wi, warn_buf);
 +    }
 +    if (!EI_DYNAMICS(ir->eI))
 +    {
 +        ir->epc = epcNO;
 +    }
 +    if (EI_DYNAMICS(ir->eI))
 +    {
 +        if (ir->nstcalcenergy < 0)
 +        {
 +            ir->nstcalcenergy = ir_optimal_nstcalcenergy(ir);
 +            if (ir->nstenergy != 0 && ir->nstenergy < ir->nstcalcenergy)
 +            {
 +                /* nstcalcenergy larger than nstener does not make sense.
 +                 * We ideally want nstcalcenergy=nstener.
 +                 */
 +                if (ir->nstlist > 0)
 +                {
 +                    ir->nstcalcenergy = lcd(ir->nstenergy, ir->nstlist);
 +                }
 +                else
 +                {
 +                    ir->nstcalcenergy = ir->nstenergy;
 +                }
 +            }
 +        }
 +        else if ( (ir->nstenergy > 0 && ir->nstcalcenergy > ir->nstenergy) ||
 +                  (ir->efep != efepNO && ir->fepvals->nstdhdl > 0 &&
 +                   (ir->nstcalcenergy > ir->fepvals->nstdhdl) ) )
 +
 +        {
 +            const char *nsten    = "nstenergy";
 +            const char *nstdh    = "nstdhdl";
 +            const char *min_name = nsten;
 +            int         min_nst  = ir->nstenergy;
 +
 +            /* find the smallest of ( nstenergy, nstdhdl ) */
 +            if (ir->efep != efepNO && ir->fepvals->nstdhdl > 0 &&
 +                (ir->fepvals->nstdhdl < ir->nstenergy) )
 +            {
 +                min_nst  = ir->fepvals->nstdhdl;
 +                min_name = nstdh;
 +            }
 +            /* If the user sets nstenergy small, we should respect that */
 +            sprintf(warn_buf,
 +                    "Setting nstcalcenergy (%d) equal to %s (%d)",
 +                    ir->nstcalcenergy, min_name, min_nst);
 +            warning_note(wi, warn_buf);
 +            ir->nstcalcenergy = min_nst;
 +        }
 +
 +        if (ir->epc != epcNO)
 +        {
 +            if (ir->nstpcouple < 0)
 +            {
 +                ir->nstpcouple = ir_optimal_nstpcouple(ir);
 +            }
 +        }
 +        if (IR_TWINRANGE(*ir))
 +        {
 +            check_nst("nstlist", ir->nstlist,
 +                      "nstcalcenergy", &ir->nstcalcenergy, wi);
 +            if (ir->epc != epcNO)
 +            {
 +                check_nst("nstlist", ir->nstlist,
 +                          "nstpcouple", &ir->nstpcouple, wi);
 +            }
 +        }
 +
 +        if (ir->nstcalcenergy > 0)
 +        {
 +            if (ir->efep != efepNO)
 +            {
 +                /* nstdhdl should be a multiple of nstcalcenergy */
 +                check_nst("nstcalcenergy", ir->nstcalcenergy,
 +                          "nstdhdl", &ir->fepvals->nstdhdl, wi);
 +                /* nstexpanded should be a multiple of nstcalcenergy */
 +                check_nst("nstcalcenergy", ir->nstcalcenergy,
 +                          "nstexpanded", &ir->expandedvals->nstexpanded, wi);
 +            }
 +            /* for storing exact averages nstenergy should be
 +             * a multiple of nstcalcenergy
 +             */
 +            check_nst("nstcalcenergy", ir->nstcalcenergy,
 +                      "nstenergy", &ir->nstenergy, wi);
 +        }
 +    }
 +
 +    /* LD STUFF */
 +    if ((EI_SD(ir->eI) || ir->eI == eiBD) &&
 +        ir->bContinuation && ir->ld_seed != -1)
 +    {
 +        warning_note(wi, "You are doing a continuation with SD or BD, make sure that ld_seed is different from the previous run (using ld_seed=-1 will ensure this)");
 +    }
 +
 +    /* TPI STUFF */
 +    if (EI_TPI(ir->eI))
 +    {
 +        sprintf(err_buf, "TPI only works with pbc = %s", epbc_names[epbcXYZ]);
 +        CHECK(ir->ePBC != epbcXYZ);
 +        sprintf(err_buf, "TPI only works with ns = %s", ens_names[ensGRID]);
 +        CHECK(ir->ns_type != ensGRID);
 +        sprintf(err_buf, "with TPI nstlist should be larger than zero");
 +        CHECK(ir->nstlist <= 0);
 +        sprintf(err_buf, "TPI does not work with full electrostatics other than PME");
 +        CHECK(EEL_FULL(ir->coulombtype) && !EEL_PME(ir->coulombtype));
 +    }
 +
 +    /* SHAKE / LINCS */
 +    if ( (opts->nshake > 0) && (opts->bMorse) )
 +    {
 +        sprintf(warn_buf,
 +                "Using morse bond-potentials while constraining bonds is useless");
 +        warning(wi, warn_buf);
 +    }
 +
 +    if ((EI_SD(ir->eI) || ir->eI == eiBD) &&
 +        ir->bContinuation && ir->ld_seed != -1)
 +    {
 +        warning_note(wi, "You are doing a continuation with SD or BD, make sure that ld_seed is different from the previous run (using ld_seed=-1 will ensure this)");
 +    }
 +    /* verify simulated tempering options */
 +
 +    if (ir->bSimTemp)
 +    {
 +        gmx_bool bAllTempZero = TRUE;
 +        for (i = 0; i < fep->n_lambda; i++)
 +        {
 +            sprintf(err_buf, "Entry %d for %s must be between 0 and 1, instead is %g", i, efpt_names[efptTEMPERATURE], fep->all_lambda[efptTEMPERATURE][i]);
 +            CHECK((fep->all_lambda[efptTEMPERATURE][i] < 0) || (fep->all_lambda[efptTEMPERATURE][i] > 1));
 +            if (fep->all_lambda[efptTEMPERATURE][i] > 0)
 +            {
 +                bAllTempZero = FALSE;
 +            }
 +        }
 +        sprintf(err_buf, "if simulated tempering is on, temperature-lambdas may not be all zero");
 +        CHECK(bAllTempZero == TRUE);
 +
 +        sprintf(err_buf, "Simulated tempering is currently only compatible with md-vv");
 +        CHECK(ir->eI != eiVV);
 +
 +        /* check compatibility of the temperature coupling with simulated tempering */
 +
 +        if (ir->etc == etcNOSEHOOVER)
 +        {
 +            sprintf(warn_buf, "Nose-Hoover based temperature control such as [%s] my not be entirelyconsistent with simulated tempering", etcoupl_names[ir->etc]);
 +            warning_note(wi, warn_buf);
 +        }
 +
 +        /* check that the temperatures make sense */
 +
 +        sprintf(err_buf, "Higher simulated tempering temperature (%g) must be >= than the simulated tempering lower temperature (%g)", ir->simtempvals->simtemp_high, ir->simtempvals->simtemp_low);
 +        CHECK(ir->simtempvals->simtemp_high <= ir->simtempvals->simtemp_low);
 +
 +        sprintf(err_buf, "Higher simulated tempering temperature (%g) must be >= zero", ir->simtempvals->simtemp_high);
 +        CHECK(ir->simtempvals->simtemp_high <= 0);
 +
 +        sprintf(err_buf, "Lower simulated tempering temperature (%g) must be >= zero", ir->simtempvals->simtemp_low);
 +        CHECK(ir->simtempvals->simtemp_low <= 0);
 +    }
 +
 +    /* verify free energy options */
 +
 +    if (ir->efep != efepNO)
 +    {
 +        fep = ir->fepvals;
 +        sprintf(err_buf, "The soft-core power is %d and can only be 1 or 2",
 +                fep->sc_power);
 +        CHECK(fep->sc_alpha != 0 && fep->sc_power != 1 && fep->sc_power != 2);
 +
 +        sprintf(err_buf, "The soft-core sc-r-power is %d and can only be 6 or 48",
 +                (int)fep->sc_r_power);
 +        CHECK(fep->sc_alpha != 0 && fep->sc_r_power != 6.0 && fep->sc_r_power != 48.0);
 +
 +        sprintf(err_buf, "Can't use postive delta-lambda (%g) if initial state/lambda does not start at zero", fep->delta_lambda);
 +        CHECK(fep->delta_lambda > 0 && ((fep->init_fep_state > 0) ||  (fep->init_lambda > 0)));
 +
 +        sprintf(err_buf, "Can't use postive delta-lambda (%g) with expanded ensemble simulations", fep->delta_lambda);
 +        CHECK(fep->delta_lambda > 0 && (ir->efep == efepEXPANDED));
 +
 +        sprintf(err_buf, "Free-energy not implemented for Ewald");
 +        CHECK(ir->coulombtype == eelEWALD);
 +
 +        /* check validity of lambda inputs */
 +        if (fep->n_lambda == 0)
 +        {
 +            /* Clear output in case of no states:*/
 +            sprintf(err_buf, "init-lambda-state set to %d: no lambda states are defined.", fep->init_fep_state);
 +            CHECK((fep->init_fep_state >= 0) && (fep->n_lambda == 0));
 +        }
 +        else
 +        {
 +            sprintf(err_buf, "initial thermodynamic state %d does not exist, only goes to %d", fep->init_fep_state, fep->n_lambda-1);
 +            CHECK((fep->init_fep_state >= fep->n_lambda));
 +        }
 +
 +        sprintf(err_buf, "Lambda state must be set, either with init-lambda-state or with init-lambda");
 +        CHECK((fep->init_fep_state < 0) && (fep->init_lambda < 0));
 +
 +        sprintf(err_buf, "init-lambda=%g while init-lambda-state=%d. Lambda state must be set either with init-lambda-state or with init-lambda, but not both",
 +                fep->init_lambda, fep->init_fep_state);
 +        CHECK((fep->init_fep_state >= 0) && (fep->init_lambda >= 0));
 +
 +
 +
 +        if ((fep->init_lambda >= 0) && (fep->delta_lambda == 0))
 +        {
 +            int n_lambda_terms;
 +            n_lambda_terms = 0;
 +            for (i = 0; i < efptNR; i++)
 +            {
 +                if (fep->separate_dvdl[i])
 +                {
 +                    n_lambda_terms++;
 +                }
 +            }
 +            if (n_lambda_terms > 1)
 +            {
 +                sprintf(warn_buf, "If lambda vector states (fep-lambdas, coul-lambdas etc.) are set, don't use init-lambda to set lambda state (except for slow growth). Use init-lambda-state instead.");
 +                warning(wi, warn_buf);
 +            }
 +
 +            if (n_lambda_terms < 2 && fep->n_lambda > 0)
 +            {
 +                warning_note(wi,
 +                             "init-lambda is deprecated for setting lambda state (except for slow growth). Use init-lambda-state instead.");
 +            }
 +        }
 +
 +        for (j = 0; j < efptNR; j++)
 +        {
 +            for (i = 0; i < fep->n_lambda; i++)
 +            {
 +                sprintf(err_buf, "Entry %d for %s must be between 0 and 1, instead is %g", i, efpt_names[j], fep->all_lambda[j][i]);
 +                CHECK((fep->all_lambda[j][i] < 0) || (fep->all_lambda[j][i] > 1));
 +            }
 +        }
 +
 +        if ((fep->sc_alpha > 0) && (!fep->bScCoul))
 +        {
 +            for (i = 0; i < fep->n_lambda; i++)
 +            {
 +                sprintf(err_buf, "For state %d, vdw-lambdas (%f) is changing with vdw softcore, while coul-lambdas (%f) is nonzero without coulomb softcore: this will lead to crashes, and is not supported.", i, fep->all_lambda[efptVDW][i],
 +                        fep->all_lambda[efptCOUL][i]);
 +                CHECK((fep->sc_alpha > 0) &&
 +                      (((fep->all_lambda[efptCOUL][i] > 0.0) &&
 +                        (fep->all_lambda[efptCOUL][i] < 1.0)) &&
 +                       ((fep->all_lambda[efptVDW][i] > 0.0) &&
 +                        (fep->all_lambda[efptVDW][i] < 1.0))));
 +            }
 +        }
 +
 +        if ((fep->bScCoul) && (EEL_PME(ir->coulombtype)))
 +        {
 +            real sigma, lambda, r_sc;
 +
 +            sigma  = 0.34;
 +            /* Maximum estimate for A and B charges equal with lambda power 1 */
 +            lambda = 0.5;
 +            r_sc   = pow(lambda*fep->sc_alpha*pow(sigma/ir->rcoulomb, fep->sc_r_power) + 1.0, 1.0/fep->sc_r_power);
 +            sprintf(warn_buf, "With PME there is a minor soft core effect present at the cut-off, proportional to (LJsigma/rcoulomb)^%g. This could have a minor effect on energy conservation, but usually other effects dominate. With a common sigma value of %g nm the fraction of the particle-particle potential at the cut-off at lambda=%g is around %.1e, while ewald-rtol is %.1e.",
 +                    fep->sc_r_power,
 +                    sigma, lambda, r_sc - 1.0, ir->ewald_rtol);
 +            warning_note(wi, warn_buf);
 +        }
 +
 +        /*  Free Energy Checks -- In an ideal world, slow growth and FEP would
 +            be treated differently, but that's the next step */
 +
 +        for (i = 0; i < efptNR; i++)
 +        {
 +            for (j = 0; j < fep->n_lambda; j++)
 +            {
 +                sprintf(err_buf, "%s[%d] must be between 0 and 1", efpt_names[i], j);
 +                CHECK((fep->all_lambda[i][j] < 0) || (fep->all_lambda[i][j] > 1));
 +            }
 +        }
 +    }
 +
 +    if ((ir->bSimTemp) || (ir->efep == efepEXPANDED))
 +    {
 +        fep    = ir->fepvals;
 +        expand = ir->expandedvals;
 +
 +        /* checking equilibration of weights inputs for validity */
 +
 +        sprintf(err_buf, "weight-equil-number-all-lambda (%d) is ignored if lmc-weights-equil is not equal to %s",
 +                expand->equil_n_at_lam, elmceq_names[elmceqNUMATLAM]);
 +        CHECK((expand->equil_n_at_lam > 0) && (expand->elmceq != elmceqNUMATLAM));
 +
 +        sprintf(err_buf, "weight-equil-number-samples (%d) is ignored if lmc-weights-equil is not equal to %s",
 +                expand->equil_samples, elmceq_names[elmceqSAMPLES]);
 +        CHECK((expand->equil_samples > 0) && (expand->elmceq != elmceqSAMPLES));
 +
 +        sprintf(err_buf, "weight-equil-number-steps (%d) is ignored if lmc-weights-equil is not equal to %s",
 +                expand->equil_steps, elmceq_names[elmceqSTEPS]);
 +        CHECK((expand->equil_steps > 0) && (expand->elmceq != elmceqSTEPS));
 +
 +        sprintf(err_buf, "weight-equil-wl-delta (%d) is ignored if lmc-weights-equil is not equal to %s",
 +                expand->equil_samples, elmceq_names[elmceqWLDELTA]);
 +        CHECK((expand->equil_wl_delta > 0) && (expand->elmceq != elmceqWLDELTA));
 +
 +        sprintf(err_buf, "weight-equil-count-ratio (%f) is ignored if lmc-weights-equil is not equal to %s",
 +                expand->equil_ratio, elmceq_names[elmceqRATIO]);
 +        CHECK((expand->equil_ratio > 0) && (expand->elmceq != elmceqRATIO));
 +
 +        sprintf(err_buf, "weight-equil-number-all-lambda (%d) must be a positive integer if lmc-weights-equil=%s",
 +                expand->equil_n_at_lam, elmceq_names[elmceqNUMATLAM]);
 +        CHECK((expand->equil_n_at_lam <= 0) && (expand->elmceq == elmceqNUMATLAM));
 +
 +        sprintf(err_buf, "weight-equil-number-samples (%d) must be a positive integer if lmc-weights-equil=%s",
 +                expand->equil_samples, elmceq_names[elmceqSAMPLES]);
 +        CHECK((expand->equil_samples <= 0) && (expand->elmceq == elmceqSAMPLES));
 +
 +        sprintf(err_buf, "weight-equil-number-steps (%d) must be a positive integer if lmc-weights-equil=%s",
 +                expand->equil_steps, elmceq_names[elmceqSTEPS]);
 +        CHECK((expand->equil_steps <= 0) && (expand->elmceq == elmceqSTEPS));
 +
 +        sprintf(err_buf, "weight-equil-wl-delta (%f) must be > 0 if lmc-weights-equil=%s",
 +                expand->equil_wl_delta, elmceq_names[elmceqWLDELTA]);
 +        CHECK((expand->equil_wl_delta <= 0) && (expand->elmceq == elmceqWLDELTA));
 +
 +        sprintf(err_buf, "weight-equil-count-ratio (%f) must be > 0 if lmc-weights-equil=%s",
 +                expand->equil_ratio, elmceq_names[elmceqRATIO]);
 +        CHECK((expand->equil_ratio <= 0) && (expand->elmceq == elmceqRATIO));
 +
 +        sprintf(err_buf, "lmc-weights-equil=%s only possible when lmc-stats = %s or lmc-stats %s",
 +                elmceq_names[elmceqWLDELTA], elamstats_names[elamstatsWL], elamstats_names[elamstatsWWL]);
 +        CHECK((expand->elmceq == elmceqWLDELTA) && (!EWL(expand->elamstats)));
 +
 +        sprintf(err_buf, "lmc-repeats (%d) must be greater than 0", expand->lmc_repeats);
 +        CHECK((expand->lmc_repeats <= 0));
 +        sprintf(err_buf, "minimum-var-min (%d) must be greater than 0", expand->minvarmin);
 +        CHECK((expand->minvarmin <= 0));
 +        sprintf(err_buf, "weight-c-range (%d) must be greater or equal to 0", expand->c_range);
 +        CHECK((expand->c_range < 0));
 +        sprintf(err_buf, "init-lambda-state (%d) must be zero if lmc-forced-nstart (%d)> 0 and lmc-move != 'no'",
 +                fep->init_fep_state, expand->lmc_forced_nstart);
 +        CHECK((fep->init_fep_state != 0) && (expand->lmc_forced_nstart > 0) && (expand->elmcmove != elmcmoveNO));
 +        sprintf(err_buf, "lmc-forced-nstart (%d) must not be negative", expand->lmc_forced_nstart);
 +        CHECK((expand->lmc_forced_nstart < 0));
 +        sprintf(err_buf, "init-lambda-state (%d) must be in the interval [0,number of lambdas)", fep->init_fep_state);
 +        CHECK((fep->init_fep_state < 0) || (fep->init_fep_state >= fep->n_lambda));
 +
 +        sprintf(err_buf, "init-wl-delta (%f) must be greater than or equal to 0", expand->init_wl_delta);
 +        CHECK((expand->init_wl_delta < 0));
 +        sprintf(err_buf, "wl-ratio (%f) must be between 0 and 1", expand->wl_ratio);
 +        CHECK((expand->wl_ratio <= 0) || (expand->wl_ratio >= 1));
 +        sprintf(err_buf, "wl-scale (%f) must be between 0 and 1", expand->wl_scale);
 +        CHECK((expand->wl_scale <= 0) || (expand->wl_scale >= 1));
 +
 +        /* if there is no temperature control, we need to specify an MC temperature */
 +        sprintf(err_buf, "If there is no temperature control, and lmc-mcmove!= 'no',mc_temperature must be set to a positive number");
 +        if (expand->nstTij > 0)
 +        {
 +            sprintf(err_buf, "nst-transition-matrix (%d) must be an integer multiple of nstlog (%d)",
 +                    expand->nstTij, ir->nstlog);
 +            CHECK((mod(expand->nstTij, ir->nstlog) != 0));
 +        }
 +    }
 +
 +    /* PBC/WALLS */
 +    sprintf(err_buf, "walls only work with pbc=%s", epbc_names[epbcXY]);
 +    CHECK(ir->nwall && ir->ePBC != epbcXY);
 +
 +    /* VACUUM STUFF */
 +    if (ir->ePBC != epbcXYZ && ir->nwall != 2)
 +    {
 +        if (ir->ePBC == epbcNONE)
 +        {
 +            if (ir->epc != epcNO)
 +            {
 +                warning(wi, "Turning off pressure coupling for vacuum system");
 +                ir->epc = epcNO;
 +            }
 +        }
 +        else
 +        {
 +            sprintf(err_buf, "Can not have pressure coupling with pbc=%s",
 +                    epbc_names[ir->ePBC]);
 +            CHECK(ir->epc != epcNO);
 +        }
 +        sprintf(err_buf, "Can not have Ewald with pbc=%s", epbc_names[ir->ePBC]);
 +        CHECK(EEL_FULL(ir->coulombtype));
 +
 +        sprintf(err_buf, "Can not have dispersion correction with pbc=%s",
 +                epbc_names[ir->ePBC]);
 +        CHECK(ir->eDispCorr != edispcNO);
 +    }
 +
 +    if (ir->rlist == 0.0)
 +    {
 +        sprintf(err_buf, "can only have neighborlist cut-off zero (=infinite)\n"
 +                "with coulombtype = %s or coulombtype = %s\n"
 +                "without periodic boundary conditions (pbc = %s) and\n"
 +                "rcoulomb and rvdw set to zero",
 +                eel_names[eelCUT], eel_names[eelUSER], epbc_names[epbcNONE]);
 +        CHECK(((ir->coulombtype != eelCUT) && (ir->coulombtype != eelUSER)) ||
 +              (ir->ePBC     != epbcNONE) ||
 +              (ir->rcoulomb != 0.0)      || (ir->rvdw != 0.0));
 +
 +        if (ir->nstlist < 0)
 +        {
 +            warning_error(wi, "Can not have heuristic neighborlist updates without cut-off");
 +        }
 +        if (ir->nstlist > 0)
 +        {
 +            warning_note(wi, "Simulating without cut-offs is usually (slightly) faster with nstlist=0, nstype=simple and particle decomposition");
 +        }
 +    }
 +
 +    /* COMM STUFF */
 +    if (ir->nstcomm == 0)
 +    {
 +        ir->comm_mode = ecmNO;
 +    }
 +    if (ir->comm_mode != ecmNO)
 +    {
 +        if (ir->nstcomm < 0)
 +        {
 +            warning(wi, "If you want to remove the rotation around the center of mass, you should set comm_mode = Angular instead of setting nstcomm < 0. nstcomm is modified to its absolute value");
 +            ir->nstcomm = abs(ir->nstcomm);
 +        }
 +
 +        if (ir->nstcalcenergy > 0 && ir->nstcomm < ir->nstcalcenergy)
 +        {
 +            warning_note(wi, "nstcomm < nstcalcenergy defeats the purpose of nstcalcenergy, setting nstcomm to nstcalcenergy");
 +            ir->nstcomm = ir->nstcalcenergy;
 +        }
 +
 +        if (ir->comm_mode == ecmANGULAR)
 +        {
 +            sprintf(err_buf, "Can not remove the rotation around the center of mass with periodic molecules");
 +            CHECK(ir->bPeriodicMols);
 +            if (ir->ePBC != epbcNONE)
 +            {
 +                warning(wi, "Removing the rotation around the center of mass in a periodic system (this is not a problem when you have only one molecule).");
 +            }
 +        }
 +    }
 +
 +    if (EI_STATE_VELOCITY(ir->eI) && ir->ePBC == epbcNONE && ir->comm_mode != ecmANGULAR)
 +    {
 +        warning_note(wi, "Tumbling and or flying ice-cubes: We are not removing rotation around center of mass in a non-periodic system. You should probably set comm_mode = ANGULAR.");
 +    }
 +
 +    sprintf(err_buf, "Twin-range neighbour searching (NS) with simple NS"
 +            " algorithm not implemented");
 +    CHECK(((ir->rcoulomb > ir->rlist) || (ir->rvdw > ir->rlist))
 +          && (ir->ns_type == ensSIMPLE));
 +
 +    /* TEMPERATURE COUPLING */
 +    if (ir->etc == etcYES)
 +    {
 +        ir->etc = etcBERENDSEN;
 +        warning_note(wi, "Old option for temperature coupling given: "
 +                     "changing \"yes\" to \"Berendsen\"\n");
 +    }
 +
 +    if ((ir->etc == etcNOSEHOOVER) || (ir->epc == epcMTTK))
 +    {
 +        if (ir->opts.nhchainlength < 1)
 +        {
 +            sprintf(warn_buf, "number of Nose-Hoover chains (currently %d) cannot be less than 1,reset to 1\n", ir->opts.nhchainlength);
 +            ir->opts.nhchainlength = 1;
 +            warning(wi, warn_buf);
 +        }
 +
 +        if (ir->etc == etcNOSEHOOVER && !EI_VV(ir->eI) && ir->opts.nhchainlength > 1)
 +        {
 +            warning_note(wi, "leapfrog does not yet support Nose-Hoover chains, nhchainlength reset to 1");
 +            ir->opts.nhchainlength = 1;
 +        }
 +    }
 +    else
 +    {
 +        ir->opts.nhchainlength = 0;
 +    }
 +
 +    if (ir->eI == eiVVAK)
 +    {
 +        sprintf(err_buf, "%s implemented primarily for validation, and requires nsttcouple = 1 and nstpcouple = 1.",
 +                ei_names[eiVVAK]);
 +        CHECK((ir->nsttcouple != 1) || (ir->nstpcouple != 1));
 +    }
 +
 +    if (ETC_ANDERSEN(ir->etc))
 +    {
 +        sprintf(err_buf, "%s temperature control not supported for integrator %s.", etcoupl_names[ir->etc], ei_names[ir->eI]);
 +        CHECK(!(EI_VV(ir->eI)));
 +
 +        for (i = 0; i < ir->opts.ngtc; i++)
 +        {
 +            sprintf(err_buf, "all tau_t must currently be equal using Andersen temperature control, violated for group %d", i);
 +            CHECK(ir->opts.tau_t[0] != ir->opts.tau_t[i]);
 +            sprintf(err_buf, "all tau_t must be postive using Andersen temperature control, tau_t[%d]=%10.6f",
 +                    i, ir->opts.tau_t[i]);
 +            CHECK(ir->opts.tau_t[i] < 0);
 +        }
 +        if (ir->nstcomm > 0 && (ir->etc == etcANDERSEN))
 +        {
 +            sprintf(warn_buf, "Center of mass removal not necessary for %s.  All velocities of coupled groups are rerandomized periodically, so flying ice cube errors will not occur.", etcoupl_names[ir->etc]);
 +            warning_note(wi, warn_buf);
 +        }
 +
 +        sprintf(err_buf, "nstcomm must be 1, not %d for %s, as velocities of atoms in coupled groups are randomized every time step", ir->nstcomm, etcoupl_names[ir->etc]);
 +        CHECK(ir->nstcomm > 1 && (ir->etc == etcANDERSEN));
 +
 +        for (i = 0; i < ir->opts.ngtc; i++)
 +        {
 +            int nsteps = (int)(ir->opts.tau_t[i]/ir->delta_t);
 +            sprintf(err_buf, "tau_t/delta_t for group %d for temperature control method %s must be a multiple of nstcomm (%d), as velocities of atoms in coupled groups are randomized every time step. The input tau_t (%8.3f) leads to %d steps per randomization", i, etcoupl_names[ir->etc], ir->nstcomm, ir->opts.tau_t[i], nsteps);
 +            CHECK((nsteps % ir->nstcomm) && (ir->etc == etcANDERSENMASSIVE));
 +        }
 +    }
 +    if (ir->etc == etcBERENDSEN)
 +    {
 +        sprintf(warn_buf, "The %s thermostat does not generate the correct kinetic energy distribution. You might want to consider using the %s thermostat.",
 +                ETCOUPLTYPE(ir->etc), ETCOUPLTYPE(etcVRESCALE));
 +        warning_note(wi, warn_buf);
 +    }
 +
 +    if ((ir->etc == etcNOSEHOOVER || ETC_ANDERSEN(ir->etc))
 +        && ir->epc == epcBERENDSEN)
 +    {
 +        sprintf(warn_buf, "Using Berendsen pressure coupling invalidates the "
 +                "true ensemble for the thermostat");
 +        warning(wi, warn_buf);
 +    }
 +
 +    /* PRESSURE COUPLING */
 +    if (ir->epc == epcISOTROPIC)
 +    {
 +        ir->epc = epcBERENDSEN;
 +        warning_note(wi, "Old option for pressure coupling given: "
 +                     "changing \"Isotropic\" to \"Berendsen\"\n");
 +    }
 +
 +    if (ir->epc != epcNO)
 +    {
 +        dt_pcoupl = ir->nstpcouple*ir->delta_t;
 +
 +        sprintf(err_buf, "tau-p must be > 0 instead of %g\n", ir->tau_p);
 +        CHECK(ir->tau_p <= 0);
 +
 +        if (ir->tau_p/dt_pcoupl < pcouple_min_integration_steps(ir->epc))
 +        {
 +            sprintf(warn_buf, "For proper integration of the %s barostat, tau-p (%g) should be at least %d times larger than nstpcouple*dt (%g)",
 +                    EPCOUPLTYPE(ir->epc), ir->tau_p, pcouple_min_integration_steps(ir->epc), dt_pcoupl);
 +            warning(wi, warn_buf);
 +        }
 +
 +        sprintf(err_buf, "compressibility must be > 0 when using pressure"
 +                " coupling %s\n", EPCOUPLTYPE(ir->epc));
 +        CHECK(ir->compress[XX][XX] < 0 || ir->compress[YY][YY] < 0 ||
 +              ir->compress[ZZ][ZZ] < 0 ||
 +              (trace(ir->compress) == 0 && ir->compress[YY][XX] <= 0 &&
 +               ir->compress[ZZ][XX] <= 0 && ir->compress[ZZ][YY] <= 0));
 +
 +        if (epcPARRINELLORAHMAN == ir->epc && opts->bGenVel)
 +        {
 +            sprintf(warn_buf,
 +                    "You are generating velocities so I am assuming you "
 +                    "are equilibrating a system. You are using "
 +                    "%s pressure coupling, but this can be "
 +                    "unstable for equilibration. If your system crashes, try "
 +                    "equilibrating first with Berendsen pressure coupling. If "
 +                    "you are not equilibrating the system, you can probably "
 +                    "ignore this warning.",
 +                    epcoupl_names[ir->epc]);
 +            warning(wi, warn_buf);
 +        }
 +    }
 +
 +    if (EI_VV(ir->eI))
 +    {
 +        if (ir->epc > epcNO)
 +        {
 +            if ((ir->epc != epcBERENDSEN) && (ir->epc != epcMTTK))
 +            {
 +                warning_error(wi, "for md-vv and md-vv-avek, can only use Berendsen and Martyna-Tuckerman-Tobias-Klein (MTTK) equations for pressure control; MTTK is equivalent to Parrinello-Rahman.");
 +            }
 +        }
 +    }
 +
 +    /* ELECTROSTATICS */
 +    /* More checks are in triple check (grompp.c) */
 +
 +    if (ir->coulombtype == eelSWITCH)
 +    {
 +        sprintf(warn_buf, "coulombtype = %s is only for testing purposes and can lead to serious "
 +                "artifacts, advice: use coulombtype = %s",
 +                eel_names[ir->coulombtype],
 +                eel_names[eelRF_ZERO]);
 +        warning(wi, warn_buf);
 +    }
 +
 +    if (ir->epsilon_r != 1 && ir->implicit_solvent == eisGBSA)
 +    {
 +        sprintf(warn_buf, "epsilon-r = %g with GB implicit solvent, will use this value for inner dielectric", ir->epsilon_r);
 +        warning_note(wi, warn_buf);
 +    }
 +
 +    if (EEL_RF(ir->coulombtype) && ir->epsilon_rf == 1 && ir->epsilon_r != 1)
 +    {
 +        sprintf(warn_buf, "epsilon-r = %g and epsilon-rf = 1 with reaction field, proceeding assuming old format and exchanging epsilon-r and epsilon-rf", ir->epsilon_r);
 +        warning(wi, warn_buf);
 +        ir->epsilon_rf = ir->epsilon_r;
 +        ir->epsilon_r  = 1.0;
 +    }
 +
 +    if (getenv("GALACTIC_DYNAMICS") == NULL)
 +    {
 +        sprintf(err_buf, "epsilon-r must be >= 0 instead of %g\n", ir->epsilon_r);
 +        CHECK(ir->epsilon_r < 0);
 +    }
 +
 +    if (EEL_RF(ir->coulombtype))
 +    {
 +        /* reaction field (at the cut-off) */
 +
 +        if (ir->coulombtype == eelRF_ZERO)
 +        {
 +            sprintf(warn_buf, "With coulombtype = %s, epsilon-rf must be 0, assuming you meant epsilon_rf=0",
 +                    eel_names[ir->coulombtype]);
 +            CHECK(ir->epsilon_rf != 0);
 +            ir->epsilon_rf = 0.0;
 +        }
 +
 +        sprintf(err_buf, "epsilon-rf must be >= epsilon-r");
 +        CHECK((ir->epsilon_rf < ir->epsilon_r && ir->epsilon_rf != 0) ||
 +              (ir->epsilon_r == 0));
 +        if (ir->epsilon_rf == ir->epsilon_r)
 +        {
 +            sprintf(warn_buf, "Using epsilon-rf = epsilon-r with %s does not make sense",
 +                    eel_names[ir->coulombtype]);
 +            warning(wi, warn_buf);
 +        }
 +    }
 +    /* Allow rlist>rcoulomb for tabulated long range stuff. This just
 +     * means the interaction is zero outside rcoulomb, but it helps to
 +     * provide accurate energy conservation.
 +     */
 +    if (EEL_MIGHT_BE_ZERO_AT_CUTOFF(ir->coulombtype))
 +    {
 +        if (EEL_SWITCHED(ir->coulombtype))
 +        {
 +            sprintf(err_buf,
 +                    "With coulombtype = %s rcoulomb_switch must be < rcoulomb. Or, better: Use the potential modifier options!",
 +                    eel_names[ir->coulombtype]);
 +            CHECK(ir->rcoulomb_switch >= ir->rcoulomb);
 +        }
 +    }
 +    else if (ir->coulombtype == eelCUT || EEL_RF(ir->coulombtype))
 +    {
 +        if (ir->cutoff_scheme == ecutsGROUP && ir->coulomb_modifier == eintmodNONE)
 +        {
 +            sprintf(err_buf, "With coulombtype = %s, rcoulomb should be >= rlist unless you use a potential modifier",
 +                    eel_names[ir->coulombtype]);
 +            CHECK(ir->rlist > ir->rcoulomb);
 +        }
 +    }
 +
 +    if (ir->coulombtype == eelSWITCH || ir->coulombtype == eelSHIFT ||
 +        ir->vdwtype == evdwSWITCH || ir->vdwtype == evdwSHIFT)
 +    {
 +        sprintf(warn_buf,
 +                "The switch/shift interaction settings are just for compatibility; you will get better "
 +                "performance from applying potential modifiers to your interactions!\n");
 +        warning_note(wi, warn_buf);
 +    }
 +
++    if (ir->coulombtype == eelPMESWITCH)
++    {
++        if (ir->rcoulomb_switch/ir->rcoulomb < 0.9499)
++        {
++            sprintf(warn_buf, "The switching range for %s should be 5%% or less, energy conservation will be good anyhow, since ewald_rtol = %g",
++                    eel_names[ir->coulombtype],
++                    ir->ewald_rtol);
++            warning(wi, warn_buf);
++        }
++    }
++
 +    if (EEL_FULL(ir->coulombtype))
 +    {
 +        if (ir->coulombtype == eelPMESWITCH || ir->coulombtype == eelPMEUSER ||
 +            ir->coulombtype == eelPMEUSERSWITCH)
 +        {
 +            sprintf(err_buf, "With coulombtype = %s, rcoulomb must be <= rlist",
 +                    eel_names[ir->coulombtype]);
 +            CHECK(ir->rcoulomb > ir->rlist);
 +        }
 +        else if (ir->cutoff_scheme == ecutsGROUP && ir->coulomb_modifier == eintmodNONE)
 +        {
 +            if (ir->coulombtype == eelPME || ir->coulombtype == eelP3M_AD)
 +            {
 +                sprintf(err_buf,
 +                        "With coulombtype = %s (without modifier), rcoulomb must be equal to rlist,\n"
 +                        "or rlistlong if nstcalclr=1. For optimal energy conservation,consider using\n"
 +                        "a potential modifier.", eel_names[ir->coulombtype]);
 +                if (ir->nstcalclr == 1)
 +                {
 +                    CHECK(ir->rcoulomb != ir->rlist && ir->rcoulomb != ir->rlistlong);
 +                }
 +                else
 +                {
 +                    CHECK(ir->rcoulomb != ir->rlist);
 +                }
 +            }
 +        }
 +    }
 +
 +    if (EEL_PME(ir->coulombtype))
 +    {
 +        if (ir->pme_order < 3)
 +        {
 +            warning_error(wi, "pme-order can not be smaller than 3");
 +        }
 +    }
 +
 +    if (ir->nwall == 2 && EEL_FULL(ir->coulombtype))
 +    {
 +        if (ir->ewald_geometry == eewg3D)
 +        {
 +            sprintf(warn_buf, "With pbc=%s you should use ewald-geometry=%s",
 +                    epbc_names[ir->ePBC], eewg_names[eewg3DC]);
 +            warning(wi, warn_buf);
 +        }
 +        /* This check avoids extra pbc coding for exclusion corrections */
 +        sprintf(err_buf, "wall-ewald-zfac should be >= 2");
 +        CHECK(ir->wall_ewald_zfac < 2);
 +    }
 +
 +    if (EVDW_SWITCHED(ir->vdwtype))
 +    {
 +        sprintf(err_buf, "With vdwtype = %s rvdw-switch must be < rvdw. Or, better - use a potential modifier.",
 +                evdw_names[ir->vdwtype]);
 +        CHECK(ir->rvdw_switch >= ir->rvdw);
 +    }
 +    else if (ir->vdwtype == evdwCUT)
 +    {
 +        if (ir->cutoff_scheme == ecutsGROUP && ir->vdw_modifier == eintmodNONE)
 +        {
 +            sprintf(err_buf, "With vdwtype = %s, rvdw must be >= rlist unless you use a potential modifier", evdw_names[ir->vdwtype]);
 +            CHECK(ir->rlist > ir->rvdw);
 +        }
 +    }
 +    if (ir->cutoff_scheme == ecutsGROUP)
 +    {
++        if (((ir->coulomb_modifier != eintmodNONE && ir->rcoulomb == ir->rlist) ||
++             (ir->vdw_modifier != eintmodNONE && ir->rvdw == ir->rlist)) &&
++            ir->nstlist != 1)
++        {
++            warning_note(wi, "With exact cut-offs, rlist should be "
++                         "larger than rcoulomb and rvdw, so that there "
++                         "is a buffer region for particle motion "
++                         "between neighborsearch steps");
++        }
++
 +        if (EEL_IS_ZERO_AT_CUTOFF(ir->coulombtype)
 +            && (ir->rlistlong <= ir->rcoulomb))
 +        {
 +            sprintf(warn_buf, "For energy conservation with switch/shift potentials, %s should be 0.1 to 0.3 nm larger than rcoulomb.",
 +                    IR_TWINRANGE(*ir) ? "rlistlong" : "rlist");
 +            warning_note(wi, warn_buf);
 +        }
 +        if (EVDW_SWITCHED(ir->vdwtype) && (ir->rlistlong <= ir->rvdw))
 +        {
 +            sprintf(warn_buf, "For energy conservation with switch/shift potentials, %s should be 0.1 to 0.3 nm larger than rvdw.",
 +                    IR_TWINRANGE(*ir) ? "rlistlong" : "rlist");
 +            warning_note(wi, warn_buf);
 +        }
 +    }
 +
 +    if (ir->vdwtype == evdwUSER && ir->eDispCorr != edispcNO)
 +    {
 +        warning_note(wi, "You have selected user tables with dispersion correction, the dispersion will be corrected to -C6/r^6 beyond rvdw_switch (the tabulated interaction between rvdw_switch and rvdw will not be double counted). Make sure that you really want dispersion correction to -C6/r^6.");
 +    }
 +
 +    if (ir->nstlist == -1)
 +    {
 +        sprintf(err_buf, "With nstlist=-1 rvdw and rcoulomb should be smaller than rlist to account for diffusion and possibly charge-group radii");
 +        CHECK(ir->rvdw >= ir->rlist || ir->rcoulomb >= ir->rlist);
 +    }
 +    sprintf(err_buf, "nstlist can not be smaller than -1");
 +    CHECK(ir->nstlist < -1);
 +
 +    if (ir->eI == eiLBFGS && (ir->coulombtype == eelCUT || ir->vdwtype == evdwCUT)
 +        && ir->rvdw != 0)
 +    {
 +        warning(wi, "For efficient BFGS minimization, use switch/shift/pme instead of cut-off.");
 +    }
 +
 +    if (ir->eI == eiLBFGS && ir->nbfgscorr <= 0)
 +    {
 +        warning(wi, "Using L-BFGS with nbfgscorr<=0 just gets you steepest descent.");
 +    }
 +
 +    /* ENERGY CONSERVATION */
 +    if (ir_NVE(ir) && ir->cutoff_scheme == ecutsGROUP)
 +    {
 +        if (!EVDW_MIGHT_BE_ZERO_AT_CUTOFF(ir->vdwtype) && ir->rvdw > 0 && ir->vdw_modifier == eintmodNONE)
 +        {
 +            sprintf(warn_buf, "You are using a cut-off for VdW interactions with NVE, for good energy conservation use vdwtype = %s (possibly with DispCorr)",
 +                    evdw_names[evdwSHIFT]);
 +            warning_note(wi, warn_buf);
 +        }
 +        if (!EEL_MIGHT_BE_ZERO_AT_CUTOFF(ir->coulombtype) && ir->rcoulomb > 0 && ir->coulomb_modifier == eintmodNONE)
 +        {
 +            sprintf(warn_buf, "You are using a cut-off for electrostatics with NVE, for good energy conservation use coulombtype = %s or %s",
 +                    eel_names[eelPMESWITCH], eel_names[eelRF_ZERO]);
 +            warning_note(wi, warn_buf);
 +        }
 +    }
 +
 +    /* IMPLICIT SOLVENT */
 +    if (ir->coulombtype == eelGB_NOTUSED)
 +    {
 +        ir->coulombtype      = eelCUT;
 +        ir->implicit_solvent = eisGBSA;
 +        fprintf(stderr, "Note: Old option for generalized born electrostatics given:\n"
 +                "Changing coulombtype from \"generalized-born\" to \"cut-off\" and instead\n"
 +                "setting implicit-solvent value to \"GBSA\" in input section.\n");
 +    }
 +
 +    if (ir->sa_algorithm == esaSTILL)
 +    {
 +        sprintf(err_buf, "Still SA algorithm not available yet, use %s or %s instead\n", esa_names[esaAPPROX], esa_names[esaNO]);
 +        CHECK(ir->sa_algorithm == esaSTILL);
 +    }
 +
 +    if (ir->implicit_solvent == eisGBSA)
 +    {
 +        sprintf(err_buf, "With GBSA implicit solvent, rgbradii must be equal to rlist.");
 +        CHECK(ir->rgbradii != ir->rlist);
 +
 +        if (ir->coulombtype != eelCUT)
 +        {
 +            sprintf(err_buf, "With GBSA, coulombtype must be equal to %s\n", eel_names[eelCUT]);
 +            CHECK(ir->coulombtype != eelCUT);
 +        }
 +        if (ir->vdwtype != evdwCUT)
 +        {
 +            sprintf(err_buf, "With GBSA, vdw-type must be equal to %s\n", evdw_names[evdwCUT]);
 +            CHECK(ir->vdwtype != evdwCUT);
 +        }
 +        if (ir->nstgbradii < 1)
 +        {
 +            sprintf(warn_buf, "Using GBSA with nstgbradii<1, setting nstgbradii=1");
 +            warning_note(wi, warn_buf);
 +            ir->nstgbradii = 1;
 +        }
 +        if (ir->sa_algorithm == esaNO)
 +        {
 +            sprintf(warn_buf, "No SA (non-polar) calculation requested together with GB. Are you sure this is what you want?\n");
 +            warning_note(wi, warn_buf);
 +        }
 +        if (ir->sa_surface_tension < 0 && ir->sa_algorithm != esaNO)
 +        {
 +            sprintf(warn_buf, "Value of sa_surface_tension is < 0. Changing it to 2.05016 or 2.25936 kJ/nm^2/mol for Still and HCT/OBC respectively\n");
 +            warning_note(wi, warn_buf);
 +
 +            if (ir->gb_algorithm == egbSTILL)
 +            {
 +                ir->sa_surface_tension = 0.0049 * CAL2JOULE * 100;
 +            }
 +            else
 +            {
 +                ir->sa_surface_tension = 0.0054 * CAL2JOULE * 100;
 +            }
 +        }
 +        if (ir->sa_surface_tension == 0 && ir->sa_algorithm != esaNO)
 +        {
 +            sprintf(err_buf, "Surface tension set to 0 while SA-calculation requested\n");
 +            CHECK(ir->sa_surface_tension == 0 && ir->sa_algorithm != esaNO);
 +        }
 +
 +    }
 +
 +    if (ir->bAdress)
 +    {
 +        if (ir->cutoff_scheme != ecutsGROUP)
 +        {
 +            warning_error(wi, "AdresS simulation supports only cutoff-scheme=group");
 +        }
 +        if (!EI_SD(ir->eI))
 +        {
 +            warning_error(wi, "AdresS simulation supports only stochastic dynamics");
 +        }
 +        if (ir->epc != epcNO)
 +        {
 +            warning_error(wi, "AdresS simulation does not support pressure coupling");
 +        }
 +        if (EEL_FULL(ir->coulombtype))
 +        {
 +            warning_error(wi, "AdresS simulation does not support long-range electrostatics");
 +        }
 +    }
 +}
 +
 +/* count the number of text elemets separated by whitespace in a string.
 +    str = the input string
 +    maxptr = the maximum number of allowed elements
 +    ptr = the output array of pointers to the first character of each element
 +    returns: the number of elements. */
 +int str_nelem(const char *str, int maxptr, char *ptr[])
 +{
 +    int   np = 0;
 +    char *copy0, *copy;
 +
 +    copy0 = strdup(str);
 +    copy  = copy0;
 +    ltrim(copy);
 +    while (*copy != '\0')
 +    {
 +        if (np >= maxptr)
 +        {
 +            gmx_fatal(FARGS, "Too many groups on line: '%s' (max is %d)",
 +                      str, maxptr);
 +        }
 +        if (ptr)
 +        {
 +            ptr[np] = copy;
 +        }
 +        np++;
 +        while ((*copy != '\0') && !isspace(*copy))
 +        {
 +            copy++;
 +        }
 +        if (*copy != '\0')
 +        {
 +            *copy = '\0';
 +            copy++;
 +        }
 +        ltrim(copy);
 +    }
 +    if (ptr == NULL)
 +    {
 +        sfree(copy0);
 +    }
 +
 +    return np;
 +}
 +
 +/* interpret a number of doubles from a string and put them in an array,
 +   after allocating space for them.
 +   str = the input string
 +   n = the (pre-allocated) number of doubles read
 +   r = the output array of doubles. */
 +static void parse_n_real(char *str, int *n, real **r)
 +{
 +    char *ptr[MAXPTR];
 +    int   i;
 +
 +    *n = str_nelem(str, MAXPTR, ptr);
 +
 +    snew(*r, *n);
 +    for (i = 0; i < *n; i++)
 +    {
 +        (*r)[i] = strtod(ptr[i], NULL);
 +    }
 +}
 +
 +/* Process the free-energy lambda arrays and the initial expanded-ensemble
 +   weights that were read as strings from the input file.
 +   ir         = input record; ir->fepvals and ir->expandedvals are filled in
 +   fep_lambda = one string of whitespace-separated lambda values per
 +                component type (efptNR entries)
 +   weights    = string with the initial lambda weights
 +   Every component type must list either no values or the same number of
 +   values as the others; a mismatch is a fatal error. */
 +static void do_fep_params(t_inputrec *ir, char fep_lambda[][STRLEN], char weights[STRLEN])
 +{
 +
 +    int         i, j, max_n_lambda, nweights, nfep[efptNR];
 +    t_lambda   *fep    = ir->fepvals;
 +    t_expanded *expand = ir->expandedvals;
 +    real      **count_fep_lambdas;
 +    gmx_bool    bOneLambda = TRUE;
 +
 +    snew(count_fep_lambdas, efptNR);
 +
 +    /* FEP input processing */
 +    /* First, parse the string of each component type: nfep[i] receives the
 +       number of values and count_fep_lambdas[i] the values themselves.
 +       All counts that are nonzero must be the same. */
 +
 +    for (i = 0; i < efptNR; i++)
 +    {
 +        parse_n_real(fep_lambda[i], &(nfep[i]), &(count_fep_lambdas[i]));
 +    }
 +
 +    /* Now determine the number of lambda values: the first nonzero count is
 +       taken as the reference. The loop after this one enforces that every
 +       count is either zero or equal to it. */
 +
 +    max_n_lambda = 0;
 +    for (i = 0; i < efptNR; i++)
 +    {
 +        if (nfep[i] > max_n_lambda)
 +        {
 +            max_n_lambda = nfep[i];  /* here's a nonzero one.  All of them
 +                                        must have the same number if its not zero.*/
 +            break;
 +        }
 +    }
 +
 +    for (i = 0; i < efptNR; i++)
 +    {
 +        if (nfep[i] == 0)
 +        {
 +            ir->fepvals->separate_dvdl[i] = FALSE;
 +        }
 +        else if (nfep[i] == max_n_lambda)
 +        {
 +            if (i != efptTEMPERATURE)  /* we treat this differently -- not really a reason to compute the derivative with
 +                                          respect to the temperature currently */
 +            {
 +                ir->fepvals->separate_dvdl[i] = TRUE;
 +            }
 +        }
 +        else
 +        {
 +            gmx_fatal(FARGS, "Number of lambdas (%d) for FEP type %s not equal to number of other types (%d)",
 +                      nfep[i], efpt_names[i], max_n_lambda);
 +        }
 +    }
 +    /* we don't print out dhdl if the temperature is changing, since we can't correctly define dhdl in this case */
 +    ir->fepvals->separate_dvdl[efptTEMPERATURE] = FALSE;
 +
 +    /* the number of lambdas is the number we've read in, which is either zero
 +       or the same for all */
 +    fep->n_lambda = max_n_lambda;
 +
 +    /* allocate space for the array of lambda values */
 +    snew(fep->all_lambda, efptNR);
 +    /* If a positive init_lambda was given and no lambda arrays at all, flag
 +       the efptFEP component for a separate dvdl term.
 +       NOTE(review): this tests init_lambda > 0, while the fill-in further
 +       down tests init_lambda >= 0; init_lambda == 0 is therefore not
 +       flagged here -- confirm that this is intended. */
 +    if ((fep->init_lambda > 0) && (fep->n_lambda == 0))
 +    {
 +        ir->fepvals->separate_dvdl[efptFEP] = TRUE;
 +    }
 +    /* Allocate n_lambda entries for every component type and copy over the
 +       values that were given; each temporary parse array is freed once it
 +       has been copied. */
 +    for (i = 0; i < efptNR; i++)
 +    {
 +        snew(fep->all_lambda[i], fep->n_lambda);
 +        if (nfep[i] > 0)  /* if it's zero, then the count_fep_lambda arrays
 +                             are zero */
 +        {
 +            for (j = 0; j < fep->n_lambda; j++)
 +            {
 +                fep->all_lambda[i][j] = (double)count_fep_lambdas[i][j];
 +            }
 +            sfree(count_fep_lambdas[i]);
 +        }
 +    }
 +    sfree(count_fep_lambdas);
 +
 +    /* The efptFEP component has either no values or the full number. If it
 +       has none and init_lambda is set (>= 0), fill its column with
 +       init_lambda for internal bookkeeping. */
 +
 +    if ((nfep[efptFEP] == 0) && (fep->init_lambda >= 0))
 +    {
 +        for (i = 0; i < fep->n_lambda; i++)
 +        {
 +            fep->all_lambda[efptFEP][i] = fep->init_lambda;
 +        }
 +    }
 +
 +    /* Check whether no component other than efptFEP has values of its own
 +       (this includes the case of no lambda values at all). If so, and
 +       sc_alpha > 0, turn on coulomb soft core. */
 +
 +    if (max_n_lambda == 0)
 +    {
 +        bOneLambda = TRUE;
 +    }
 +    else
 +    {
 +        for (i = 0; i < efptNR; i++)
 +        {
 +            if ((nfep[i] != 0) && (i != efptFEP))
 +            {
 +                bOneLambda = FALSE;
 +            }
 +        }
 +    }
 +    if ((bOneLambda) && (fep->sc_alpha > 0))
 +    {
 +        fep->bScCoul = TRUE;
 +    }
 +
 +    /* Fill in the others with the efptFEP if they are not explicitly
 +       specified (i.e. nfep[i] == 0).  This means if fep is not defined,
 +       they are all zero. */
 +
 +    for (i = 0; i < efptNR; i++)
 +    {
 +        if ((nfep[i] == 0) && (i != efptFEP))
 +        {
 +            for (j = 0; j < fep->n_lambda; j++)
 +            {
 +                fep->all_lambda[i][j] = fep->all_lambda[efptFEP][j];
 +            }
 +        }
 +    }
 +
 +
 +    /* With sc_r_power = 48 an sc_alpha above 0.1 is rejected as a fatal
 +       error; sc_alpha itself is not rescaled here. */
 +    if (fep->sc_r_power == 48)
 +    {
 +        if (fep->sc_alpha > 0.1)
 +        {
 +            gmx_fatal(FARGS, "sc_alpha (%f) for sc_r_power = 48 should usually be between 0.001 and 0.004", fep->sc_alpha);
 +        }
 +    }
 +
 +    /* (re-assigns the value expand was initialized with above) */
 +    expand = ir->expandedvals;
 +    /* Now read in the weights: there must be either none, in which case a
 +       zeroed array of n_lambda entries is allocated, or exactly n_lambda. */
 +    parse_n_real(weights, &nweights, &(expand->init_lambda_weights));
 +    if (nweights == 0)
 +    {
 +        expand->bInit_weights = FALSE;
 +        snew(expand->init_lambda_weights, fep->n_lambda); /* initialize to zero */
 +    }
 +    else if (nweights != fep->n_lambda)
 +    {
 +        gmx_fatal(FARGS, "Number of weights (%d) is not equal to number of lambda values (%d)",
 +                  nweights, fep->n_lambda);
 +    }
 +    else
 +    {
 +        expand->bInit_weights = TRUE;
 +    }
 +    if ((expand->nstexpanded < 0) && (ir->efep != efepNO))
 +    {
 +        expand->nstexpanded = fep->nstdhdl;
 +        /* if you don't specify nstexpanded when doing expanded ensemble free energy calcs, it is set to nstdhdl */
 +    }
 +    if ((expand->nstexpanded < 0) && ir->bSimTemp)
 +    {
 +        expand->nstexpanded = 2*(int)(ir->opts.tau_t[0]/ir->delta_t);
 +        /* if you don't specify nstexpanded when doing expanded ensemble simulated tempering, it is set to
 +           2*tau_t (in steps) just to be careful so it's not too frequent.
 +           NOTE(review): this reads ir->opts.tau_t[0]; assumes tau_t has
 +           already been set up when this function runs -- confirm. */
 +    }
 +}
 +
 +
 +static void do_simtemp_params(t_inputrec *ir)
 +{
 +
 +    snew(ir->simtempvals->temperatures, ir->fepvals->n_lambda);
 +    GetSimTemps(ir->fepvals->n_lambda, ir->simtempvals, ir->fepvals->all_lambda[efptTEMPERATURE]);
 +
 +    return;
 +}
 +
 +static void do_wall_params(t_inputrec *ir,
 +                           char *wall_atomtype, char *wall_density,
 +                           t_gromppopts *opts)
 +{
 +    int    nstr, i;
 +    char  *names[MAXPTR];
 +    double dbl;
 +
 +    opts->wall_atomtype[0] = NULL;
 +    opts->wall_atomtype[1] = NULL;
 +
 +    ir->wall_atomtype[0] = -1;
 +    ir->wall_atomtype[1] = -1;
 +    ir->wall_density[0]  = 0;
 +    ir->wall_density[1]  = 0;
 +
 +    if (ir->nwall > 0)
 +    {
 +        nstr = str_nelem(wall_atomtype, MAXPTR, names);
 +        if (nstr != ir->nwall)
 +        {
 +            gmx_fatal(FARGS, "Expected %d elements for wall_atomtype, found %d",
 +                      ir->nwall, nstr);
 +        }
 +        for (i = 0; i < ir->nwall; i++)
 +        {
 +            opts->wall_atomtype[i] = strdup(names[i]);
 +        }
 +
 +        if (ir->wall_type == ewt93 || ir->wall_type == ewt104)
 +        {
 +            nstr = str_nelem(wall_density, MAXPTR, names);
 +            if (nstr != ir->nwall)
 +            {
 +                gmx_fatal(FARGS, "Expected %d elements for wall-density, found %d", ir->nwall, nstr);
 +            }
 +            for (i = 0; i < ir->nwall; i++)
 +            {
 +                sscanf(names[i], "%lf", &dbl);
 +                if (dbl <= 0)
 +                {
 +                    gmx_fatal(FARGS, "wall-density[%d] = %f\n", i, dbl);
 +                }
 +                ir->wall_density[i] = dbl;
 +            }
 +        }
 +    }
 +}
 +
 +static void add_wall_energrps(gmx_groups_t *groups, int nwall, t_symtab *symtab)
 +{
 +    int     i;
 +    t_grps *grps;
 +    char    str[STRLEN];
 +
 +    if (nwall > 0)
 +    {
 +        srenew(groups->grpname, groups->ngrpname+nwall);
 +        grps = &(groups->grps[egcENER]);
 +        srenew(grps->nm_ind, grps->nr+nwall);
 +        for (i = 0; i < nwall; i++)
 +        {
 +            sprintf(str, "wall%d", i);
 +            groups->grpname[groups->ngrpname] = put_symtab(symtab, str);
 +            grps->nm_ind[grps->nr++]          = groups->ngrpname++;
 +        }
 +    }
 +}
 +
 +/* Read the expanded-ensemble settings from the parsed input entries.
 +   ninp_p, inp_p = in/out: the number of input entries and the entry array;
 +                   the local copies are written back before returning
 +   expand        = receives the values
 +   wi            = warning handle; not referenced directly here, presumably
 +                   used inside the *TYPE macros -- TODO confirm
 +   The CCTYPE/ITYPE/RTYPE/EETYPE macros are defined outside this function.
 +   From their use here they take the option name and the destination,
 +   followed by what looks like a default value (ITYPE/RTYPE) or a table of
 +   allowed names (EETYPE). They presumably rely on the locals ninp and inp
 +   by name, so those names and the statement order should be left alone. */
 +void read_expandedparams(int *ninp_p, t_inpfile **inp_p,
 +                         t_expanded *expand, warninp_t wi)
 +{
 +    /* NOTE(review): nerror is not referenced directly in this function; it
 +       may be needed by the macros -- confirm before removing it */
 +    int        ninp, nerror = 0;
 +    t_inpfile *inp;
 +
 +    ninp   = *ninp_p;
 +    inp    = *inp_p;
 +
 +    /* read expanded ensemble parameters */
 +    CCTYPE ("expanded ensemble variables");
 +    ITYPE ("nstexpanded", expand->nstexpanded, -1);
 +    EETYPE("lmc-stats", expand->elamstats, elamstats_names);
 +    EETYPE("lmc-move", expand->elmcmove, elmcmove_names);
 +    EETYPE("lmc-weights-equil", expand->elmceq, elmceq_names);
 +    ITYPE ("weight-equil-number-all-lambda", expand->equil_n_at_lam, -1);
 +    ITYPE ("weight-equil-number-samples", expand->equil_samples, -1);
 +    ITYPE ("weight-equil-number-steps", expand->equil_steps, -1);
 +    RTYPE ("weight-equil-wl-delta", expand->equil_wl_delta, -1);
 +    RTYPE ("weight-equil-count-ratio", expand->equil_ratio, -1);
 +    CCTYPE("Seed for Monte Carlo in lambda space");
 +    ITYPE ("lmc-seed", expand->lmc_seed, -1);
 +    RTYPE ("mc-temperature", expand->mc_temp, -1);
 +    ITYPE ("lmc-repeats", expand->lmc_repeats, 1);
 +    ITYPE ("lmc-gibbsdelta", expand->gibbsdeltalam, -1);
 +    ITYPE ("lmc-forced-nstart", expand->lmc_forced_nstart, 0);
 +    EETYPE("symmetrized-transition-matrix", expand->bSymmetrizedTMatrix, yesno_names);
 +    ITYPE("nst-transition-matrix", expand->nstTij, -1);
 +    /* NOTE(review): the option name below is spelled "mininum-var-min";
 +       it looks like a typo for "minimum", but it is the name read from the
 +       input file, so changing it would change which key is accepted */
 +    ITYPE ("mininum-var-min", expand->minvarmin, 100); /*default is reasonable */
 +    ITYPE ("weight-c-range", expand->c_range, 0);      /* default is just C=0 */
 +    RTYPE ("wl-scale", expand->wl_scale, 0.8);
 +    RTYPE ("wl-ratio", expand->wl_ratio, 0.8);
 +    RTYPE ("init-wl-delta", expand->init_wl_delta, 1.0);
 +    EETYPE("wl-oneovert", expand->bWLoneovert, yesno_names);
 +
 +    /* hand the (possibly updated) entry count and array back to the caller */
 +    *ninp_p   = ninp;
 +    *inp_p    = inp;
 +
 +    return;
 +}
 +
 +void get_ir(const char *mdparin, const char *mdparout,
 +            t_inputrec *ir, t_gromppopts *opts,
 +            warninp_t wi)
 +{
 +    char       *dumstr[2];
 +    double      dumdub[2][6];
 +    t_inpfile  *inp;
 +    const char *tmp;
 +    int         i, j, m, ninp;
 +    char        warn_buf[STRLEN];
 +    t_lambda   *fep    = ir->fepvals;
 +    t_expanded *expand = ir->expandedvals;
 +
 +    inp = read_inpfile(mdparin, &ninp, NULL, wi);
 +
 +    snew(dumstr[0], STRLEN);
 +    snew(dumstr[1], STRLEN);
 +
 +    /* remove the following deprecated commands */
 +    REM_TYPE("title");
 +    REM_TYPE("cpp");
 +    REM_TYPE("domain-decomposition");
 +    REM_TYPE("andersen-seed");
 +    REM_TYPE("dihre");
 +    REM_TYPE("dihre-fc");
 +    REM_TYPE("dihre-tau");
 +    REM_TYPE("nstdihreout");
 +    REM_TYPE("nstcheckpoint");
 +
 +    /* replace the following commands with the clearer new versions*/
 +    REPL_TYPE("unconstrained-start", "continuation");
 +    REPL_TYPE("foreign-lambda", "fep-lambdas");
 +
 +    CCTYPE ("VARIOUS PREPROCESSING OPTIONS");
 +    CTYPE ("Preprocessor information: use cpp syntax.");
 +    CTYPE ("e.g.: -I/home/joe/doe -I/home/mary/roe");
 +    STYPE ("include", opts->include,  NULL);
 +    CTYPE ("e.g.: -DPOSRES -DFLEXIBLE (note these variable names are case sensitive)");
 +    STYPE ("define",  opts->define,   NULL);
 +
 +    CCTYPE ("RUN CONTROL PARAMETERS");
 +    EETYPE("integrator",  ir->eI,         ei_names);
 +    CTYPE ("Start time and timestep in ps");
 +    RTYPE ("tinit",   ir->init_t, 0.0);
 +    RTYPE ("dt",      ir->delta_t,    0.001);
 +    STEPTYPE ("nsteps",   ir->nsteps,     0);
 +    CTYPE ("For exact run continuation or redoing part of a run");
 +    STEPTYPE ("init-step", ir->init_step,  0);
 +    CTYPE ("Part index is updated automatically on checkpointing (keeps files separate)");
 +    ITYPE ("simulation-part", ir->simulation_part, 1);
 +    CTYPE ("mode for center of mass motion removal");
 +    EETYPE("comm-mode",   ir->comm_mode,  ecm_names);
 +    CTYPE ("number of steps for center of mass motion removal");
 +    ITYPE ("nstcomm", ir->nstcomm,    100);
 +    CTYPE ("group(s) for center of mass motion removal");
 +    STYPE ("comm-grps",   vcm,            NULL);
 +
 +    CCTYPE ("LANGEVIN DYNAMICS OPTIONS");
 +    CTYPE ("Friction coefficient (amu/ps) and random seed");
 +    RTYPE ("bd-fric",     ir->bd_fric,    0.0);
 +    ITYPE ("ld-seed",     ir->ld_seed,    1993);
 +
 +    /* Em stuff */
 +    CCTYPE ("ENERGY MINIMIZATION OPTIONS");
 +    CTYPE ("Force tolerance and initial step-size");
 +    RTYPE ("emtol",       ir->em_tol,     10.0);
 +    RTYPE ("emstep",      ir->em_stepsize, 0.01);
 +    CTYPE ("Max number of iterations in relax-shells");
 +    ITYPE ("niter",       ir->niter,      20);
 +    CTYPE ("Step size (ps^2) for minimization of flexible constraints");
 +    RTYPE ("fcstep",      ir->fc_stepsize, 0);
 +    CTYPE ("Frequency of steepest descents steps when doing CG");
 +    ITYPE ("nstcgsteep",  ir->nstcgsteep, 1000);
 +    ITYPE ("nbfgscorr",   ir->nbfgscorr,  10);
 +
 +    CCTYPE ("TEST PARTICLE INSERTION OPTIONS");
 +    RTYPE ("rtpi",    ir->rtpi,   0.05);
 +
 +    /* Output options */
 +    CCTYPE ("OUTPUT CONTROL OPTIONS");
 +    CTYPE ("Output frequency for coords (x), velocities (v) and forces (f)");
 +    ITYPE ("nstxout", ir->nstxout,    0);
 +    ITYPE ("nstvout", ir->nstvout,    0);
 +    ITYPE ("nstfout", ir->nstfout,    0);
 +    ir->nstcheckpoint = 1000;
 +    CTYPE ("Output frequency for energies to log file and energy file");
 +    ITYPE ("nstlog",  ir->nstlog, 1000);
 +    ITYPE ("nstcalcenergy", ir->nstcalcenergy, 100);
 +    ITYPE ("nstenergy",   ir->nstenergy,  1000);
 +    CTYPE ("Output frequency and precision for .xtc file");
 +    ITYPE ("nstxtcout",   ir->nstxtcout,  0);
 +    RTYPE ("xtc-precision", ir->xtcprec,   1000.0);
 +    CTYPE ("This selects the subset of atoms for the .xtc file. You can");
 +    CTYPE ("select multiple groups. By default all atoms will be written.");
 +    STYPE ("xtc-grps",    xtc_grps,       NULL);
 +    CTYPE ("Selection of energy groups");
 +    STYPE ("energygrps",  energy,         NULL);
 +
 +    /* Neighbor searching */
 +    CCTYPE ("NEIGHBORSEARCHING PARAMETERS");
 +    CTYPE ("cut-off scheme (group: using charge groups, Verlet: particle based cut-offs)");
 +    EETYPE("cutoff-scheme",     ir->cutoff_scheme,    ecutscheme_names);
 +    CTYPE ("nblist update frequency");
 +    ITYPE ("nstlist", ir->nstlist,    10);
 +    CTYPE ("ns algorithm (simple or grid)");
 +    EETYPE("ns-type",     ir->ns_type,    ens_names);
 +    /* set ndelta to the optimal value of 2 */
 +    ir->ndelta = 2;
 +    CTYPE ("Periodic boundary conditions: xyz, no, xy");
 +    EETYPE("pbc",         ir->ePBC,       epbc_names);
 +    EETYPE("periodic-molecules", ir->bPeriodicMols, yesno_names);
 +    CTYPE ("Allowed energy drift due to the Verlet buffer in kJ/mol/ps per atom,");
 +    CTYPE ("a value of -1 means: use rlist");
 +    RTYPE("verlet-buffer-drift", ir->verletbuf_drift,    0.005);
 +    CTYPE ("nblist cut-off");
 +    RTYPE ("rlist",   ir->rlist,  1.0);
 +    CTYPE ("long-range cut-off for switched potentials");
 +    RTYPE ("rlistlong",   ir->rlistlong,  -1);
 +    ITYPE ("nstcalclr",   ir->nstcalclr,  -1);
 +
 +    /* Electrostatics */
 +    CCTYPE ("OPTIONS FOR ELECTROSTATICS AND VDW");
 +    CTYPE ("Method for doing electrostatics");
 +    EETYPE("coulombtype", ir->coulombtype,    eel_names);
 +    EETYPE("coulomb-modifier",    ir->coulomb_modifier,    eintmod_names);
 +    CTYPE ("cut-off lengths");
 +    RTYPE ("rcoulomb-switch", ir->rcoulomb_switch,    0.0);
 +    RTYPE ("rcoulomb",    ir->rcoulomb,   1.0);
 +    CTYPE ("Relative dielectric constant for the medium and the reaction field");
 +    RTYPE ("epsilon-r",   ir->epsilon_r,  1.0);
 +    RTYPE ("epsilon-rf",  ir->epsilon_rf, 0.0);
 +    CTYPE ("Method for doing Van der Waals");
 +    EETYPE("vdw-type",    ir->vdwtype,    evdw_names);
 +    EETYPE("vdw-modifier",    ir->vdw_modifier,    eintmod_names);
 +    CTYPE ("cut-off lengths");
 +    RTYPE ("rvdw-switch", ir->rvdw_switch,    0.0);
 +    RTYPE ("rvdw",    ir->rvdw,   1.0);
 +    CTYPE ("Apply long range dispersion corrections for Energy and Pressure");
 +    EETYPE("DispCorr",    ir->eDispCorr,  edispc_names);
 +    CTYPE ("Extension of the potential lookup tables beyond the cut-off");
 +    RTYPE ("table-extension", ir->tabext, 1.0);
 +    CTYPE ("Separate tables between energy group pairs");
 +    STYPE ("energygrp-table", egptable,   NULL);
 +    CTYPE ("Spacing for the PME/PPPM FFT grid");
 +    RTYPE ("fourierspacing", ir->fourier_spacing, 0.12);
 +    CTYPE ("FFT grid size, when a value is 0 fourierspacing will be used");
 +    ITYPE ("fourier-nx",  ir->nkx,         0);
 +    ITYPE ("fourier-ny",  ir->nky,         0);
 +    ITYPE ("fourier-nz",  ir->nkz,         0);
 +    CTYPE ("EWALD/PME/PPPM parameters");
 +    ITYPE ("pme-order",   ir->pme_order,   4);
 +    RTYPE ("ewald-rtol",  ir->ewald_rtol, 0.00001);
 +    EETYPE("ewald-geometry", ir->ewald_geometry, eewg_names);
 +    RTYPE ("epsilon-surface", ir->epsilon_surface, 0.0);
 +    EETYPE("optimize-fft", ir->bOptFFT,  yesno_names);
 +
 +    CCTYPE("IMPLICIT SOLVENT ALGORITHM");
 +    EETYPE("implicit-solvent", ir->implicit_solvent, eis_names);
 +
 +    CCTYPE ("GENERALIZED BORN ELECTROSTATICS");
 +    CTYPE ("Algorithm for calculating Born radii");
 +    EETYPE("gb-algorithm", ir->gb_algorithm, egb_names);
 +    CTYPE ("Frequency of calculating the Born radii inside rlist");
 +    ITYPE ("nstgbradii", ir->nstgbradii, 1);
 +    CTYPE ("Cutoff for Born radii calculation; the contribution from atoms");
 +    CTYPE ("between rlist and rgbradii is updated every nstlist steps");
 +    RTYPE ("rgbradii",  ir->rgbradii, 1.0);
 +    CTYPE ("Dielectric coefficient of the implicit solvent");
 +    RTYPE ("gb-epsilon-solvent", ir->gb_epsilon_solvent, 80.0);
 +    CTYPE ("Salt concentration in M for Generalized Born models");
 +    RTYPE ("gb-saltconc",  ir->gb_saltconc, 0.0);
 +    CTYPE ("Scaling factors used in the OBC GB model. Default values are OBC(II)");
 +    RTYPE ("gb-obc-alpha", ir->gb_obc_alpha, 1.0);
 +    RTYPE ("gb-obc-beta", ir->gb_obc_beta, 0.8);
 +    RTYPE ("gb-obc-gamma", ir->gb_obc_gamma, 4.85);
 +    RTYPE ("gb-dielectric-offset", ir->gb_dielectric_offset, 0.009);
 +    EETYPE("sa-algorithm", ir->sa_algorithm, esa_names);
 +    CTYPE ("Surface tension (kJ/mol/nm^2) for the SA (nonpolar surface) part of GBSA");
 +    CTYPE ("The value -1 will set default value for Still/HCT/OBC GB-models.");
 +    RTYPE ("sa-surface-tension", ir->sa_surface_tension, -1);
 +
 +    /* Coupling stuff */
 +    CCTYPE ("OPTIONS FOR WEAK COUPLING ALGORITHMS");
 +    CTYPE ("Temperature coupling");
 +    EETYPE("tcoupl",  ir->etc,        etcoupl_names);
 +    ITYPE ("nsttcouple", ir->nsttcouple,  -1);
 +    ITYPE("nh-chain-length",     ir->opts.nhchainlength, NHCHAINLENGTH);
 +    EETYPE("print-nose-hoover-chain-variables", ir->bPrintNHChains, yesno_names);
 +    CTYPE ("Groups to couple separately");
 +    STYPE ("tc-grps",     tcgrps,         NULL);
 +    CTYPE ("Time constant (ps) and reference temperature (K)");
 +    STYPE ("tau-t",   tau_t,      NULL);
 +    STYPE ("ref-t",   ref_t,      NULL);
 +    CTYPE ("pressure coupling");
 +    EETYPE("pcoupl",  ir->epc,        epcoupl_names);
 +    EETYPE("pcoupltype",  ir->epct,       epcoupltype_names);
 +    ITYPE ("nstpcouple", ir->nstpcouple,  -1);
 +    CTYPE ("Time constant (ps), compressibility (1/bar) and reference P (bar)");
 +    RTYPE ("tau-p",   ir->tau_p,  1.0);
 +    STYPE ("compressibility", dumstr[0],  NULL);
 +    STYPE ("ref-p",       dumstr[1],      NULL);
 +    CTYPE ("Scaling of reference coordinates, No, All or COM");
 +    EETYPE ("refcoord-scaling", ir->refcoord_scaling, erefscaling_names);
 +
 +    /* QMMM */
 +    CCTYPE ("OPTIONS FOR QMMM calculations");
 +    EETYPE("QMMM", ir->bQMMM, yesno_names);
 +    CTYPE ("Groups treated Quantum Mechanically");
 +    STYPE ("QMMM-grps",  QMMM,          NULL);
 +    CTYPE ("QM method");
 +    STYPE("QMmethod",     QMmethod, NULL);
 +    CTYPE ("QMMM scheme");
 +    EETYPE("QMMMscheme",  ir->QMMMscheme,    eQMMMscheme_names);
 +    CTYPE ("QM basisset");
 +    STYPE("QMbasis",      QMbasis, NULL);
 +    CTYPE ("QM charge");
 +    STYPE ("QMcharge",    QMcharge, NULL);
 +    CTYPE ("QM multiplicity");
 +    STYPE ("QMmult",      QMmult, NULL);
 +    CTYPE ("Surface Hopping");
 +    STYPE ("SH",          bSH, NULL);
 +    CTYPE ("CAS space options");
 +    STYPE ("CASorbitals",      CASorbitals,   NULL);
 +    STYPE ("CASelectrons",     CASelectrons,  NULL);
 +    STYPE ("SAon", SAon, NULL);
 +    STYPE ("SAoff", SAoff, NULL);
 +    STYPE ("SAsteps",  SAsteps, NULL);
 +    CTYPE ("Scale factor for MM charges");
 +    RTYPE ("MMChargeScaleFactor", ir->scalefactor, 1.0);
 +    CTYPE ("Optimization of QM subsystem");
 +    STYPE ("bOPT",          bOPT, NULL);
 +    STYPE ("bTS",          bTS, NULL);
 +
 +    /* Simulated annealing */
 +    CCTYPE("SIMULATED ANNEALING");
 +    CTYPE ("Type of annealing for each temperature group (no/single/periodic)");
 +    STYPE ("annealing",   anneal,      NULL);
 +    CTYPE ("Number of time points to use for specifying annealing in each group");
 +    STYPE ("annealing-npoints", anneal_npoints, NULL);
 +    CTYPE ("List of times at the annealing points for each group");
 +    STYPE ("annealing-time",       anneal_time,       NULL);
 +    CTYPE ("Temp. at each annealing point, for each group.");
 +    STYPE ("annealing-temp",  anneal_temp,  NULL);
 +
 +    /* Startup run */
 +    CCTYPE ("GENERATE VELOCITIES FOR STARTUP RUN");
 +    EETYPE("gen-vel",     opts->bGenVel,  yesno_names);
 +    RTYPE ("gen-temp",    opts->tempi,    300.0);
 +    ITYPE ("gen-seed",    opts->seed,     173529);
 +
 +    /* Shake stuff */
 +    CCTYPE ("OPTIONS FOR BONDS");
 +    EETYPE("constraints", opts->nshake,   constraints);
 +    CTYPE ("Type of constraint algorithm");
 +    EETYPE("constraint-algorithm",  ir->eConstrAlg, econstr_names);
 +    CTYPE ("Do not constrain the start configuration");
 +    EETYPE("continuation", ir->bContinuation, yesno_names);
 +    CTYPE ("Use successive overrelaxation to reduce the number of shake iterations");
 +    EETYPE("Shake-SOR", ir->bShakeSOR, yesno_names);
 +    CTYPE ("Relative tolerance of shake");
 +    RTYPE ("shake-tol", ir->shake_tol, 0.0001);
 +    CTYPE ("Highest order in the expansion of the constraint coupling matrix");
 +    ITYPE ("lincs-order", ir->nProjOrder, 4);
 +    CTYPE ("Number of iterations in the final step of LINCS. 1 is fine for");
 +    CTYPE ("normal simulations, but use 2 to conserve energy in NVE runs.");
 +    CTYPE ("For energy minimization with constraints it should be 4 to 8.");
 +    ITYPE ("lincs-iter", ir->nLincsIter, 1);
 +    CTYPE ("Lincs will write a warning to the stderr if in one step a bond");
 +    CTYPE ("rotates over more degrees than");
 +    RTYPE ("lincs-warnangle", ir->LincsWarnAngle, 30.0);
 +    CTYPE ("Convert harmonic bonds to morse potentials");
 +    EETYPE("morse",       opts->bMorse, yesno_names);
 +
 +    /* Energy group exclusions */
 +    CCTYPE ("ENERGY GROUP EXCLUSIONS");
 +    CTYPE ("Pairs of energy groups for which all non-bonded interactions are excluded");
 +    STYPE ("energygrp-excl", egpexcl,     NULL);
 +
 +    /* Walls */
 +    CCTYPE ("WALLS");
 +    CTYPE ("Number of walls, type, atom types, densities and box-z scale factor for Ewald");
 +    ITYPE ("nwall", ir->nwall, 0);
 +    EETYPE("wall-type",     ir->wall_type,   ewt_names);
 +    RTYPE ("wall-r-linpot", ir->wall_r_linpot, -1);
 +    STYPE ("wall-atomtype", wall_atomtype, NULL);
 +    STYPE ("wall-density",  wall_density,  NULL);
 +    RTYPE ("wall-ewald-zfac", ir->wall_ewald_zfac, 3);
 +
 +    /* COM pulling */
 +    CCTYPE("COM PULLING");
 +    CTYPE("Pull type: no, umbrella, constraint or constant-force");
 +    EETYPE("pull",          ir->ePull, epull_names);
 +    if (ir->ePull != epullNO)
 +    {
 +        snew(ir->pull, 1);
 +        pull_grp = read_pullparams(&ninp, &inp, ir->pull, &opts->pull_start, wi);
 +    }
 +
 +    /* Enforced rotation */
 +    CCTYPE("ENFORCED ROTATION");
 +    CTYPE("Enforced rotation: No or Yes");
 +    EETYPE("rotation",       ir->bRot, yesno_names);
 +    if (ir->bRot)
 +    {
 +        snew(ir->rot, 1);
 +        rot_grp = read_rotparams(&ninp, &inp, ir->rot, wi);
 +    }
 +
 +    /* Refinement */
 +    CCTYPE("NMR refinement stuff");
 +    CTYPE ("Distance restraints type: No, Simple or Ensemble");
 +    EETYPE("disre",       ir->eDisre,     edisre_names);
 +    CTYPE ("Force weighting of pairs in one distance restraint: Conservative or Equal");
 +    EETYPE("disre-weighting", ir->eDisreWeighting, edisreweighting_names);
 +    CTYPE ("Use sqrt of the time averaged times the instantaneous violation");
 +    EETYPE("disre-mixed", ir->bDisreMixed, yesno_names);
 +    RTYPE ("disre-fc",    ir->dr_fc,  1000.0);
 +    RTYPE ("disre-tau",   ir->dr_tau, 0.0);
 +    CTYPE ("Output frequency for pair distances to energy file");
 +    ITYPE ("nstdisreout", ir->nstdisreout, 100);
 +    CTYPE ("Orientation restraints: No or Yes");
 +    EETYPE("orire",       opts->bOrire,   yesno_names);
 +    CTYPE ("Orientation restraints force constant and tau for time averaging");
 +    RTYPE ("orire-fc",    ir->orires_fc,  0.0);
 +    RTYPE ("orire-tau",   ir->orires_tau, 0.0);
 +    STYPE ("orire-fitgrp", orirefitgrp,    NULL);
 +    CTYPE ("Output frequency for trace(SD) and S to energy file");
 +    ITYPE ("nstorireout", ir->nstorireout, 100);
 +
 +    /* free energy variables */
 +    CCTYPE ("Free energy variables");
 +    EETYPE("free-energy", ir->efep, efep_names);
 +    STYPE ("couple-moltype",  couple_moltype,  NULL);
 +    EETYPE("couple-lambda0", opts->couple_lam0, couple_lam);
 +    EETYPE("couple-lambda1", opts->couple_lam1, couple_lam);
 +    EETYPE("couple-intramol", opts->bCoupleIntra, yesno_names);
 +
 +    RTYPE ("init-lambda", fep->init_lambda, -1); /* start with -1 so
 +                                                    we can recognize if
 +                                                    it was not entered */
 +    ITYPE ("init-lambda-state", fep->init_fep_state, -1);
 +    RTYPE ("delta-lambda", fep->delta_lambda, 0.0);
 +    ITYPE ("nstdhdl", fep->nstdhdl, 50);
 +    STYPE ("fep-lambdas", fep_lambda[efptFEP], NULL);
 +    STYPE ("mass-lambdas", fep_lambda[efptMASS], NULL);
 +    STYPE ("coul-lambdas", fep_lambda[efptCOUL], NULL);
 +    STYPE ("vdw-lambdas", fep_lambda[efptVDW], NULL);
 +    STYPE ("bonded-lambdas", fep_lambda[efptBONDED], NULL);
 +    STYPE ("restraint-lambdas", fep_lambda[efptRESTRAINT], NULL);
 +    STYPE ("temperature-lambdas", fep_lambda[efptTEMPERATURE], NULL);
 +    ITYPE ("calc-lambda-neighbors", fep->lambda_neighbors, 1);
 +    STYPE ("init-lambda-weights", lambda_weights, NULL);
 +    EETYPE("dhdl-print-energy", fep->bPrintEnergy, yesno_names);
 +    RTYPE ("sc-alpha", fep->sc_alpha, 0.0);
 +    ITYPE ("sc-power", fep->sc_power, 1);
 +    RTYPE ("sc-r-power", fep->sc_r_power, 6.0);
 +    RTYPE ("sc-sigma", fep->sc_sigma, 0.3);
 +    EETYPE("sc-coul", fep->bScCoul, yesno_names);
 +    ITYPE ("dh_hist_size", fep->dh_hist_size, 0);
 +    RTYPE ("dh_hist_spacing", fep->dh_hist_spacing, 0.1);
 +    EETYPE("separate-dhdl-file", fep->separate_dhdl_file,
 +           separate_dhdl_file_names);
 +    EETYPE("dhdl-derivatives", fep->dhdl_derivatives, dhdl_derivatives_names);
 +    ITYPE ("dh_hist_size", fep->dh_hist_size, 0);
 +    RTYPE ("dh_hist_spacing", fep->dh_hist_spacing, 0.1);
 +
 +    /* Non-equilibrium MD stuff */
 +    CCTYPE("Non-equilibrium MD stuff");
 +    STYPE ("acc-grps",    accgrps,        NULL);
 +    STYPE ("accelerate",  acc,            NULL);
 +    STYPE ("freezegrps",  freeze,         NULL);
 +    STYPE ("freezedim",   frdim,          NULL);
 +    RTYPE ("cos-acceleration", ir->cos_accel, 0);
 +    STYPE ("deform",      deform,         NULL);
 +
 +    /* simulated tempering variables */
 +    CCTYPE("simulated tempering variables");
 +    EETYPE("simulated-tempering", ir->bSimTemp, yesno_names);
 +    EETYPE("simulated-tempering-scaling", ir->simtempvals->eSimTempScale, esimtemp_names);
 +    RTYPE("sim-temp-low", ir->simtempvals->simtemp_low, 300.0);
 +    RTYPE("sim-temp-high", ir->simtempvals->simtemp_high, 300.0);
 +
 +    /* expanded ensemble variables */
 +    if (ir->efep == efepEXPANDED || ir->bSimTemp)
 +    {
 +        read_expandedparams(&ninp, &inp, expand, wi);
 +    }
 +
 +    /* Electric fields */
 +    CCTYPE("Electric fields");
 +    CTYPE ("Format is number of terms (int) and for all terms an amplitude (real)");
 +    CTYPE ("and a phase angle (real)");
 +    STYPE ("E-x",     efield_x,   NULL);
 +    STYPE ("E-xt",    efield_xt,  NULL);
 +    STYPE ("E-y",     efield_y,   NULL);
 +    STYPE ("E-yt",    efield_yt,  NULL);
 +    STYPE ("E-z",     efield_z,   NULL);
 +    STYPE ("E-zt",    efield_zt,  NULL);
 +
 +    /* AdResS defined thingies */
 +    CCTYPE ("AdResS parameters");
 +    EETYPE("adress",       ir->bAdress, yesno_names);
 +    if (ir->bAdress)
 +    {
 +        snew(ir->adress, 1);
 +        read_adressparams(&ninp, &inp, ir->adress, wi);
 +    }
 +
 +    /* User defined thingies */
 +    CCTYPE ("User defined thingies");
 +    STYPE ("user1-grps",  user1,          NULL);
 +    STYPE ("user2-grps",  user2,          NULL);
 +    ITYPE ("userint1",    ir->userint1,   0);
 +    ITYPE ("userint2",    ir->userint2,   0);
 +    ITYPE ("userint3",    ir->userint3,   0);
 +    ITYPE ("userint4",    ir->userint4,   0);
 +    RTYPE ("userreal1",   ir->userreal1,  0);
 +    RTYPE ("userreal2",   ir->userreal2,  0);
 +    RTYPE ("userreal3",   ir->userreal3,  0);
 +    RTYPE ("userreal4",   ir->userreal4,  0);
 +#undef CTYPE
 +
 +    write_inpfile(mdparout, ninp, inp, FALSE, wi);
 +    for (i = 0; (i < ninp); i++)
 +    {
 +        sfree(inp[i].name);
 +        sfree(inp[i].value);
 +    }
 +    sfree(inp);
 +
 +    /* Process options if necessary */
 +    for (m = 0; m < 2; m++)
 +    {
 +        for (i = 0; i < 2*DIM; i++)
 +        {
 +            dumdub[m][i] = 0.0;
 +        }
 +        if (ir->epc)
 +        {
 +            switch (ir->epct)
 +            {
 +                case epctISOTROPIC:
 +                    if (sscanf(dumstr[m], "%lf", &(dumdub[m][XX])) != 1)
 +                    {
 +                        warning_error(wi, "Pressure coupling not enough values (I need 1)");
 +                    }
 +                    dumdub[m][YY] = dumdub[m][ZZ] = dumdub[m][XX];
 +                    break;
 +                case epctSEMIISOTROPIC:
 +                case epctSURFACETENSION:
 +                    if (sscanf(dumstr[m], "%lf%lf",
 +                               &(dumdub[m][XX]), &(dumdub[m][ZZ])) != 2)
 +                    {
 +                        warning_error(wi, "Pressure coupling not enough values (I need 2)");
 +                    }
 +                    dumdub[m][YY] = dumdub[m][XX];
 +                    break;
 +                case epctANISOTROPIC:
 +                    if (sscanf(dumstr[m], "%lf%lf%lf%lf%lf%lf",
 +                               &(dumdub[m][XX]), &(dumdub[m][YY]), &(dumdub[m][ZZ]),
 +                               &(dumdub[m][3]), &(dumdub[m][4]), &(dumdub[m][5])) != 6)
 +                    {
 +                        warning_error(wi, "Pressure coupling not enough values (I need 6)");
 +                    }
 +                    break;
 +                default:
 +                    gmx_fatal(FARGS, "Pressure coupling type %s not implemented yet",
 +                              epcoupltype_names[ir->epct]);
 +            }
 +        }
 +    }
 +    clear_mat(ir->ref_p);
 +    clear_mat(ir->compress);
 +    for (i = 0; i < DIM; i++)
 +    {
 +        ir->ref_p[i][i]    = dumdub[1][i];
 +        ir->compress[i][i] = dumdub[0][i];
 +    }
 +    if (ir->epct == epctANISOTROPIC)
 +    {
 +        ir->ref_p[XX][YY] = dumdub[1][3];
 +        ir->ref_p[XX][ZZ] = dumdub[1][4];
 +        ir->ref_p[YY][ZZ] = dumdub[1][5];
 +        if (ir->ref_p[XX][YY] != 0 && ir->ref_p[XX][ZZ] != 0 && ir->ref_p[YY][ZZ] != 0)
 +        {
 +            warning(wi, "All off-diagonal reference pressures are non-zero. Are you sure you want to apply a threefold shear stress?\n");
 +        }
 +        ir->compress[XX][YY] = dumdub[0][3];
 +        ir->compress[XX][ZZ] = dumdub[0][4];
 +        ir->compress[YY][ZZ] = dumdub[0][5];
 +        for (i = 0; i < DIM; i++)
 +        {
 +            for (m = 0; m < i; m++)
 +            {
 +                ir->ref_p[i][m]    = ir->ref_p[m][i];
 +                ir->compress[i][m] = ir->compress[m][i];
 +            }
 +        }
 +    }
 +
 +    if (ir->comm_mode == ecmNO)
 +    {
 +        ir->nstcomm = 0;
 +    }
 +
 +    opts->couple_moltype = NULL;
 +    if (strlen(couple_moltype) > 0)
 +    {
 +        if (ir->efep != efepNO)
 +        {
 +            opts->couple_moltype = strdup(couple_moltype);
 +            if (opts->couple_lam0 == opts->couple_lam1)
 +            {
 +                warning(wi, "The lambda=0 and lambda=1 states for coupling are identical");
 +            }
 +            if (ir->eI == eiMD && (opts->couple_lam0 == ecouplamNONE ||
 +                                   opts->couple_lam1 == ecouplamNONE))
 +            {
 +                warning(wi, "For proper sampling of the (nearly) decoupled state, stochastic dynamics should be used");
 +            }
 +        }
 +        else
 +        {
 +            warning(wi, "Can not couple a molecule with free_energy = no");
 +        }
 +    }
 +    /* FREE ENERGY AND EXPANDED ENSEMBLE OPTIONS */
 +    if (ir->efep != efepNO)
 +    {
 +        if (fep->delta_lambda > 0)
 +        {
 +            ir->efep = efepSLOWGROWTH;
 +        }
 +    }
 +
 +    if (ir->bSimTemp)
 +    {
 +        fep->bPrintEnergy = TRUE;
 +        /* always print out the energy to dhdl if we are doing expanded ensemble, since we need the total energy
 +           if the temperature is changing. */
 +    }
 +
 +    if ((ir->efep != efepNO) || ir->bSimTemp)
 +    {
 +        ir->bExpanded = FALSE;
 +        if ((ir->efep == efepEXPANDED) || ir->bSimTemp)
 +        {
 +            ir->bExpanded = TRUE;
 +        }
 +        do_fep_params(ir, fep_lambda, lambda_weights);
 +        if (ir->bSimTemp) /* done after fep params */
 +        {
 +            do_simtemp_params(ir);
 +        }
 +    }
 +    else
 +    {
 +        ir->fepvals->n_lambda = 0;
 +    }
 +
 +    /* WALL PARAMETERS */
 +
 +    do_wall_params(ir, wall_atomtype, wall_density, opts);
 +
 +    /* ORIENTATION RESTRAINT PARAMETERS */
 +
 +    if (opts->bOrire && str_nelem(orirefitgrp, MAXPTR, NULL) != 1)
 +    {
 +        warning_error(wi, "ERROR: Need one orientation restraint fit group\n");
 +    }
 +
 +    /* DEFORMATION PARAMETERS */
 +
 +    clear_mat(ir->deform);
 +    for (i = 0; i < 6; i++)
 +    {
 +        dumdub[0][i] = 0;
 +    }
 +    m = sscanf(deform, "%lf %lf %lf %lf %lf %lf",
 +               &(dumdub[0][0]), &(dumdub[0][1]), &(dumdub[0][2]),
 +               &(dumdub[0][3]), &(dumdub[0][4]), &(dumdub[0][5]));
 +    for (i = 0; i < 3; i++)
 +    {
 +        ir->deform[i][i] = dumdub[0][i];
 +    }
 +    ir->deform[YY][XX] = dumdub[0][3];
 +    ir->deform[ZZ][XX] = dumdub[0][4];
 +    ir->deform[ZZ][YY] = dumdub[0][5];
 +    if (ir->epc != epcNO)
 +    {
 +        for (i = 0; i < 3; i++)
 +        {
 +            for (j = 0; j <= i; j++)
 +            {
 +                if (ir->deform[i][j] != 0 && ir->compress[i][j] != 0)
 +                {
 +                    warning_error(wi, "A box element has deform set and compressibility > 0");
 +                }
 +            }
 +        }
 +        for (i = 0; i < 3; i++)
 +        {
 +            for (j = 0; j < i; j++)
 +            {
 +                if (ir->deform[i][j] != 0)
 +                {
 +                    for (m = j; m < DIM; m++)
 +                    {
 +                        if (ir->compress[m][j] != 0)
 +                        {
 +                            sprintf(warn_buf, "An off-diagonal box element has deform set while compressibility > 0 for the same component of another box vector, this might lead to spurious periodicity effects.");
 +                            warning(wi, warn_buf);
 +                        }
 +                    }
 +                }
 +            }
 +        }
 +    }
 +
 +    sfree(dumstr[0]);
 +    sfree(dumstr[1]);
 +}
 +
 +static int search_QMstring(char *s, int ng, const char *gn[])
 +{
 +    /* same as normal search_string, but this one searches QM strings */
 +    int i;
 +
 +    for (i = 0; (i < ng); i++)
 +    {
 +        if (gmx_strcasecmp(s, gn[i]) == 0)
 +        {
 +            return i;
 +        }
 +    }
 +
 +    gmx_fatal(FARGS, "this QM method or basisset (%s) is not implemented\n!", s);
 +
 +    return -1;
 +
 +} /* search_QMstring */
 +
 +
 +int search_string(char *s, int ng, char *gn[])
 +{
 +    int i;
 +
 +    for (i = 0; (i < ng); i++)
 +    {
 +        if (gmx_strcasecmp(s, gn[i]) == 0)
 +        {
 +            return i;
 +        }
 +    }
 +
 +    gmx_fatal(FARGS,
 +              "Group %s referenced in the .mdp file was not found in the index file.\n"
 +              "Group names must match either [moleculetype] names or custom index group\n"
 +              "names, in which case you must supply an index file to the '-n' option\n"
 +              "of grompp.",
 +              s);
 +
 +    return -1;
 +}
 +
 +static gmx_bool do_numbering(int natoms, gmx_groups_t *groups, int ng, char *ptrs[],
 +                             t_blocka *block, char *gnames[],
 +                             int gtype, int restnm,
 +                             int grptp, gmx_bool bVerbose,
 +                             warninp_t wi)
 +{
 +    unsigned short *cbuf;
 +    t_grps         *grps = &(groups->grps[gtype]);
 +    int             i, j, gid, aj, ognr, ntot = 0;
 +    const char     *title;
 +    gmx_bool        bRest;
 +    char            warn_buf[STRLEN];
 +
 +    if (debug)
 +    {
 +        fprintf(debug, "Starting numbering %d groups of type %d\n", ng, gtype);
 +    }
 +
 +    title = gtypes[gtype];
 +
 +    snew(cbuf, natoms);
 +    /* Mark all id's as not set */
 +    for (i = 0; (i < natoms); i++)
 +    {
 +        cbuf[i] = NOGID;
 +    }
 +
 +    snew(grps->nm_ind, ng+1); /* +1 for possible rest group */
 +    for (i = 0; (i < ng); i++)
 +    {
 +        /* Lookup the group name in the block structure */
 +        gid = search_string(ptrs[i], block->nr, gnames);
 +        if ((grptp != egrptpONE) || (i == 0))
 +        {
 +            grps->nm_ind[grps->nr++] = gid;
 +        }
 +        if (debug)
 +        {
 +            fprintf(debug, "Found gid %d for group %s\n", gid, ptrs[i]);
 +        }
 +
 +        /* Now go over the atoms in the group */
 +        for (j = block->index[gid]; (j < block->index[gid+1]); j++)
 +        {
 +
 +            aj = block->a[j];
 +
 +            /* Range checking */
 +            if ((aj < 0) || (aj >= natoms))
 +            {
 +                gmx_fatal(FARGS, "Invalid atom number %d in indexfile", aj);
 +            }
 +            /* Lookup up the old group number */
 +            ognr = cbuf[aj];
 +            if (ognr != NOGID)
 +            {
 +                gmx_fatal(FARGS, "Atom %d in multiple %s groups (%d and %d)",
 +                          aj+1, title, ognr+1, i+1);
 +            }
 +            else
 +            {
 +                /* Store the group number in buffer */
 +                if (grptp == egrptpONE)
 +                {
 +                    cbuf[aj] = 0;
 +                }
 +                else
 +                {
 +                    cbuf[aj] = i;
 +                }
 +                ntot++;
 +            }
 +        }
 +    }
 +
 +    /* Now check whether we have done all atoms */
 +    bRest = FALSE;
 +    if (ntot != natoms)
 +    {
 +        if (grptp == egrptpALL)
 +        {
 +            gmx_fatal(FARGS, "%d atoms are not part of any of the %s groups",
 +                      natoms-ntot, title);
 +        }
 +        else if (grptp == egrptpPART)
 +        {
 +            sprintf(warn_buf, "%d atoms are not part of any of the %s groups",
 +                    natoms-ntot, title);
 +            warning_note(wi, warn_buf);
 +        }
 +        /* Assign all atoms currently unassigned to a rest group */
 +        for (j = 0; (j < natoms); j++)
 +        {
 +            if (cbuf[j] == NOGID)
 +            {
 +                cbuf[j] = grps->nr;
 +                bRest   = TRUE;
 +            }
 +        }
 +        if (grptp != egrptpPART)
 +        {
 +            if (bVerbose)
 +            {
 +                fprintf(stderr,
 +                        "Making dummy/rest group for %s containing %d elements\n",
 +                        title, natoms-ntot);
 +            }
 +            /* Add group name "rest" */
 +            grps->nm_ind[grps->nr] = restnm;
 +
 +            /* Assign the rest name to all atoms not currently assigned to a group */
 +            for (j = 0; (j < natoms); j++)
 +            {
 +                if (cbuf[j] == NOGID)
 +                {
 +                    cbuf[j] = grps->nr;
 +                }
 +            }
 +            grps->nr++;
 +        }
 +    }
 +
 +    if (grps->nr == 1 && (ntot == 0 || ntot == natoms))
 +    {
 +        /* All atoms are part of one (or no) group, no index required */
 +        groups->ngrpnr[gtype] = 0;
 +        groups->grpnr[gtype]  = NULL;
 +    }
 +    else
 +    {
 +        groups->ngrpnr[gtype] = natoms;
 +        snew(groups->grpnr[gtype], natoms);
 +        for (j = 0; (j < natoms); j++)
 +        {
 +            groups->grpnr[gtype][j] = cbuf[j];
 +        }
 +    }
 +
 +    sfree(cbuf);
 +
 +    return (bRest && grptp == egrptpPART);
 +}
 +
 +/* Compute the number of degrees of freedom of every temperature coupling
 + * group and store the result in ir->opts.nrdf.
 + *
 + * mtop    topology; its groups, atoms, molecule blocks and constraint /
 + *         SETTLE interaction lists are read
 + * ir      input record; opts.nFreeze, ePull, pull, nstcomm and comm_mode
 + *         are read, opts.nrdf is written
 + * gnames  group names, indexed through grps[egcTC].nm_ind, used only in
 + *         messages
 + *
 + * The steps are: count the non-frozen dimensions of all atoms and nuclei,
 + * subtract constraints and SETTLEs, subtract center of mass pull
 + * constraints, correct for center of mass motion removal, and finally
 + * clamp negative values to zero and print one line per group to stderr.
 + */
 +static void calc_nrdf(gmx_mtop_t *mtop, t_inputrec *ir, char **gnames)
 +{
 +    t_grpopts              *opts;
 +    gmx_groups_t           *groups;
 +    t_pull                 *pull;
 +    int                     natoms, ai, aj, i, j, d, g, imin, jmin, nc;
 +    t_iatom                *ia;
 +    int                    *nrdf2, *na_vcm, na_tot;
 +    double                 *nrdf_tc, *nrdf_vcm, nrdf_uc, n_sub = 0;
 +    gmx_mtop_atomloop_all_t aloop;
 +    t_atom                 *atom;
 +    int                     mb, mol, ftype, as;
 +    gmx_molblock_t         *molb;
 +    gmx_moltype_t          *molt;
 +
 +    /* Calculate nrdf.
 +     * First calc 3xnr-atoms for each group
 +     * then subtract half a degree of freedom for each constraint
 +     *
 +     * Only atoms and nuclei contribute to the degrees of freedom...
 +     */
 +
 +    opts = &ir->opts;
 +
 +    groups = &mtop->groups;
 +    natoms = mtop->natoms;
 +
 +    /* Allocate one more for a possible rest group */
 +    /* We need to sum degrees of freedom into doubles,
 +     * since floats give too low nrdf's above 3 million atoms.
 +     */
 +    snew(nrdf_tc, groups->grps[egcTC].nr+1);
 +    snew(nrdf_vcm, groups->grps[egcVCM].nr+1);
 +    snew(na_vcm, groups->grps[egcVCM].nr+1);
 +
 +    for (i = 0; i < groups->grps[egcTC].nr; i++)
 +    {
 +        nrdf_tc[i] = 0;
 +    }
 +    for (i = 0; i < groups->grps[egcVCM].nr+1; i++)
 +    {
 +        nrdf_vcm[i] = 0;
 +    }
 +
 +    /* nrdf2[i] holds twice the number of degrees of freedom of atom i */
 +    snew(nrdf2, natoms);
 +    aloop = gmx_mtop_atomloop_all_init(mtop);
 +    while (gmx_mtop_atomloop_all_next(aloop, &i, &atom))
 +    {
 +        nrdf2[i] = 0;
 +        if (atom->ptype == eptAtom || atom->ptype == eptNucleus)
 +        {
 +            g = ggrpnr(groups, egcFREEZE, i);
 +            /* Double count nrdf for particle i */
 +            for (d = 0; d < DIM; d++)
 +            {
 +                if (opts->nFreeze[g][d] == 0)
 +                {
 +                    nrdf2[i] += 2;
 +                }
 +            }
 +            nrdf_tc [ggrpnr(groups, egcTC, i)]  += 0.5*nrdf2[i];
 +            nrdf_vcm[ggrpnr(groups, egcVCM, i)] += 0.5*nrdf2[i];
 +        }
 +    }
 +
 +    /* as is the global index of the first atom of the current molecule;
 +     * the interaction lists hold molecule-local atom indices.
 +     */
 +    as = 0;
 +    for (mb = 0; mb < mtop->nmolblock; mb++)
 +    {
 +        molb = &mtop->molblock[mb];
 +        molt = &mtop->moltype[molb->type];
 +        atom = molt->atoms.atom;
 +        for (mol = 0; mol < molb->nmol; mol++)
 +        {
 +            for (ftype = F_CONSTR; ftype <= F_CONSTRNC; ftype++)
 +            {
 +                ia = molt->ilist[ftype].iatoms;
 +                for (i = 0; i < molt->ilist[ftype].nr; )
 +                {
 +                    /* Subtract degrees of freedom for the constraints,
 +                     * if the particles still have degrees of freedom left.
 +                     * If one of the particles is a vsite or a shell, then all
 +                     * constraint motion will go there, but since they do not
 +                     * contribute to the constraints the degrees of freedom do not
 +                     * change.
 +                     */
 +                    ai = as + ia[1];
 +                    aj = as + ia[2];
 +                    if (((atom[ia[1]].ptype == eptNucleus) ||
 +                         (atom[ia[1]].ptype == eptAtom)) &&
 +                        ((atom[ia[2]].ptype == eptNucleus) ||
 +                         (atom[ia[2]].ptype == eptAtom)))
 +                    {
 +                        /* imin and jmin are in units of half a degree of
 +                         * freedom, taken from ai and aj respectively. Each
 +                         * atom gives up 1 when the other atom still has
 +                         * freedom left and 2 otherwise, limited by what it
 +                         * has left itself.
 +                         */
 +                        if (nrdf2[ai] > 0)
 +                        {
 +                            jmin = 1;
 +                        }
 +                        else
 +                        {
 +                            jmin = 2;
 +                        }
 +                        if (nrdf2[aj] > 0)
 +                        {
 +                            imin = 1;
 +                        }
 +                        else
 +                        {
 +                            imin = 2;
 +                        }
 +                        imin       = min(imin, nrdf2[ai]);
 +                        jmin       = min(jmin, nrdf2[aj]);
 +                        nrdf2[ai] -= imin;
 +                        nrdf2[aj] -= jmin;
 +                        nrdf_tc [ggrpnr(groups, egcTC, ai)]  -= 0.5*imin;
 +                        nrdf_tc [ggrpnr(groups, egcTC, aj)]  -= 0.5*jmin;
 +                        nrdf_vcm[ggrpnr(groups, egcVCM, ai)] -= 0.5*imin;
 +                        nrdf_vcm[ggrpnr(groups, egcVCM, aj)] -= 0.5*jmin;
 +                    }
 +                    /* Step to the next entry of the interaction list */
 +                    ia += interaction_function[ftype].nratoms+1;
 +                    i  += interaction_function[ftype].nratoms+1;
 +                }
 +            }
 +            ia = molt->ilist[F_SETTLE].iatoms;
 +            for (i = 0; i < molt->ilist[F_SETTLE].nr; )
 +            {
 +                /* Subtract 1 dof from every atom in the SETTLE */
 +                for (j = 0; j < 3; j++)
 +                {
 +                    ai         = as + ia[1+j];
 +                    imin       = min(2, nrdf2[ai]);
 +                    nrdf2[ai] -= imin;
 +                    nrdf_tc [ggrpnr(groups, egcTC, ai)]  -= 0.5*imin;
 +                    nrdf_vcm[ggrpnr(groups, egcVCM, ai)] -= 0.5*imin;
 +                }
 +                ia += 4;
 +                i  += 4;
 +            }
 +            as += molt->atoms.nr;
 +        }
 +    }
 +
 +    if (ir->ePull == epullCONSTRAINT)
 +    {
 +        /* Correct nrdf for the COM constraints.
 +         * We correct using the TC and VCM group of the first atom
 +         * in the reference and pull group. If atoms in one pull group
 +         * belong to different TC or VCM groups it is anyhow difficult
 +         * to determine the optimal nrdf assignment.
 +         */
 +        pull = ir->pull;
 +        /* nc is the number of constrained dimensions per pull group */
 +        if (pull->eGeom == epullgPOS)
 +        {
 +            nc = 0;
 +            for (i = 0; i < DIM; i++)
 +            {
 +                if (pull->dim[i])
 +                {
 +                    nc++;
 +                }
 +            }
 +        }
 +        else
 +        {
 +            nc = 1;
 +        }
 +        for (i = 0; i < pull->ngrp; i++)
 +        {
 +            /* imin counts the half degrees of freedom still to subtract */
 +            imin = 2*nc;
 +            if (pull->grp[0].nat > 0)
 +            {
 +                /* Subtract 1/2 dof from the reference group */
 +                ai = pull->grp[0].ind[0];
 +                if (nrdf_tc[ggrpnr(groups, egcTC, ai)] > 1)
 +                {
 +                    nrdf_tc [ggrpnr(groups, egcTC, ai)]  -= 0.5;
 +                    nrdf_vcm[ggrpnr(groups, egcVCM, ai)] -= 0.5;
 +                    imin--;
 +                }
 +            }
 +            /* Subtract 1/2 dof from the pulled group */
 +            ai = pull->grp[1+i].ind[0];
 +            nrdf_tc [ggrpnr(groups, egcTC, ai)]  -= 0.5*imin;
 +            nrdf_vcm[ggrpnr(groups, egcVCM, ai)] -= 0.5*imin;
 +            if (nrdf_tc[ggrpnr(groups, egcTC, ai)] < 0)
 +            {
 +                gmx_fatal(FARGS, "Center of mass pulling constraints caused the number of degrees of freedom for temperature coupling group %s to be negative", gnames[groups->grps[egcTC].nm_ind[ggrpnr(groups, egcTC, ai)]]);
 +            }
 +        }
 +    }
 +
 +    if (ir->nstcomm != 0)
 +    {
 +        /* Subtract 3 from the number of degrees of freedom in each vcm group
 +         * when com translation is removed and 6 when rotation is removed
 +         * as well.
 +         */
 +        switch (ir->comm_mode)
 +        {
 +            case ecmLINEAR:
 +                n_sub = ndof_com(ir);
 +                break;
 +            case ecmANGULAR:
 +                n_sub = 6;
 +                break;
 +            default:
 +                n_sub = 0;
 +                gmx_incons("Checking comm_mode");
 +        }
 +
 +        for (i = 0; i < groups->grps[egcTC].nr; i++)
 +        {
 +            /* Count the number of atoms of TC group i for every VCM group */
 +            for (j = 0; j < groups->grps[egcVCM].nr+1; j++)
 +            {
 +                na_vcm[j] = 0;
 +            }
 +            na_tot = 0;
 +            for (ai = 0; ai < natoms; ai++)
 +            {
 +                if (ggrpnr(groups, egcTC, ai) == i)
 +                {
 +                    na_vcm[ggrpnr(groups, egcVCM, ai)]++;
 +                    na_tot++;
 +                }
 +            }
 +            /* Correct for VCM removal according to the fraction of each VCM
 +             * group present in this TC group.
 +             */
 +            nrdf_uc = nrdf_tc[i];
 +            if (debug)
 +            {
 +                fprintf(debug, "T-group[%d] nrdf_uc = %g, n_sub = %g\n",
 +                        i, nrdf_uc, n_sub);
 +            }
 +            nrdf_tc[i] = 0;
 +            for (j = 0; j < groups->grps[egcVCM].nr+1; j++)
 +            {
 +                /* VCM groups with at most n_sub degrees of freedom add nothing */
 +                if (nrdf_vcm[j] > n_sub)
 +                {
 +                    nrdf_tc[i] += nrdf_uc*((double)na_vcm[j]/(double)na_tot)*
 +                        (nrdf_vcm[j] - n_sub)/nrdf_vcm[j];
 +                }
 +                if (debug)
 +                {
 +                    fprintf(debug, "  nrdf_vcm[%d] = %g, nrdf = %g\n",
 +                            j, nrdf_vcm[j], nrdf_tc[i]);
 +                }
 +            }
 +        }
 +    }
 +    /* Store the results, clamping negative values to zero, and report them */
 +    for (i = 0; (i < groups->grps[egcTC].nr); i++)
 +    {
 +        opts->nrdf[i] = nrdf_tc[i];
 +        if (opts->nrdf[i] < 0)
 +        {
 +            opts->nrdf[i] = 0;
 +        }
 +        fprintf(stderr,
 +                "Number of degrees of freedom in T-Coupling group %s is %.2f\n",
 +                gnames[groups->grps[egcTC].nm_ind[i]], opts->nrdf[i]);
 +    }
 +
 +    sfree(nrdf2);
 +    sfree(nrdf_tc);
 +    sfree(nrdf_vcm);
 +    sfree(na_vcm);
 +}
 +
 +static void decode_cos(char *s, t_cosines *cosine, gmx_bool bTime)
 +{
 +    char   *t;
 +    char    format[STRLEN], f1[STRLEN];
 +    double  a, phi;
 +    int     i;
 +
 +    t = strdup(s);
 +    trim(t);
 +
 +    cosine->n   = 0;
 +    cosine->a   = NULL;
 +    cosine->phi = NULL;
 +    if (strlen(t))
 +    {
 +        sscanf(t, "%d", &(cosine->n));
 +        if (cosine->n <= 0)
 +        {
 +            cosine->n = 0;
 +        }
 +        else
 +        {
 +            snew(cosine->a, cosine->n);
 +            snew(cosine->phi, cosine->n);
 +
 +            sprintf(format, "%%*d");
 +            for (i = 0; (i < cosine->n); i++)
 +            {
 +                strcpy(f1, format);
 +                strcat(f1, "%lf%lf");
 +                if (sscanf(t, f1, &a, &phi) < 2)
 +                {
 +                    gmx_fatal(FARGS, "Invalid input for electric field shift: '%s'", t);
 +                }
 +                cosine->a[i]   = a;
 +                cosine->phi[i] = phi;
 +                strcat(format, "%*lf%*lf");
 +            }
 +        }
 +    }
 +    sfree(t);
 +}
 +
 +static gmx_bool do_egp_flag(t_inputrec *ir, gmx_groups_t *groups,
 +                            const char *option, const char *val, int flag)
 +{
 +    /* The maximum number of energy group pairs would be MAXPTR*(MAXPTR+1)/2.
 +     * But since this is much larger than STRLEN, such a line can not be parsed.
 +     * The real maximum is the number of names that fit in a string: STRLEN/2.
 +     */
 +#define EGP_MAX (STRLEN/2)
 +    int      nelem, i, j, k, nr;
 +    char    *names[EGP_MAX];
 +    char  ***gnames;
 +    gmx_bool bSet;
 +
 +    gnames = groups->grpname;
 +
 +    nelem = str_nelem(val, EGP_MAX, names);
 +    if (nelem % 2 != 0)
 +    {
 +        gmx_fatal(FARGS, "The number of groups for %s is odd", option);
 +    }
 +    nr   = groups->grps[egcENER].nr;
 +    bSet = FALSE;
 +    for (i = 0; i < nelem/2; i++)
 +    {
 +        j = 0;
 +        while ((j < nr) &&
 +               gmx_strcasecmp(names[2*i], *(gnames[groups->grps[egcENER].nm_ind[j]])))
 +        {
 +            j++;
 +        }
 +        if (j == nr)
 +        {
 +            gmx_fatal(FARGS, "%s in %s is not an energy group\n",
 +                      names[2*i], option);
 +        }
 +        k = 0;
 +        while ((k < nr) &&
 +               gmx_strcasecmp(names[2*i+1], *(gnames[groups->grps[egcENER].nm_ind[k]])))
 +        {
 +            k++;
 +        }
 +        if (k == nr)
 +        {
 +            gmx_fatal(FARGS, "%s in %s is not an energy group\n",
 +                      names[2*i+1], option);
 +        }
 +        if ((j < nr) && (k < nr))
 +        {
 +            ir->opts.egp_flags[nr*j+k] |= flag;
 +            ir->opts.egp_flags[nr*k+j] |= flag;
 +            bSet = TRUE;
 +        }
 +    }
 +
 +    return bSet;
 +}
 +
 +void do_index(const char* mdparin, const char *ndx,
 +              gmx_mtop_t *mtop,
 +              gmx_bool bVerbose,
 +              t_inputrec *ir, rvec *v,
 +              warninp_t wi)
 +{
 +    t_blocka     *grps;
 +    gmx_groups_t *groups;
 +    int           natoms;
 +    t_symtab     *symtab;
 +    t_atoms       atoms_all;
 +    char          warnbuf[STRLEN], **gnames;
 +    int           nr, ntcg, ntau_t, nref_t, nacc, nofg, nSA, nSA_points, nSA_time, nSA_temp;
 +    real          tau_min;
 +    int           nstcmin;
 +    int           nacg, nfreeze, nfrdim, nenergy, nvcm, nuser;
 +    char         *ptr1[MAXPTR], *ptr2[MAXPTR], *ptr3[MAXPTR];
 +    int           i, j, k, restnm;
 +    real          SAtime;
 +    gmx_bool      bExcl, bTable, bSetTCpar, bAnneal, bRest;
 +    int           nQMmethod, nQMbasis, nQMcharge, nQMmult, nbSH, nCASorb, nCASelec,
 +                  nSAon, nSAoff, nSAsteps, nQMg, nbOPT, nbTS;
 +    char          warn_buf[STRLEN];
 +
 +    if (bVerbose)
 +    {
 +        fprintf(stderr, "processing index file...\n");
 +    }
 +    debug_gmx();
 +    if (ndx == NULL)
 +    {
 +        snew(grps, 1);
 +        snew(grps->index, 1);
 +        snew(gnames, 1);
 +        atoms_all = gmx_mtop_global_atoms(mtop);
 +        analyse(&atoms_all, grps, &gnames, FALSE, TRUE);
 +        free_t_atoms(&atoms_all, FALSE);
 +    }
 +    else
 +    {
 +        grps = init_index(ndx, &gnames);
 +    }
 +
 +    groups = &mtop->groups;
 +    natoms = mtop->natoms;
 +    symtab = &mtop->symtab;
 +
 +    snew(groups->grpname, grps->nr+1);
 +
 +    for (i = 0; (i < grps->nr); i++)
 +    {
 +        groups->grpname[i] = put_symtab(symtab, gnames[i]);
 +    }
 +    groups->grpname[i] = put_symtab(symtab, "rest");
 +    restnm             = i;
 +    srenew(gnames, grps->nr+1);
 +    gnames[restnm]   = *(groups->grpname[i]);
 +    groups->ngrpname = grps->nr+1;
 +
 +    set_warning_line(wi, mdparin, -1);
 +
 +    ntau_t = str_nelem(tau_t, MAXPTR, ptr1);
 +    nref_t = str_nelem(ref_t, MAXPTR, ptr2);
 +    ntcg   = str_nelem(tcgrps, MAXPTR, ptr3);
 +    if ((ntau_t != ntcg) || (nref_t != ntcg))
 +    {
 +        gmx_fatal(FARGS, "Invalid T coupling input: %d groups, %d ref-t values and "
 +                  "%d tau-t values", ntcg, nref_t, ntau_t);
 +    }
 +
 +    bSetTCpar = (ir->etc || EI_SD(ir->eI) || ir->eI == eiBD || EI_TPI(ir->eI));
 +    do_numbering(natoms, groups, ntcg, ptr3, grps, gnames, egcTC,
 +                 restnm, bSetTCpar ? egrptpALL : egrptpALL_GENREST, bVerbose, wi);
 +    nr            = groups->grps[egcTC].nr;
 +    ir->opts.ngtc = nr;
 +    snew(ir->opts.nrdf, nr);
 +    snew(ir->opts.tau_t, nr);
 +    snew(ir->opts.ref_t, nr);
 +    if (ir->eI == eiBD && ir->bd_fric == 0)
 +    {
 +        fprintf(stderr, "bd-fric=0, so tau-t will be used as the inverse friction constant(s)\n");
 +    }
 +
 +    if (bSetTCpar)
 +    {
 +        if (nr != nref_t)
 +        {
 +            gmx_fatal(FARGS, "Not enough ref-t and tau-t values!");
 +        }
 +
 +        tau_min = 1e20;
 +        for (i = 0; (i < nr); i++)
 +        {
 +            ir->opts.tau_t[i] = strtod(ptr1[i], NULL);
 +            if ((ir->eI == eiBD || ir->eI == eiSD2) && ir->opts.tau_t[i] <= 0)
 +            {
 +                sprintf(warn_buf, "With integrator %s tau-t should be larger than 0", ei_names[ir->eI]);
 +                warning_error(wi, warn_buf);
 +            }
 +
 +            if (ir->etc != etcVRESCALE && ir->opts.tau_t[i] == 0)
 +            {
 +                warning_note(wi, "tau-t = -1 is the value to signal that a group should not have temperature coupling. Treating your use of tau-t = 0 as if you used -1.");
 +            }
 +
 +            if (ir->opts.tau_t[i] >= 0)
 +            {
 +                tau_min = min(tau_min, ir->opts.tau_t[i]);
 +            }
 +        }
 +        if (ir->etc != etcNO && ir->nsttcouple == -1)
 +        {
 +            ir->nsttcouple = ir_optimal_nsttcouple(ir);
 +        }
 +
 +        if (EI_VV(ir->eI))
 +        {
 +            if ((ir->etc == etcNOSEHOOVER) && (ir->epc == epcBERENDSEN))
 +            {
 +                gmx_fatal(FARGS, "Cannot do Nose-Hoover temperature with Berendsen pressure control with md-vv; use either vrescale temperature with berendsen pressure or Nose-Hoover temperature with MTTK pressure");
 +            }
 +            if ((ir->epc == epcMTTK) && (ir->etc > etcNO))
 +            {
 +                if (ir->nstpcouple != ir->nsttcouple)
 +                {
 +                    int mincouple = min(ir->nstpcouple, ir->nsttcouple);
 +                    ir->nstpcouple = ir->nsttcouple = mincouple;
 +                    sprintf(warn_buf, "for current Trotter decomposition methods with vv, nsttcouple and nstpcouple must be equal.  Both have been reset to min(nsttcouple,nstpcouple) = %d", mincouple);
 +                    warning_note(wi, warn_buf);
 +                }
 +            }
 +        }
 +        /* velocity verlet with averaged kinetic energy KE = 0.5*(v(t+1/2) - v(t-1/2)) is implemented
 +           primarily for testing purposes, and does not work with temperature coupling other than 1 */
 +
 +        if (ETC_ANDERSEN(ir->etc))
 +        {
 +            if (ir->nsttcouple != 1)
 +            {
 +                ir->nsttcouple = 1;
 +                sprintf(warn_buf, "Andersen temperature control methods assume nsttcouple = 1; there is no need for larger nsttcouple > 1, since no global parameters are computed. nsttcouple has been reset to 1");
 +                warning_note(wi, warn_buf);
 +            }
 +        }
 +        nstcmin = tcouple_min_integration_steps(ir->etc);
 +        if (nstcmin > 1)
 +        {
 +            if (tau_min/(ir->delta_t*ir->nsttcouple) < nstcmin)
 +            {
 +                sprintf(warn_buf, "For proper integration of the %s thermostat, tau-t (%g) should be at least %d times larger than nsttcouple*dt (%g)",
 +                        ETCOUPLTYPE(ir->etc),
 +                        tau_min, nstcmin,
 +                        ir->nsttcouple*ir->delta_t);
 +                warning(wi, warn_buf);
 +            }
 +        }
 +        for (i = 0; (i < nr); i++)
 +        {
 +            ir->opts.ref_t[i] = strtod(ptr2[i], NULL);
 +            if (ir->opts.ref_t[i] < 0)
 +            {
 +                gmx_fatal(FARGS, "ref-t for group %d negative", i);
 +            }
 +        }
 +        /* set the lambda mc temperature to the md integrator temperature (which should be defined
 +           if we are in this conditional) if mc_temp is negative */
 +        if (ir->expandedvals->mc_temp < 0)
 +        {
 +            ir->expandedvals->mc_temp = ir->opts.ref_t[0]; /*for now, set to the first reft */
 +        }
 +    }
 +
 +    /* Simulated annealing for each group. There are nr groups */
 +    nSA = str_nelem(anneal, MAXPTR, ptr1);
 +    if (nSA == 1 && (ptr1[0][0] == 'n' || ptr1[0][0] == 'N'))
 +    {
 +        nSA = 0;
 +    }
 +    if (nSA > 0 && nSA != nr)
 +    {
 +        gmx_fatal(FARGS, "Not enough annealing values: %d (for %d groups)\n", nSA, nr);
 +    }
 +    else
 +    {
 +        snew(ir->opts.annealing, nr);
 +        snew(ir->opts.anneal_npoints, nr);
 +        snew(ir->opts.anneal_time, nr);
 +        snew(ir->opts.anneal_temp, nr);
 +        for (i = 0; i < nr; i++)
 +        {
 +            ir->opts.annealing[i]      = eannNO;
 +            ir->opts.anneal_npoints[i] = 0;
 +            ir->opts.anneal_time[i]    = NULL;
 +            ir->opts.anneal_temp[i]    = NULL;
 +        }
 +        if (nSA > 0)
 +        {
 +            bAnneal = FALSE;
 +            for (i = 0; i < nr; i++)
 +            {
 +                if (ptr1[i][0] == 'n' || ptr1[i][0] == 'N')
 +                {
 +                    ir->opts.annealing[i] = eannNO;
 +                }
 +                else if (ptr1[i][0] == 's' || ptr1[i][0] == 'S')
 +                {
 +                    ir->opts.annealing[i] = eannSINGLE;
 +                    bAnneal               = TRUE;
 +                }
 +                else if (ptr1[i][0] == 'p' || ptr1[i][0] == 'P')
 +                {
 +                    ir->opts.annealing[i] = eannPERIODIC;
 +                    bAnneal               = TRUE;
 +                }
 +            }
 +            if (bAnneal)
 +            {
 +                /* Read the other fields too */
 +                nSA_points = str_nelem(anneal_npoints, MAXPTR, ptr1);
 +                if (nSA_points != nSA)
 +                {
 +                    gmx_fatal(FARGS, "Found %d annealing-npoints values for %d groups\n", nSA_points, nSA);
 +                }
 +                for (k = 0, i = 0; i < nr; i++)
 +                {
 +                    ir->opts.anneal_npoints[i] = strtol(ptr1[i], NULL, 10);
 +                    if (ir->opts.anneal_npoints[i] == 1)
 +                    {
 +                        gmx_fatal(FARGS, "Please specify at least a start and an end point for annealing\n");
 +                    }
 +                    snew(ir->opts.anneal_time[i], ir->opts.anneal_npoints[i]);
 +                    snew(ir->opts.anneal_temp[i], ir->opts.anneal_npoints[i]);
 +                    k += ir->opts.anneal_npoints[i];
 +                }
 +
 +                nSA_time = str_nelem(anneal_time, MAXPTR, ptr1);
 +                if (nSA_time != k)
 +                {
 +                    gmx_fatal(FARGS, "Found %d annealing-time values, wanter %d\n", nSA_time, k);
 +                }
 +                nSA_temp = str_nelem(anneal_temp, MAXPTR, ptr2);
 +                if (nSA_temp != k)
 +                {
 +                    gmx_fatal(FARGS, "Found %d annealing-temp values, wanted %d\n", nSA_temp, k);
 +                }
 +
 +                for (i = 0, k = 0; i < nr; i++)
 +                {
 +
 +                    for (j = 0; j < ir->opts.anneal_npoints[i]; j++)
 +                    {
 +                        ir->opts.anneal_time[i][j] = strtod(ptr1[k], NULL);
 +                        ir->opts.anneal_temp[i][j] = strtod(ptr2[k], NULL);
 +                        if (j == 0)
 +                        {
 +                            if (ir->opts.anneal_time[i][0] > (ir->init_t+GMX_REAL_EPS))
 +                            {
 +                                gmx_fatal(FARGS, "First time point for annealing > init_t.\n");
 +                            }
 +                        }
 +                        else
 +                        {
 +                            /* j>0 */
 +                            if (ir->opts.anneal_time[i][j] < ir->opts.anneal_time[i][j-1])
 +                            {
 +                                gmx_fatal(FARGS, "Annealing timepoints out of order: t=%f comes after t=%f\n",
 +                                          ir->opts.anneal_time[i][j], ir->opts.anneal_time[i][j-1]);
 +                            }
 +                        }
 +                        if (ir->opts.anneal_temp[i][j] < 0)
 +                        {
 +                            gmx_fatal(FARGS, "Found negative temperature in annealing: %f\n", ir->opts.anneal_temp[i][j]);
 +                        }
 +                        k++;
 +                    }
 +                }
 +                /* Print out some summary information, to make sure we got it right */
 +                for (i = 0, k = 0; i < nr; i++)
 +                {
 +                    if (ir->opts.annealing[i] != eannNO)
 +                    {
 +                        j = groups->grps[egcTC].nm_ind[i];
 +                        fprintf(stderr, "Simulated annealing for group %s: %s, %d timepoints\n",
 +                                *(groups->grpname[j]), eann_names[ir->opts.annealing[i]],
 +                                ir->opts.anneal_npoints[i]);
 +                        fprintf(stderr, "Time (ps)   Temperature (K)\n");
 +                        /* All terms except the last one */
 +                        for (j = 0; j < (ir->opts.anneal_npoints[i]-1); j++)
 +                        {
 +                            fprintf(stderr, "%9.1f      %5.1f\n", ir->opts.anneal_time[i][j], ir->opts.anneal_temp[i][j]);
 +                        }
 +
 +                        /* Finally the last one */
 +                        j = ir->opts.anneal_npoints[i]-1;
 +                        if (ir->opts.annealing[i] == eannSINGLE)
 +                        {
 +                            fprintf(stderr, "%9.1f-     %5.1f\n", ir->opts.anneal_time[i][j], ir->opts.anneal_temp[i][j]);
 +                        }
 +                        else
 +                        {
 +                            fprintf(stderr, "%9.1f      %5.1f\n", ir->opts.anneal_time[i][j], ir->opts.anneal_temp[i][j]);
 +                            if (fabs(ir->opts.anneal_temp[i][j]-ir->opts.anneal_temp[i][0]) > GMX_REAL_EPS)
 +                            {
 +                                warning_note(wi, "There is a temperature jump when your annealing loops back.\n");
 +                            }
 +                        }
 +                    }
 +                }
 +            }
 +        }
 +    }
 +
 +    if (ir->ePull != epullNO)
 +    {
 +        make_pull_groups(ir->pull, pull_grp, grps, gnames);
 +    }
 +
 +    if (ir->bRot)
 +    {
 +        make_rotation_groups(ir->rot, rot_grp, grps, gnames);
 +    }
 +
 +    nacc = str_nelem(acc, MAXPTR, ptr1);
 +    nacg = str_nelem(accgrps, MAXPTR, ptr2);
 +    if (nacg*DIM != nacc)
 +    {
 +        gmx_fatal(FARGS, "Invalid Acceleration input: %d groups and %d acc. values",
 +                  nacg, nacc);
 +    }
 +    do_numbering(natoms, groups, nacg, ptr2, grps, gnames, egcACC,
 +                 restnm, egrptpALL_GENREST, bVerbose, wi);
 +    nr = groups->grps[egcACC].nr;
 +    snew(ir->opts.acc, nr);
 +    ir->opts.ngacc = nr;
 +
 +    for (i = k = 0; (i < nacg); i++)
 +    {
 +        for (j = 0; (j < DIM); j++, k++)
 +        {
 +            ir->opts.acc[i][j] = strtod(ptr1[k], NULL);
 +        }
 +    }
 +    for (; (i < nr); i++)
 +    {
 +        for (j = 0; (j < DIM); j++)
 +        {
 +            ir->opts.acc[i][j] = 0;
 +        }
 +    }
 +
 +    nfrdim  = str_nelem(frdim, MAXPTR, ptr1);
 +    nfreeze = str_nelem(freeze, MAXPTR, ptr2);
 +    if (nfrdim != DIM*nfreeze)
 +    {
 +        gmx_fatal(FARGS, "Invalid Freezing input: %d groups and %d freeze values",
 +                  nfreeze, nfrdim);
 +    }
 +    do_numbering(natoms, groups, nfreeze, ptr2, grps, gnames, egcFREEZE,
 +                 restnm, egrptpALL_GENREST, bVerbose, wi);
 +    nr             = groups->grps[egcFREEZE].nr;
 +    ir->opts.ngfrz = nr;
 +    snew(ir->opts.nFreeze, nr);
 +    for (i = k = 0; (i < nfreeze); i++)
 +    {
 +        for (j = 0; (j < DIM); j++, k++)
 +        {
 +            ir->opts.nFreeze[i][j] = (gmx_strncasecmp(ptr1[k], "Y", 1) == 0);
 +            if (!ir->opts.nFreeze[i][j])
 +            {
 +                if (gmx_strncasecmp(ptr1[k], "N", 1) != 0)
 +                {
 +                    sprintf(warnbuf, "Please use Y(ES) or N(O) for freezedim only "
 +                            "(not %s)", ptr1[k]);
 +                    warning(wi, warn_buf);
 +                }
 +            }
 +        }
 +    }
 +    for (; (i < nr); i++)
 +    {
 +        for (j = 0; (j < DIM); j++)
 +        {
 +            ir->opts.nFreeze[i][j] = 0;
 +        }
 +    }
 +
 +    nenergy = str_nelem(energy, MAXPTR, ptr1);
 +    do_numbering(natoms, groups, nenergy, ptr1, grps, gnames, egcENER,
 +                 restnm, egrptpALL_GENREST, bVerbose, wi);
 +    add_wall_energrps(groups, ir->nwall, symtab);
 +    ir->opts.ngener = groups->grps[egcENER].nr;
 +    nvcm            = str_nelem(vcm, MAXPTR, ptr1);
 +    bRest           =
 +        do_numbering(natoms, groups, nvcm, ptr1, grps, gnames, egcVCM,
 +                     restnm, nvcm == 0 ? egrptpALL_GENREST : egrptpPART, bVerbose, wi);
 +    if (bRest)
 +    {
 +        warning(wi, "Some atoms are not part of any center of mass motion removal group.\n"
 +                "This may lead to artifacts.\n"
 +                "In most cases one should use one group for the whole system.");
 +    }
 +
 +    /* Now we have filled the freeze struct, so we can calculate NRDF */
 +    calc_nrdf(mtop, ir, gnames);
 +
 +    if (v && NULL)
 +    {
 +        real fac, ntot = 0;
 +
 +        /* Must check per group! */
 +        for (i = 0; (i < ir->opts.ngtc); i++)
 +        {
 +            ntot += ir->opts.nrdf[i];
 +        }
 +        if (ntot != (DIM*natoms))
 +        {
 +            fac = sqrt(ntot/(DIM*natoms));
 +            if (bVerbose)
 +            {
 +                fprintf(stderr, "Scaling velocities by a factor of %.3f to account for constraints\n"
 +                        "and removal of center of mass motion\n", fac);
 +            }
 +            for (i = 0; (i < natoms); i++)
 +            {
 +                svmul(fac, v[i], v[i]);
 +            }
 +        }
 +    }
 +
 +    nuser = str_nelem(user1, MAXPTR, ptr1);
 +    do_numbering(natoms, groups, nuser, ptr1, grps, gnames, egcUser1,
 +                 restnm, egrptpALL_GENREST, bVerbose, wi);
 +    nuser = str_nelem(user2, MAXPTR, ptr1);
 +    do_numbering(natoms, groups, nuser, ptr1, grps, gnames, egcUser2,
 +                 restnm, egrptpALL_GENREST, bVerbose, wi);
 +    nuser = str_nelem(xtc_grps, MAXPTR, ptr1);
 +    do_numbering(natoms, groups, nuser, ptr1, grps, gnames, egcXTC,
 +                 restnm, egrptpONE, bVerbose, wi);
 +    nofg = str_nelem(orirefitgrp, MAXPTR, ptr1);
 +    do_numbering(natoms, groups, nofg, ptr1, grps, gnames, egcORFIT,
 +                 restnm, egrptpALL_GENREST, bVerbose, wi);
 +
 +    /* QMMM input processing */
 +    nQMg          = str_nelem(QMMM, MAXPTR, ptr1);
 +    nQMmethod     = str_nelem(QMmethod, MAXPTR, ptr2);
 +    nQMbasis      = str_nelem(QMbasis, MAXPTR, ptr3);
 +    if ((nQMmethod != nQMg) || (nQMbasis != nQMg))
 +    {
 +        gmx_fatal(FARGS, "Invalid QMMM input: %d groups %d basissets"
 +                  " and %d methods\n", nQMg, nQMbasis, nQMmethod);
 +    }
 +    /* group rest, if any, is always MM! */
 +    do_numbering(natoms, groups, nQMg, ptr1, grps, gnames, egcQMMM,
 +                 restnm, egrptpALL_GENREST, bVerbose, wi);
 +    nr            = nQMg; /*atoms->grps[egcQMMM].nr;*/
 +    ir->opts.ngQM = nQMg;
 +    snew(ir->opts.QMmethod, nr);
 +    snew(ir->opts.QMbasis, nr);
 +    for (i = 0; i < nr; i++)
 +    {
 +        /* input consists of strings: RHF CASSCF PM3 .. These need to be
 +         * converted to the corresponding enum in names.c
 +         */
 +        ir->opts.QMmethod[i] = search_QMstring(ptr2[i], eQMmethodNR,
 +                                               eQMmethod_names);
 +        ir->opts.QMbasis[i]  = search_QMstring(ptr3[i], eQMbasisNR,
 +                                               eQMbasis_names);
 +
 +    }
 +    nQMmult   = str_nelem(QMmult, MAXPTR, ptr1);
 +    nQMcharge = str_nelem(QMcharge, MAXPTR, ptr2);
 +    nbSH      = str_nelem(bSH, MAXPTR, ptr3);
 +    snew(ir->opts.QMmult, nr);
 +    snew(ir->opts.QMcharge, nr);
 +    snew(ir->opts.bSH, nr);
 +
 +    for (i = 0; i < nr; i++)
 +    {
 +        ir->opts.QMmult[i]   = strtol(ptr1[i], NULL, 10);
 +        ir->opts.QMcharge[i] = strtol(ptr2[i], NULL, 10);
 +        ir->opts.bSH[i]      = (gmx_strncasecmp(ptr3[i], "Y", 1) == 0);
 +    }
 +
 +    nCASelec  = str_nelem(CASelectrons, MAXPTR, ptr1);
 +    nCASorb   = str_nelem(CASorbitals, MAXPTR, ptr2);
 +    snew(ir->opts.CASelectrons, nr);
 +    snew(ir->opts.CASorbitals, nr);
 +    for (i = 0; i < nr; i++)
 +    {
 +        ir->opts.CASelectrons[i] = strtol(ptr1[i], NULL, 10);
 +        ir->opts.CASorbitals[i]  = strtol(ptr2[i], NULL, 10);
 +    }
 +    /* special optimization options */
 +
 +    nbOPT = str_nelem(bOPT, MAXPTR, ptr1);
 +    nbTS  = str_nelem(bTS, MAXPTR, ptr2);
 +    snew(ir->opts.bOPT, nr);
 +    snew(ir->opts.bTS, nr);
 +    for (i = 0; i < nr; i++)
 +    {
 +        ir->opts.bOPT[i] = (gmx_strncasecmp(ptr1[i], "Y", 1) == 0);
 +        ir->opts.bTS[i]  = (gmx_strncasecmp(ptr2[i], "Y", 1) == 0);
 +    }
 +    nSAon     = str_nelem(SAon, MAXPTR, ptr1);
 +    nSAoff    = str_nelem(SAoff, MAXPTR, ptr2);
 +    nSAsteps  = str_nelem(SAsteps, MAXPTR, ptr3);
 +    snew(ir->opts.SAon, nr);
 +    snew(ir->opts.SAoff, nr);
 +    snew(ir->opts.SAsteps, nr);
 +
 +    for (i = 0; i < nr; i++)
 +    {
 +        ir->opts.SAon[i]    = strtod(ptr1[i], NULL);
 +        ir->opts.SAoff[i]   = strtod(ptr2[i], NULL);
 +        ir->opts.SAsteps[i] = strtol(ptr3[i], NULL, 10);
 +    }
 +    /* end of QMMM input */
 +
 +    if (bVerbose)
 +    {
 +        for (i = 0; (i < egcNR); i++)
 +        {
 +            fprintf(stderr, "%-16s has %d element(s):", gtypes[i], groups->grps[i].nr);
 +            for (j = 0; (j < groups->grps[i].nr); j++)
 +            {
 +                fprintf(stderr, " %s", *(groups->grpname[groups->grps[i].nm_ind[j]]));
 +            }
 +            fprintf(stderr, "\n");
 +        }
 +    }
 +
 +    nr = groups->grps[egcENER].nr;
 +    snew(ir->opts.egp_flags, nr*nr);
 +
 +    bExcl = do_egp_flag(ir, groups, "energygrp-excl", egpexcl, EGP_EXCL);
 +    if (bExcl && ir->cutoff_scheme == ecutsVERLET)
 +    {
 +        warning_error(wi, "Energy group exclusions are not (yet) implemented for the Verlet scheme");
 +    }
 +    if (bExcl && EEL_FULL(ir->coulombtype))
 +    {
 +        warning(wi, "Can not exclude the lattice Coulomb energy between energy groups");
 +    }
 +
 +    bTable = do_egp_flag(ir, groups, "energygrp-table", egptable, EGP_TABLE);
 +    if (bTable && !(ir->vdwtype == evdwUSER) &&
 +        !(ir->coulombtype == eelUSER) && !(ir->coulombtype == eelPMEUSER) &&
 +        !(ir->coulombtype == eelPMEUSERSWITCH))
 +    {
 +        gmx_fatal(FARGS, "Can only have energy group pair tables in combination with user tables for VdW and/or Coulomb");
 +    }
 +
 +    decode_cos(efield_x, &(ir->ex[XX]), FALSE);
 +    decode_cos(efield_xt, &(ir->et[XX]), TRUE);
 +    decode_cos(efield_y, &(ir->ex[YY]), FALSE);
 +    decode_cos(efield_yt, &(ir->et[YY]), TRUE);
 +    decode_cos(efield_z, &(ir->ex[ZZ]), FALSE);
 +    decode_cos(efield_zt, &(ir->et[ZZ]), TRUE);
 +
 +    if (ir->bAdress)
 +    {
 +        do_adress_index(ir->adress, groups, gnames, &(ir->opts), wi);
 +    }
 +
 +    for (i = 0; (i < grps->nr); i++)
 +    {
 +        sfree(gnames[i]);
 +    }
 +    sfree(gnames);
 +    done_blocka(grps);
 +    sfree(grps);
 +
 +}
 +
 +
 +
 +static void check_disre(gmx_mtop_t *mtop)
 +{
 +    gmx_ffparams_t *ffparams;
 +    t_functype     *functype;
 +    t_iparams      *ip;
 +    int             i, ndouble, ftype;
 +    int             label, old_label;
 +
 +    if (gmx_mtop_ftype_count(mtop, F_DISRES) > 0)
 +    {
 +        ffparams  = &mtop->ffparams;
 +        functype  = ffparams->functype;
 +        ip        = ffparams->iparams;
 +        ndouble   = 0;
 +        old_label = -1;
 +        for (i = 0; i < ffparams->ntypes; i++)
 +        {
 +            ftype = functype[i];
 +            if (ftype == F_DISRES)
 +            {
 +                label = ip[i].disres.label;
 +                if (label == old_label)
 +                {
 +                    fprintf(stderr, "Distance restraint index %d occurs twice\n", label);
 +                    ndouble++;
 +                }
 +                old_label = label;
 +            }
 +        }
 +        if (ndouble > 0)
 +        {
 +            gmx_fatal(FARGS, "Found %d double distance restraint indices,\n"
 +                      "probably the parameters for multiple pairs in one restraint "
 +                      "are not identical\n", ndouble);
 +        }
 +    }
 +}
 +
 +static gmx_bool absolute_reference(t_inputrec *ir, gmx_mtop_t *sys,
 +                                   gmx_bool posres_only,
 +                                   ivec AbsRef)
 +{
 +    int                  d, g, i;
 +    gmx_mtop_ilistloop_t iloop;
 +    t_ilist             *ilist;
 +    int                  nmol;
 +    t_iparams           *pr;
 +
 +    clear_ivec(AbsRef);
 +
 +    if (!posres_only)
 +    {
 +        /* Check the COM */
 +        for (d = 0; d < DIM; d++)
 +        {
 +            AbsRef[d] = (d < ndof_com(ir) ? 0 : 1);
 +        }
 +        /* Check for freeze groups */
 +        for (g = 0; g < ir->opts.ngfrz; g++)
 +        {
 +            for (d = 0; d < DIM; d++)
 +            {
 +                if (ir->opts.nFreeze[g][d] != 0)
 +                {
 +                    AbsRef[d] = 1;
 +                }
 +            }
 +        }
 +    }
 +
 +    /* Check for position restraints */
 +    iloop = gmx_mtop_ilistloop_init(sys);
 +    while (gmx_mtop_ilistloop_next(iloop, &ilist, &nmol))
 +    {
 +        if (nmol > 0 &&
 +            (AbsRef[XX] == 0 || AbsRef[YY] == 0 || AbsRef[ZZ] == 0))
 +        {
 +            for (i = 0; i < ilist[F_POSRES].nr; i += 2)
 +            {
 +                pr = &sys->ffparams.iparams[ilist[F_POSRES].iatoms[i]];
 +                for (d = 0; d < DIM; d++)
 +                {
 +                    if (pr->posres.fcA[d] != 0)
 +                    {
 +                        AbsRef[d] = 1;
 +                    }
 +                }
 +            }
 +            for (i = 0; i < ilist[F_FBPOSRES].nr; i += 2)
 +            {
 +                /* Check for flat-bottom posres */
 +                pr = &sys->ffparams.iparams[ilist[F_FBPOSRES].iatoms[i]];
 +                if (pr->fbposres.k != 0)
 +                {
 +                    switch (pr->fbposres.geom)
 +                    {
 +                        case efbposresSPHERE:
 +                            AbsRef[XX] = AbsRef[YY] = AbsRef[ZZ] = 1;
 +                            break;
 +                        case efbposresCYLINDER:
 +                            AbsRef[XX] = AbsRef[YY] = 1;
 +                            break;
 +                        case efbposresX: /* d=XX */
 +                        case efbposresY: /* d=YY */
 +                        case efbposresZ: /* d=ZZ */
 +                            d         = pr->fbposres.geom - efbposresX;
 +                            AbsRef[d] = 1;
 +                            break;
 +                        default:
 +                            gmx_fatal(FARGS, " Invalid geometry for flat-bottom position restraint.\n"
 +                                      "Expected nr between 1 and %d. Found %d\n", efbposresNR-1,
 +                                      pr->fbposres.geom);
 +                    }
 +                }
 +            }
 +        }
 +    }
 +
 +    return (AbsRef[XX] != 0 && AbsRef[YY] != 0 && AbsRef[ZZ] != 0);
 +}
 +
 +void triple_check(const char *mdparin, t_inputrec *ir, gmx_mtop_t *sys,
 +                  warninp_t wi)
 +{
 +    char                      err_buf[256];
 +    int                       i, m, g, nmol, npct;
 +    gmx_bool                  bCharge, bAcc;
 +    real                      gdt_max, *mgrp, mt;
 +    rvec                      acc;
 +    gmx_mtop_atomloop_block_t aloopb;
 +    gmx_mtop_atomloop_all_t   aloop;
 +    t_atom                   *atom;
 +    ivec                      AbsRef;
 +    char                      warn_buf[STRLEN];
 +
 +    set_warning_line(wi, mdparin, -1);
 +
 +    if (EI_DYNAMICS(ir->eI) && !EI_SD(ir->eI) && ir->eI != eiBD &&
 +        ir->comm_mode == ecmNO &&
 +        !(absolute_reference(ir, sys, FALSE, AbsRef) || ir->nsteps <= 10))
 +    {
 +        warning(wi, "You are not using center of mass motion removal (mdp option comm-mode), numerical rounding errors can lead to build up of kinetic energy of the center of mass");
 +    }
 +
 +    /* Check for pressure coupling with absolute position restraints */
 +    if (ir->epc != epcNO && ir->refcoord_scaling == erscNO)
 +    {
 +        absolute_reference(ir, sys, TRUE, AbsRef);
 +        {
 +            for (m = 0; m < DIM; m++)
 +            {
 +                if (AbsRef[m] && norm2(ir->compress[m]) > 0)
 +                {
 +                    warning(wi, "You are using pressure coupling with absolute position restraints, this will give artifacts. Use the refcoord_scaling option.");
 +                    break;
 +                }
 +            }
 +        }
 +    }
 +
 +    bCharge = FALSE;
 +    aloopb  = gmx_mtop_atomloop_block_init(sys);
 +    while (gmx_mtop_atomloop_block_next(aloopb, &atom, &nmol))
 +    {
 +        if (atom->q != 0 || atom->qB != 0)
 +        {
 +            bCharge = TRUE;
 +        }
 +    }
 +
 +    if (!bCharge)
 +    {
 +        if (EEL_FULL(ir->coulombtype))
 +        {
 +            sprintf(err_buf,
 +                    "You are using full electrostatics treatment %s for a system without charges.\n"
 +                    "This costs a lot of performance for just processing zeros, consider using %s instead.\n",
 +                    EELTYPE(ir->coulombtype), EELTYPE(eelCUT));
 +            warning(wi, err_buf);
 +        }
 +    }
 +    else
 +    {
 +        if (ir->coulombtype == eelCUT && ir->rcoulomb > 0 && !ir->implicit_solvent)
 +        {
 +            sprintf(err_buf,
 +                    "You are using a plain Coulomb cut-off, which might produce artifacts.\n"
 +                    "You might want to consider using %s electrostatics.\n",
 +                    EELTYPE(eelPME));
 +            warning_note(wi, err_buf);
 +        }
 +    }
 +
 +    /* Generalized reaction field */
 +    if (ir->opts.ngtc == 0)
 +    {
 +        sprintf(err_buf, "No temperature coupling while using coulombtype %s",
 +                eel_names[eelGRF]);
 +        CHECK(ir->coulombtype == eelGRF);
 +    }
 +    else
 +    {
 +        sprintf(err_buf, "When using coulombtype = %s"
 +                " ref-t for temperature coupling should be > 0",
 +                eel_names[eelGRF]);
 +        CHECK((ir->coulombtype == eelGRF) && (ir->opts.ref_t[0] <= 0));
 +    }
 +
 +    if (ir->eI == eiSD1 &&
 +        (gmx_mtop_ftype_count(sys, F_CONSTR) > 0 ||
 +         gmx_mtop_ftype_count(sys, F_SETTLE) > 0))
 +    {
 +        sprintf(warn_buf, "With constraints integrator %s is less accurate, consider using %s instead", ei_names[ir->eI], ei_names[eiSD2]);
 +        warning_note(wi, warn_buf);
 +    }
 +
 +    bAcc = FALSE;
 +    for (i = 0; (i < sys->groups.grps[egcACC].nr); i++)
 +    {
 +        for (m = 0; (m < DIM); m++)
 +        {
 +            if (fabs(ir->opts.acc[i][m]) > 1e-6)
 +            {
 +                bAcc = TRUE;
 +            }
 +        }
 +    }
 +    if (bAcc)
 +    {
 +        clear_rvec(acc);
 +        snew(mgrp, sys->groups.grps[egcACC].nr);
 +        aloop = gmx_mtop_atomloop_all_init(sys);
 +        while (gmx_mtop_atomloop_all_next(aloop, &i, &atom))
 +        {
 +            mgrp[ggrpnr(&sys->groups, egcACC, i)] += atom->m;
 +        }
 +        mt = 0.0;
 +        for (i = 0; (i < sys->groups.grps[egcACC].nr); i++)
 +        {
 +            for (m = 0; (m < DIM); m++)
 +            {
 +                acc[m] += ir->opts.acc[i][m]*mgrp[i];
 +            }
 +            mt += mgrp[i];
 +        }
 +        for (m = 0; (m < DIM); m++)
 +        {
 +            if (fabs(acc[m]) > 1e-6)
 +            {
 +                const char *dim[DIM] = { "X", "Y", "Z" };
 +                fprintf(stderr,
 +                        "Net Acceleration in %s direction, will %s be corrected\n",
 +                        dim[m], ir->nstcomm != 0 ? "" : "not");
 +                if (ir->nstcomm != 0 && m < ndof_com(ir))
 +                {
 +                    acc[m] /= mt;
 +                    for (i = 0; (i < sys->groups.grps[egcACC].nr); i++)
 +                    {
 +                        ir->opts.acc[i][m] -= acc[m];
 +                    }
 +                }
 +            }
 +        }
 +        sfree(mgrp);
 +    }
 +
 +    if (ir->efep != efepNO && ir->fepvals->sc_alpha != 0 &&
 +        !gmx_within_tol(sys->ffparams.reppow, 12.0, 10*GMX_DOUBLE_EPS))
 +    {
 +        gmx_fatal(FARGS, "Soft-core interactions are only supported with VdW repulsion power 12");
 +    }
 +
 +    if (ir->ePull != epullNO)
 +    {
 +        if (ir->pull->grp[0].nat == 0)
 +        {
 +            absolute_reference(ir, sys, FALSE, AbsRef);
 +            for (m = 0; m < DIM; m++)
 +            {
 +                if (ir->pull->dim[m] && !AbsRef[m])
 +                {
 +                    warning(wi, "You are using an absolute reference for pulling, but the rest of the system does not have an absolute reference. This will lead to artifacts.");
 +                    break;
 +                }
 +            }
 +        }
 +
 +        if (ir->pull->eGeom == epullgDIRPBC)
 +        {
 +            for (i = 0; i < 3; i++)
 +            {
 +                for (m = 0; m <= i; m++)
 +                {
 +                    if ((ir->epc != epcNO && ir->compress[i][m] != 0) ||
 +                        ir->deform[i][m] != 0)
 +                    {
 +                        for (g = 1; g < ir->pull->ngrp; g++)
 +                        {
 +                            if (ir->pull->grp[g].vec[m] != 0)
 +                            {
 +                                gmx_fatal(FARGS, "Can not have dynamic box while using pull geometry '%s' (dim %c)", EPULLGEOM(ir->pull->eGeom), 'x'+m);
 +                            }
 +                        }
 +                    }
 +                }
 +            }
 +        }
 +    }
 +
 +    check_disre(sys);
 +}
 +
 +void double_check(t_inputrec *ir, matrix box, gmx_bool bConstr, warninp_t wi)
 +{
 +    real        min_size;
 +    gmx_bool    bTWIN;
 +    char        warn_buf[STRLEN];
 +    const char *ptr;
 +
 +    ptr = check_box(ir->ePBC, box);
 +    if (ptr)
 +    {
 +        warning_error(wi, ptr);
 +    }
 +
 +    if (bConstr && ir->eConstrAlg == econtSHAKE)
 +    {
 +        if (ir->shake_tol <= 0.0)
 +        {
 +            sprintf(warn_buf, "ERROR: shake-tol must be > 0 instead of %g\n",
 +                    ir->shake_tol);
 +            warning_error(wi, warn_buf);
 +        }
 +
 +        if (IR_TWINRANGE(*ir) && ir->nstlist > 1)
 +        {
 +            sprintf(warn_buf, "With twin-range cut-off's and SHAKE the virial and the pressure are incorrect.");
 +            if (ir->epc == epcNO)
 +            {
 +                warning(wi, warn_buf);
 +            }
 +            else
 +            {
 +                warning_error(wi, warn_buf);
 +            }
 +        }
 +    }
 +
 +    if ( (ir->eConstrAlg == econtLINCS) && bConstr)
 +    {
 +        /* If we have Lincs constraints: */
 +        if (ir->eI == eiMD && ir->etc == etcNO &&
 +            ir->eConstrAlg == econtLINCS && ir->nLincsIter == 1)
 +        {
 +            sprintf(warn_buf, "For energy conservation with LINCS, lincs_iter should be 2 or larger.\n");
 +            warning_note(wi, warn_buf);
 +        }
 +
 +        if ((ir->eI == eiCG || ir->eI == eiLBFGS) && (ir->nProjOrder < 8))
 +        {
 +            sprintf(warn_buf, "For accurate %s with LINCS constraints, lincs-order should be 8 or more.", ei_names[ir->eI]);
 +            warning_note(wi, warn_buf);
 +        }
 +        if (ir->epc == epcMTTK)
 +        {
 +            warning_error(wi, "MTTK not compatible with lincs -- use shake instead.");
 +        }
 +    }
 +
 +    if (ir->LincsWarnAngle > 90.0)
 +    {
 +        sprintf(warn_buf, "lincs-warnangle can not be larger than 90 degrees, setting it to 90.\n");
 +        warning(wi, warn_buf);
 +        ir->LincsWarnAngle = 90.0;
 +    }
 +
 +    if (ir->ePBC != epbcNONE)
 +    {
 +        if (ir->nstlist == 0)
 +        {
 +            warning(wi, "With nstlist=0 atoms are only put into the box at step 0, therefore drifting atoms might cause the simulation to crash.");
 +        }
 +        bTWIN = (ir->rlistlong > ir->rlist);
 +        if (ir->ns_type == ensGRID)
 +        {
 +            if (sqr(ir->rlistlong) >= max_cutoff2(ir->ePBC, box))
 +            {
 +                sprintf(warn_buf, "ERROR: The cut-off length is longer than half the shortest box vector or longer than the smallest box diagonal element. Increase the box size or decrease %s.\n",
 +                        bTWIN ? (ir->rcoulomb == ir->rlistlong ? "rcoulomb" : "rvdw") : "rlist");
 +                warning_error(wi, warn_buf);
 +            }
 +        }
 +        else
 +        {
 +            min_size = min(box[XX][XX], min(box[YY][YY], box[ZZ][ZZ]));
 +            if (2*ir->rlistlong >= min_size)
 +            {
 +                sprintf(warn_buf, "ERROR: One of the box lengths is smaller than twice the cut-off length. Increase the box size or decrease rlist.");
 +                warning_error(wi, warn_buf);
 +                if (TRICLINIC(box))
 +                {
 +                    fprintf(stderr, "Grid search might allow larger cut-off's than simple search with triclinic boxes.");
 +                }
 +            }
 +        }
 +    }
 +}
 +
 +void check_chargegroup_radii(const gmx_mtop_t *mtop, const t_inputrec *ir,
 +                             rvec *x,
 +                             warninp_t wi)
 +{
 +    real rvdw1, rvdw2, rcoul1, rcoul2;
 +    char warn_buf[STRLEN];
 +
 +    calc_chargegroup_radii(mtop, x, &rvdw1, &rvdw2, &rcoul1, &rcoul2);
 +
 +    if (rvdw1 > 0)
 +    {
 +        printf("Largest charge group radii for Van der Waals: %5.3f, %5.3f nm\n",
 +               rvdw1, rvdw2);
 +    }
 +    if (rcoul1 > 0)
 +    {
 +        printf("Largest charge group radii for Coulomb:       %5.3f, %5.3f nm\n",
 +               rcoul1, rcoul2);
 +    }
 +
 +    if (ir->rlist > 0)
 +    {
 +        if (rvdw1  + rvdw2  > ir->rlist ||
 +            rcoul1 + rcoul2 > ir->rlist)
 +        {
-             if (EVDW_IS_ZERO_AT_CUTOFF(ir->vdwtype) &&
++            sprintf(warn_buf,
++                    "The sum of the two largest charge group radii (%f) "
++                    "is larger than rlist (%f)\n",
++                    max(rvdw1+rvdw2, rcoul1+rcoul2), ir->rlist);
 +            warning(wi, warn_buf);
 +        }
 +        else
 +        {
 +            /* Here we do not use the zero at cut-off macro,
 +             * since user defined interactions might purposely
 +             * not be zero at the cut-off.
 +             */
-                 sprintf(warn_buf, "The sum of the two largest charge group radii (%f) is larger than %s (%f) - rvdw (%f)\n",
++            if ((EVDW_IS_ZERO_AT_CUTOFF(ir->vdwtype) ||
++                 ir->vdw_modifier != eintmodNONE) &&
 +                rvdw1 + rvdw2 > ir->rlistlong - ir->rvdw)
 +            {
-                         ir->rlistlong, ir->rvdw);
++                sprintf(warn_buf, "The sum of the two largest charge group "
++                        "radii (%f) is larger than %s (%f) - rvdw (%f).\n"
++                        "With exact cut-offs, better performance can be "
++                        "obtained with cutoff-scheme = %s, because it "
++                        "does not use charge groups at all.",
 +                        rvdw1+rvdw2,
 +                        ir->rlistlong > ir->rlist ? "rlistlong" : "rlist",
-             if (EEL_IS_ZERO_AT_CUTOFF(ir->coulombtype) &&
++                        ir->rlistlong, ir->rvdw,
++                        ecutscheme_names[ecutsVERLET]);
 +                if (ir_NVE(ir))
 +                {
 +                    warning(wi, warn_buf);
 +                }
 +                else
 +                {
 +                    warning_note(wi, warn_buf);
 +                }
 +            }
-                 sprintf(warn_buf, "The sum of the two largest charge group radii (%f) is larger than %s (%f) - rcoulomb (%f)\n",
++            if ((EEL_IS_ZERO_AT_CUTOFF(ir->coulombtype) ||
++                 ir->coulomb_modifier != eintmodNONE) &&
 +                rcoul1 + rcoul2 > ir->rlistlong - ir->rcoulomb)
 +            {
-                         ir->rlistlong, ir->rcoulomb);
++                sprintf(warn_buf, "The sum of the two largest charge group radii (%f) is larger than %s (%f) - rcoulomb (%f).\n"
++                        "With exact cut-offs, better performance can be obtained with cutoff-scheme = %s, because it does not use charge groups at all.",
 +                        rcoul1+rcoul2,
 +                        ir->rlistlong > ir->rlist ? "rlistlong" : "rlist",
++                        ir->rlistlong, ir->rcoulomb,
++                        ecutscheme_names[ecutsVERLET]);
 +                if (ir_NVE(ir))
 +                {
 +                    warning(wi, warn_buf);
 +                }
 +                else
 +                {
 +                    warning_note(wi, warn_buf);
 +                }
 +            }
 +        }
 +    }
 +}
index e317d9ee6e1f04bd1721179f6b62aee9ee758592,0000000000000000000000000000000000000000..fdbf854dc4a697cab9a7be52d59bbfdd73abbfc2
mode 100644,000000..100644
--- /dev/null
@@@ -1,295 -1,0 +1,298 @@@
 +/* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
 + *
 + *
 + * This file is part of GROMACS.
 + * Copyright (c) 2012-
 + *
 + * Written by the Gromacs development team under coordination of
 + * David van der Spoel, Berk Hess, and Erik Lindahl.
 + *
 + * This library is free software; you can redistribute it and/or
 + * modify it under the terms of the GNU Lesser General Public License
 + * as published by the Free Software Foundation; either version 2
 + * of the License, or (at your option) any later version.
 + *
 + * To help us fund GROMACS development, we humbly ask that you cite
 + * the research papers on the package. Check out http://www.gromacs.org
 + *
 + * And Hey:
 + * Gnomes, ROck Monsters And Chili Sauce
 + */
 +#ifndef GMX_CPUID_H_
 +#define GMX_CPUID_H_
 +
 +#include <stdio.h>
 +
 +
 +#ifdef __cplusplus
 +extern "C" {
 +#endif
 +#if 0
 +} /* fixes auto-indentation problems */
 +#endif
 +
 +
 +/* Currently identifiable CPU Vendors */
 +enum gmx_cpuid_vendor
 +{
 +    GMX_CPUID_VENDOR_CANNOTDETECT,   /* Should only be used if something fails */
 +    GMX_CPUID_VENDOR_UNKNOWN,
 +    GMX_CPUID_VENDOR_INTEL,
 +    GMX_CPUID_VENDOR_AMD,
++    GMX_CPUID_VENDOR_FUJITSU,
++    GMX_CPUID_VENDOR_IBM,
 +    GMX_CPUID_NVENDORS
 +};
 +
 +
 +/* CPU feature/property list, to be used as indices into the feature array of the
 + * gmx_cpuid_t data structure.
 + *
 + * To facilitate looking things up, we keep this list alphabetical.
 + * The list is NOT exhaustive - we have basically added stuff that might be
 + * useful in an application like Gromacs.
 + *
 + * AMD and Intel tend to share most architectural elements, and even if the
 + * flags might have to be detected in different ways (different cpuid registers),
 + * once the flag is present the functions should be identical. Unfortunately the
 + * trend right now (2012) seems to be that they are diverging. This means that
 + * we need to use specific flags to the compiler to maximize performance, and
 + * then the binaries might not be portable between Intel and AMD as they were
 + * before when we only needed to check for SSE and/or SSE2 support in Gromacs.
 + */
 +enum gmx_cpuid_feature
 +{
 +    GMX_CPUID_FEATURE_CANNOTDETECT,      /* Flag set if we could not detect on this CPU  */
 +    GMX_CPUID_FEATURE_X86_AES,           /* x86 advanced encryption standard accel.      */
 +    GMX_CPUID_FEATURE_X86_APIC,          /* APIC support                                 */
 +    GMX_CPUID_FEATURE_X86_AVX,           /* Advanced vector extensions                   */
 +    GMX_CPUID_FEATURE_X86_AVX2,          /* AVX2 including gather support (not used yet) */
 +    GMX_CPUID_FEATURE_X86_CLFSH,         /* Supports CLFLUSH instruction                 */
 +    GMX_CPUID_FEATURE_X86_CMOV,          /* Conditional move insn support                */
 +    GMX_CPUID_FEATURE_X86_CX8,           /* Supports CMPXCHG8B (8-byte compare-exchange) */
 +    GMX_CPUID_FEATURE_X86_CX16,          /* Supports CMPXCHG16B (16-byte compare-exchg)  */
 +    GMX_CPUID_FEATURE_X86_F16C,          /* Supports 16-bit FP conversion instructions   */
 +    GMX_CPUID_FEATURE_X86_FMA,           /* Fused-multiply add support (mainly for AVX)  */
 +    GMX_CPUID_FEATURE_X86_FMA4,          /* 4-operand FMA, only on AMD for now           */
 +    GMX_CPUID_FEATURE_X86_HTT,           /* Hyper-Threading supported                    */
 +    GMX_CPUID_FEATURE_X86_LAHF_LM,       /* LAHF/SAHF support in 64 bits                 */
 +    GMX_CPUID_FEATURE_X86_MISALIGNSSE,   /* Support for misaligned SSE data instructions */
 +    GMX_CPUID_FEATURE_X86_MMX,           /* MMX registers and instructions               */
 +    GMX_CPUID_FEATURE_X86_MSR,           /* Supports Intel model-specific-registers      */
 +    GMX_CPUID_FEATURE_X86_NONSTOP_TSC,   /* Invariant TSC (constant rate in ACPI states) */
 +    GMX_CPUID_FEATURE_X86_PCID,          /* Process context identifier support           */
 +    GMX_CPUID_FEATURE_X86_PCLMULDQ,      /* Carry-less 64-bit multiplication supported   */
 +    GMX_CPUID_FEATURE_X86_PDCM,          /* Perfmon and Debug Capability                 */
 +    GMX_CPUID_FEATURE_X86_PDPE1GB,       /* Support for 1GB pages                        */
 +    GMX_CPUID_FEATURE_X86_POPCNT,        /* Supports the POPCNT (population count) insn  */
 +    GMX_CPUID_FEATURE_X86_PSE,           /* Supports 4MB-pages (page size extension)     */
 +    GMX_CPUID_FEATURE_X86_RDRND,         /* RDRAND high-quality hardware random numbers  */
 +    GMX_CPUID_FEATURE_X86_RDTSCP,        /* Serializing rdtscp instruction available     */
 +    GMX_CPUID_FEATURE_X86_SSE2,          /* SSE 2                                        */
 +    GMX_CPUID_FEATURE_X86_SSE3,          /* SSE 3                                        */
 +    GMX_CPUID_FEATURE_X86_SSE4A,         /* SSE 4A                                       */
 +    GMX_CPUID_FEATURE_X86_SSE4_1,        /* SSE 4.1                                      */
 +    GMX_CPUID_FEATURE_X86_SSE4_2,        /* SSE 4.2                                      */
 +    GMX_CPUID_FEATURE_X86_SSSE3,         /* Supplemental SSE3                            */
 +    GMX_CPUID_FEATURE_X86_TDT,           /* TSC deadline timer                           */
 +    GMX_CPUID_FEATURE_X86_X2APIC,        /* Extended xAPIC Support                       */
 +    GMX_CPUID_FEATURE_X86_XOP,           /* AMD extended instructions, only AMD for now  */
 +    GMX_CPUID_NFEATURES
 +};
 +
 +
 +/* Currently supported acceleration instruction sets, intrinsics or other similar combinations
 + * in Gromacs. There is not always a 1-to-1 correspondence with feature flags; on some AMD
 + * hardware we prefer to use 128bit AVX instructions (although 256-bit ones could be executed),
 + * and we still haven't written the AVX2 kernels.
 + */
 +enum gmx_cpuid_acceleration
 +{
 +    GMX_CPUID_ACCELERATION_CANNOTDETECT,    /* Should only be used if something fails */
 +    GMX_CPUID_ACCELERATION_NONE,
 +    GMX_CPUID_ACCELERATION_X86_SSE2,
 +    GMX_CPUID_ACCELERATION_X86_SSE4_1,
 +    GMX_CPUID_ACCELERATION_X86_AVX_128_FMA,
 +    GMX_CPUID_ACCELERATION_X86_AVX_256,
++    GMX_CPUID_ACCELERATION_SPARC64_HPC_ACE,
 +    GMX_CPUID_NACCELERATIONS
 +};
 +
 +/* Text strings corresponding to CPU vendors */
 +extern const char *
 +gmx_cpuid_vendor_string[GMX_CPUID_NVENDORS];
 +
 +/* Text strings for CPU feature indices */
 +extern const char *
 +gmx_cpuid_feature_string[GMX_CPUID_NFEATURES];
 +
 +/* Text strings for Gromacs acceleration/instruction sets */
 +extern const char *
 +gmx_cpuid_acceleration_string[GMX_CPUID_NACCELERATIONS];
 +
 +
 +/* Abstract data type with CPU detection information. Set by gmx_cpuid_init(). */
 +typedef struct gmx_cpuid *
 +    gmx_cpuid_t;
 +
 +
 +/* Fill the data structure by using CPU detection instructions.
 + * Return 0 on success, 1 if something bad happened.
 + */
 +int
 +gmx_cpuid_init              (gmx_cpuid_t *              cpuid);
 +
 +
 +/* Return the vendor id as enumerated type. Use gmx_cpuid_vendor_string[]
 + * to get the corresponding text string.
 + */
 +enum gmx_cpuid_vendor
 +gmx_cpuid_vendor            (gmx_cpuid_t                cpuid);
 +
 +
 +/* Return a constant pointer to the processor brand string. */
 +const char *
 +gmx_cpuid_brand             (gmx_cpuid_t                cpuid);
 +
 +
 +/* Return processor family version. For a chip of version 1.2.3, this is 1 */
 +int
 +gmx_cpuid_family            (gmx_cpuid_t                cpuid);
 +
 +/* Return processor model version. For a chip of version 1.2.3, this is 2. */
 +int
 +gmx_cpuid_model             (gmx_cpuid_t                cpuid);
 +
 +/* Return processor stepping version. For a chip of version 1.2.3, this is 3. */
 +int
 +gmx_cpuid_stepping          (gmx_cpuid_t                cpuid);
 +
 +
 +/* Check whether a particular CPUID feature is set.
 + * Returns 0 if flag "feature" is not set, 1 if the flag is set. We cannot use
 + * gmx_bool here since this file must be possible to compile without simple.h.
 + */
 +int
 +gmx_cpuid_feature           (gmx_cpuid_t                cpuid,
 +                             enum gmx_cpuid_feature     feature);
 +
 +
 +/* Return pointers to cpu topology information.
 + *
 + * Important: CPU topology requires more OS support than most other
 + * functions in this file, including support for thread pinning to hardware.
 + * This means it will not work on some platforms, including e.g. Mac OS X.
 + * Thus, it is IMPERATIVE that you check the return value from this routine
 + * before doing anything with the information. It is only if the return
 + * value is zero that the data is valid.
 + *
 + * For the returned values we have:
 + * - nprocessors         Total number of logical processors reported by OS
 + * - npackages           Usually number of CPU sockets
 + * - ncores_per_package  Number of cores in each package
 + * - nhwthreads_per_core Number of hardware threads per core; 2 for hyperthreading.
 + * - package_id          Array with the package index for each logical cpu
 + * - core_id             Array with local core index for each logical cpu
 + * - hwthread_id         Array with local hwthread index for each logical cpu
 + * - locality_order      Array with logical cpu numbers, sorted in order
 + *                       of physical and logical locality in the system.
 + *
 + * All arrays are of length nprocessors.
 + */
 +int
 +gmx_cpuid_topology(gmx_cpuid_t        cpuid,
 +                   int *              nprocessors,
 +                   int *              npackages,
 +                   int *              ncores_per_package,
 +                   int *              nhwthreads_per_core,
 +                   const int **       package_id,
 +                   const int **       core_id,
 +                   const int **       hwthread_id,
 +                   const int **       locality_order);
 +
 +/* Enumerated values for x86 SMT enabled-status. Note that this does not refer
 + * to Hyper-Threading support (that is the flag GMX_CPUID_FEATURE_X86_HTT), but
 + * whether Hyper-Threading is _enabled_ and _used_ in bios right now.
 + */
 +enum gmx_cpuid_x86_smt
 +{
 +    GMX_CPUID_X86_SMT_CANNOTDETECT,
 +    GMX_CPUID_X86_SMT_DISABLED,
 +    GMX_CPUID_X86_SMT_ENABLED
 +};
 +
 +/* Returns the status of x86 SMT support. IMPORTANT: There are non-zero
 + * return values for this routine that still do not indicate supported and
 + * enabled smt/Hyper-Threading. You need to carefully check the return value
 + * against the enumerated type values to see what you are getting.
 + *
 + * Long-term, this functionality will move to a new hardware topology detection
 + * layer, but that will require a lot of new code and a working interface to the
 + * hwloc library. Surprisingly, there is no simple way to find out that
 + * Hyper-Threading is actually turned on without fully enumerating and checking
 + * all the cores, which we presently can only do on Linux. This means a couple
 + * of things:
 + *
 + * 1) If you want to know whether your CPU _supports_ Hyper-Threading in the
 + *    first place, check the GMX_CPUID_FEATURE_X86_HTT flag instead!
 + * 2) There are several scenarios where this routine will say that it cannot
 + *    detect whether SMT is enabled and used right now.
 + * 3) If you need support on non-Linux x86, you have to write it :-)
 + * 4) Don't invest too much effort, since this will be replaced with
 + *    full hardware topology detection in the future.
 + * 5) Don't worry if the detection does not work. It is not a catastrophe, but
 + *    we get slightly better performance on x86 if we use Hyper-Threading
 + *    cores in direct space, but not reciprocal space.
 + *
 + * Since this routine presently only supports Hyper-Threading we say X86_SMT
 + * in order not to give the impression we can detect any SMT. We haven't
 + * even tested the performance on other SMT implementations, so it is not
 + * obvious we shouldn't use SMT there.
 + *
 + * Note that you can get more complete topology information from
 + * gmx_cpuid_topology(), although that requires slightly more OS support.
 + */
 +enum gmx_cpuid_x86_smt
 +gmx_cpuid_x86_smt(gmx_cpuid_t cpuid);
 +
 +
 +/* Formats a text string (up to n characters) from the data structure.
 + * The output will have max 80 chars between newline characters.
 + */
 +int
 +gmx_cpuid_formatstring      (gmx_cpuid_t                cpuid,
 +                             char *                     s,
 +                             int                        n);
 +
 +
 +/* Suggests a suitable gromacs acceleration based on the support in the
 + * hardware.
 + */
 +enum gmx_cpuid_acceleration
 +gmx_cpuid_acceleration_suggest  (gmx_cpuid_t                    cpuid);
 +
 +
 +/* Check if this binary was compiled with the same acceleration as we
 + * would suggest for the current hardware. Always print stats to the log file
 + * if it is non-NULL, and print a warning in stdout if we don't have a match.
 + */
 +int
 +gmx_cpuid_acceleration_check    (gmx_cpuid_t                cpuid,
 +                                 FILE *                     log);
 +
 +
 +/* Release resources used by data structure. Note that the pointer to the
 + * CPU brand string will no longer be valid once this routine has been called.
 + */
 +void
 +gmx_cpuid_done              (gmx_cpuid_t                cpuid);
 +
 +
 +
 +
 +#ifdef __cplusplus
 +}
 +#endif
 +
 +
 +#endif /* GMX_CPUID_H_ */
index 6ac5fff78c6ff146950789e25402b8c686541db8,b6d8d9a333c21c7a909285f95697866b12a2599d..b6d8d9a333c21c7a909285f95697866b12a2599d
@@@ -86,25 -86,22 +86,22 @@@ extern "C
  } /* Avoids screwing up auto-indentation */
  #endif
  
+ /* first check for gcc/icc platforms.
+    Some compatible compilers, like icc on linux+mac will take this path,
+    too */
+ #if ( (defined(__GNUC__) || defined(__PATHSCALE__) || defined(__PGI)) && \
+     (!defined(__xlc__)) && (!defined(TMPI_TEST_NO_ATOMICS)) )
  #ifdef __GNUC__
  #define TMPI_GCC_VERSION (__GNUC__ * 10000 \
                            + __GNUC_MINOR__ * 100 \
                            + __GNUC_PATCHLEVEL__)
  #endif
  
- /* first check for gcc/icc platforms.
-    Some compatible compilers, like icc on linux+mac will take this path,
-    too */
- #if ( (defined(__GNUC__) || defined(__PATHSCALE__) || defined(__PGI)) && (!defined(__xlc__)) )
  /* now check specifically for several architectures: */
  #if ((defined(i386) || defined(__x86_64__)) && !defined(__OPEN64__))
  /* first x86: */
  #include "atomic/gcc_x86.h"
- /*#include "atomic/gcc.h"*/
  
  #elif (defined(__ia64__))
  /* then ia64: */
  /*#elif (defined(__powerpc__) || (defined(__ppc__)) )*/
  /*#include "atomic/gcc_ppc.h"*/
  
+ #elif defined(__FUJITSU) && ( defined(__sparc_v9__) || defined (__sparcv9) )
+ /* Fujitsu FX10 SPARC compiler */
+ #include "atomic/fujitsu_sparc.h"
  #else
  /* otherwise, there's a generic gcc intrinsics version: */
  #include "atomic/gcc.h"
  #endif /* end of check for gcc specific architectures */
  
  /* not gcc: */
- #elif (defined(_MSC_VER) && (_MSC_VER >= 1200))
+ #elif (defined(_MSC_VER) && (_MSC_VER >= 1200) && \
+     (!defined(TMPI_TEST_NO_ATOMICS)) )
  /* Microsoft Visual C on x86, define taken from FFTW who got it from
     Morten Nissov. icc on windows will take this path.  */
  #include "atomic/msvc.h"
  
  #elif ( (defined(__IBM_GCC_ASM) || defined(__IBM_STDCPP_ASM))  && \
-     (defined(__powerpc__) || defined(__ppc__)))
+     (defined(__powerpc__) || defined(__ppc__)) && \
+     (!defined(TMPI_TEST_NO_ATOMICS)) )
  
  /* PowerPC using xlC intrinsics.  */
  
  #include "atomic/xlc_ppc.h"
  
- #elif defined(__xlC__)  || defined(__xlc__)
+ #elif ( ( defined(__xlC__)  || defined(__xlc__) ) && \
+     (!defined(TMPI_TEST_NO_ATOMICS)) )
  /* IBM xlC compiler */
  #include "atomic/xlc_ppc.h"
  
  
- #elif defined (__sun) && (defined(__sparcv9) || defined(__sparc))
+ #elif (defined (__sun) && (defined(__sparcv9) || defined(__sparc)) && \
+     (!defined(TMPI_TEST_NO_ATOMICS)) )
  /* Solaris on SPARC (Sun C Compiler, Solaris Studio) */
  #include "atomic/suncc-sparc.h"
  
+ #elif defined(__FUJITSU) && defined(__sparc__)
  
+ /* Fujitsu FX10 SPARC compiler requires gcc compatibility with -Xg */
+ #error Atomics support for Fujitsu FX10 compiler requires -Xg (gcc compatibility)
  
  
  #else
  #error No atomic operations implemented for this cpu/compiler combination.
  #endif
  
+ #ifndef DOXYGEN
  /** Indicates that no support for atomic operations is present. */
  #define TMPI_NO_ATOMICS
+ #endif
  
  /** Memory barrier operation
  
   */
  #define tMPI_Atomic_memory_barrier_rel()
  
- /** System mutex used for locking to guarantee atomicity */
- static tMPI_Thread_mutex_t tMPI_Atomic_mutex = TMPI_THREAD_MUTEX_INITIALIZER;
+ #ifndef DOXYGEN
+ /* signal that they exist */
+ #define TMPI_HAVE_ACQ_REL_BARRIERS
+ #endif
  
  /** Atomic operations datatype
   *
   *  - PowerPC, using GNU compilers
   *  - PowerPC, using IBM AIX compilers
   *  - PowerPC, using IBM compilers >=7.0 under Linux or Mac OS X.
+  *  - Sparc64, using Fujitsu compilers.
   *
   * \see
   * - tMPI_Atomic_get
   */
  typedef struct tMPI_Atomic
  {
-     int value;  /**< The atomic value. */
+     int value; /**< The atomic value.*/
  }
  tMPI_Atomic_t;
  
   */
  typedef struct tMPI_Atomic_ptr
  {
-     void* value;  /**< The atomic pointer value. */
+     void *value; /**< The atomic pointer. */
  }
  tMPI_Atomic_ptr_t;
  
   * - tMPI_Spinlock_trylock
   * - tMPI_Spinlock_wait
   */
- typedef struct
- {
- #ifndef DOXYGEN
-     tMPI_Thread_mutex_t lock; /* we don't want this documented */
- #endif
- } tMPI_Spinlock_t;
- /*#define tMPI_Spinlock_t     tMPI_Thread_mutex_t*/
+ typedef struct tMPI_Spinlock *tMPI_Spinlock_t;
  
  /*! \def TMPI_SPINLOCK_INITIALIZER
   * \brief Spinlock static initializer
   *
   *  \hideinitializer
   */
- #  define TMPI_SPINLOCK_INITIALIZER   { TMPI_THREAD_MUTEX_INITIALIZER }
+ #define TMPI_SPINLOCK_INITIALIZER   { NULL }
  
  /* Since mutexes guarantee memory barriers this works fine */
  /** Return value of an atomic integer
   *
   *  \hideinitializer
   */
- #ifdef DOXYGEN
- static inline int tMPI_Atomic_get(tMPI_Atomic_t &a);
- #else
- #define tMPI_Atomic_get(a)   ((a)->value)
- #endif
+ TMPI_EXPORT
+ int tMPI_Atomic_get(const tMPI_Atomic_t *a);
  
  /** Write value to an atomic integer
   *
   *  \hideinitializer
   */
  TMPI_EXPORT
- static inline void tMPI_Atomic_set(tMPI_Atomic_t *a, int i)
- {
-     /* Mutexes here are necessary to guarantee memory visibility */
-     tMPI_Thread_mutex_lock(&tMPI_Atomic_mutex);
-     a->value = i;
-     tMPI_Thread_mutex_unlock(&tMPI_Atomic_mutex);
- }
+ void tMPI_Atomic_set(tMPI_Atomic_t *a, int i);
  
  
  /** Return value of an atomic pointer
   *
   *  \hideinitializer
   */
- #ifdef DOXYGEN
- static inline void* tMPI_Atomic_ptr_get(tMPI_Atomic_ptr_t &a);
- #else
- #define tMPI_Atomic_ptr_get(a)   ((a)->value)
- #endif
+ TMPI_EXPORT
+ void* tMPI_Atomic_ptr_get(const tMPI_Atomic_ptr_t *a);
  
  
  
   *  \hideinitializer
   */
  TMPI_EXPORT
- static inline void tMPI_Atomic_ptr_set(tMPI_Atomic_t *a, void *p)
- {
-     /* Mutexes here are necessary to guarantee memory visibility */
-     tMPI_Thread_mutex_lock(&tMPI_Atomic_mutex);
-     a->value = (void*)p;
-     tMPI_Thread_mutex_unlock(&tMPI_Atomic_mutex);
- }
+ void tMPI_Atomic_ptr_set(tMPI_Atomic_ptr_t *a, void *p);
  
  /** Add integer to atomic variable
   *
   *  \return The new value (after summation).
   */
  TMPI_EXPORT
- static inline int tMPI_Atomic_add_return(tMPI_Atomic_t *a, int i)
- {
-     int t;
-     tMPI_Thread_mutex_lock(&tMPI_Atomic_mutex);
-     t        = a->value + i;
-     a->value = t;
-     tMPI_Thread_mutex_unlock(&tMPI_Atomic_mutex);
-     return t;
- }
+ int tMPI_Atomic_add_return(tMPI_Atomic_t *a, int i);
+ #ifndef DOXYGEN
+ #define TMPI_ATOMIC_HAVE_NATIVE_ADD_RETURN
+ #endif
  
  
  
   *  \return    The value of the atomic variable before addition.
   */
  TMPI_EXPORT
- static inline int tMPI_Atomic_fetch_add(tMPI_Atomic_t *a, int i)
- {
-     int old_value;
-     tMPI_Thread_mutex_lock(&tMPI_Atomic_mutex);
-     old_value  = a->value;
-     a->value   = old_value + i;
-     tMPI_Thread_mutex_unlock(&tMPI_Atomic_mutex);
-     return old_value;
- }
+ int tMPI_Atomic_fetch_add(tMPI_Atomic_t *a, int i);
+ #ifndef DOXYGEN
+ #define TMPI_ATOMIC_HAVE_NATIVE_FETCH_ADD
+ #endif
  
  
  
   *   \note   The exchange occured if the return value is identical to \a old.
   */
  TMPI_EXPORT
- static inline int tMPI_Atomic_cas(tMPI_Atomic_t *a, int old_val, int new_val)
- {
-     int t = 0;
-     tMPI_Thread_mutex_lock(&tMPI_Atomic_mutex);
-     if (a->value == old_val)
-     {
-         a->value = new_val;
-         t        = 1;
-     }
-     tMPI_Thread_mutex_unlock(&tMPI_Atomic_mutex);
-     return t;
- }
+ int tMPI_Atomic_cas(tMPI_Atomic_t *a, int old_val, int new_val);
  
  
  
   *   \note   The exchange occured if the return value is identical to \a old.
   */
  TMPI_EXPORT
- static inline int tMPI_Atomic_ptr_cas(tMPI_Atomic_ptr_t * a, void *old_val,
-                                       void *new_val)
- {
-     int t = 0;
-     tMPI_Thread_mutex_lock(&tMPI_Atomic_mutex);
-     if (a->value == old_val)
-     {
-         a->value = new_val;
-         t        = 1;
-     }
-     tMPI_Thread_mutex_unlock(&tMPI_Atomic_mutex);
-     return t;
- }
+ int tMPI_Atomic_ptr_cas(tMPI_Atomic_ptr_t * a, void *old_val,
+                         void *new_val);
+ /** Atomic swap operation.
+    Atomically swaps the data in the tMPI_Atomic_t operand with the value of b.
+    Note: This has no good assembly counterparts on many architectures, so
+          it might not be faster than a repeated CAS.
+    \param a  Pointer to atomic type
+    \param b  Value to swap
+    \return the original value of a
+  */
+ TMPI_EXPORT
+ int tMPI_Atomic_swap(tMPI_Atomic_t *a, int b);
+ /** Atomic swap pointer operation.
+    Atomically swaps the pointer in the tMPI_Atomic_ptr_t operand with the
+    value of b.
+    Note: This has no good assembly counterparts on many architectures, so
+          it might not be faster than a repeated CAS.
+    \param a  Pointer to atomic type
+    \param b  Value to swap
+    \return the original value of a
+  */
+ TMPI_EXPORT
+ void *tMPI_Atomic_ptr_swap(tMPI_Atomic_ptr_t *a, void *b);
+ #ifndef DOXYGEN
+ #define TMPI_ATOMIC_HAVE_NATIVE_SWAP
+ #endif
  
  
  /** Initialize spinlock
   *
   *  \hideinitializer
   */
- #ifdef DOXYGEN
- void tMPI_Spinlock_init( tMPI_Spinlock_t &x);
- #else
- #define tMPI_Spinlock_init(x)       tMPI_Thread_mutex_init((x)->lock)
+ TMPI_EXPORT
+ void tMPI_Spinlock_init( tMPI_Spinlock_t *x);
+ #ifndef DOXYGEN
+ #define TMPI_ATOMIC_HAVE_NATIVE_SPINLOCK
  #endif
  
  /** Acquire spinlock
   *
   *  \param x     Spinlock pointer
   */
- #ifdef DOXYGEN
- void tMPI_Spinlock_lock( tMPI_Spinlock_t &x);
- #else
- #define tMPI_Spinlock_lock(x)       tMPI_Thread_mutex_lock((x)->lock)
- #endif
+ TMPI_EXPORT
+ void tMPI_Spinlock_lock( tMPI_Spinlock_t *x);
  
  
  /** Attempt to acquire spinlock
   * \return 0 if the mutex was available so we could lock it,
   *         otherwise a non-zero integer (1) if the lock is busy.
   */
- #ifdef DOXYGEN
- void tMPI_Spinlock_trylock( tMPI_Spinlock_t &x);
- #else
- #define tMPI_Spinlock_trylock(x)    tMPI_Thread_mutex_trylock((x)->lock)
- #endif
+ TMPI_EXPORT
+ int tMPI_Spinlock_trylock( tMPI_Spinlock_t *x);
  
  /** Release spinlock
   *
   *
   *  Unlocks the spinlock, regardless if which thread locked it.
   */
- #ifdef DOXYGEN
- void tMPI_Spinlock_unlock( tMPI_Spinlock_t &x);
- #else
- #define tMPI_Spinlock_unlock(x)     tMPI_Thread_mutex_unlock((x)->lock)
- #endif
+ TMPI_EXPORT
+ void tMPI_Spinlock_unlock( tMPI_Spinlock_t *x);
  
  
  
   *  \return 1 if the spinlock is locked, 0 otherwise.
   */
  TMPI_EXPORT
- static inline int tMPI_Spinlock_islocked(const tMPI_Spinlock_t *x)
- {
-     if (tMPI_Spinlock_trylock(x) != 0)
-     {
-         /* It was locked */
-         return 1;
-     }
-     else
-     {
-         /* We just locked it */
-         tMPI_Spinlock_unlock(x);
-         return 0;
-     }
- }
+ int tMPI_Spinlock_islocked( tMPI_Spinlock_t *x);
  
  /** Wait for a spinlock to become available
   *
   *  \param x  Spinlock pointer
   */
  TMPI_EXPORT
- static inline void tMPI_Spinlock_wait(tMPI_Spinlock_t *x)
- {
-     tMPI_Spinlock_lock(x);
-     /* Got the lock now, so the waiting is over */
-     tMPI_Spinlock_unlock(x);
- }
- #endif
- /* only do this if there was no better solution */
- #ifndef TMPI_HAVE_SWAP
- /** Atomic swap operation.
-    Atomically swaps the data in the tMPI_Atomic_t operand with the value of b.
-    NOTE: DON'T USE YET! (This has no good asm counterparts on many architectures).
-    \param a  Pointer to atomic type
-    \param b  Value to swap
-    \return the original value of a
-  */
- TMPI_EXPORT
- static inline int tMPI_Atomic_swap(tMPI_Atomic_t *a, int b)
- {
-     int oldval;
-     do
-     {
-         oldval = (int)(a->value);
-     }
-     while (!tMPI_Atomic_cas(a, oldval, b));
-     return oldval;
- }
- /** Atomic swap pointer operation.
-    Atomically swaps the pointer in the tMPI_Atomic_ptr_t operand with the
-    value of b.
-    NOTE: DON'T USE YET! (This has no good asm counterparts on many architectures).
-    \param a  Pointer to atomic type
-    \param b  Value to swap
-    \return the original value of a
-  */
- TMPI_EXPORT
- static inline void *tMPI_Atomic_ptr_swap(tMPI_Atomic_ptr_t *a, void *b)
- {
-     void *oldval;
-     do
-     {
-         oldval = (void*)(a->value);
-     }
-     while (!tMPI_Atomic_ptr_cas(a, oldval, b));
-     return oldval;
- }
- #endif
+ void tMPI_Spinlock_wait(tMPI_Spinlock_t *x);
  
- /* only define this if there were no separate acquire and release barriers */
- #ifndef TMPI_HAVE_ACQ_REL_BARRIERS
  
- /* if they're not defined explicitly, we just make full barriers out of both */
- #define tMPI_Atomic_memory_barrier_acq tMPI_Atomic_memory_barrier
- #define tMPI_Atomic_memory_barrier_rel tMPI_Atomic_memory_barrier
+ #endif /* platform-specific checks */
  
- #endif
+ /* now define all the atomics that are not available natively. These
+    are done on the assumption that a native CAS does exist. */
+ #include "atomic/derived.h"
  
  /* this allows us to use the inline keyword without breaking support for
     some compilers that don't support it: */
  #undef inline
  #endif
  
+ #if !defined(TMPI_NO_ATOMICS) && !defined(TMPI_ATOMICS)
+ /* Set it here to make sure the user code can check this without having to have
+    a config.h */
+ /** Indicates that support for atomic operations is present. */
+ #define TMPI_ATOMICS
+ #endif
  
  #ifdef __cplusplus
  }
index 9d477d441ce84f968da705ea43ccb38b3c391bb2,a4c7f2787da24f60980f9f6471852e3d3f0a1325..a4c7f2787da24f60980f9f6471852e3d3f0a1325
@@@ -7,13 -7,13 +7,13 @@@
  #if ((defined(__GNUC__) || defined(__INTEL_COMPILER) || defined(__PATHSCALE__)  || defined(__PGIC__)) && (defined(__i386__) || defined(__x86_64__)))
  #define TMPI_CYCLE_COUNT
  /* x86 or x86-64 with GCC inline assembly */
- typedef unsigned long long tmpi_cycles_t;
+ typedef unsigned long long tMPI_Cycles_t;
  
- static __inline__ tmpi_cycles_t tmpi_cycles_read(void)
+ static __inline__ tMPI_Cycles_t tMPI_Cycles_read(void)
  {
      /* x86 with GCC inline assembly - pentium TSC register */
-     tmpi_cycles_t   cycle;
-     unsigned        low, high;
+     tMPI_Cycles_t cycle;
+     unsigned      low, high;
  
  #ifdef HAVE_RDTSCP
      __asm__ __volatile__("rdtscp" : "=a" (low), "=d" (high) :: "ecx" );
  }
  #elif (defined(__INTEL_COMPILER) && defined(__ia64__))
  #define TMPI_CYCLE_COUNT
- typedef unsigned long tmpi_cycles_t;
- static __inline__ tmpi_cycles_t tmpi_cycles_read(void)
+ typedef unsigned long tMPI_Cycles_t;
+ static __inline__ tMPI_Cycles_t tMPI_Cycles_read(void)
  {
      /* Intel compiler on ia64 */
      return __getReg(_IA64_REG_AR_ITC);
  }
  #elif defined(__GNUC__) && defined(__ia64__)
  #define TMPI_CYCLE_COUNT
- typedef unsigned long tmpi_cycles_t;
- static __inline__ tmpi_cycles_t tmpi_cycles_read(void)
+ typedef unsigned long tMPI_Cycles_t;
+ static __inline__ tMPI_Cycles_t tMPI_Cycles_read(void)
  {
      /* ia64 with GCC inline assembly */
-     tmpi_cycles_t ret;
+     tMPI_Cycles_t ret;
      __asm__ __volatile__ ("mov %0=ar.itc" : "=r" (ret));
      return ret;
  }
  #elif defined(_MSC_VER)
  #define TMPI_CYCLE_COUNT
- typedef __int64 tmpi_cycles_t;
- static __inline tmpi_cycles_t tmpi_cycles_read(void)
+ typedef __int64 tMPI_Cycles_t;
+ static __inline tMPI_Cycles_t tMPI_Cycles_read(void)
  {
  #ifdef HAVE_RDTSCP
      unsigned int ui;
index 0000000000000000000000000000000000000000,68b0fe5873c6eeb21ed5a9707360f1cd24040e54..68b0fe5873c6eeb21ed5a9707360f1cd24040e54
mode 000000,100644..100644
--- /dev/null
@@@ -1,0 -1,186 +1,186 @@@
+ /*
+    This source code file is part of thread_mpi.
+    Written by Sander Pronk, Erik Lindahl, and possibly others.
+    Copyright (c) 2013, Sander Pronk, Erik Lindahl.
+    All rights reserved.
+    Redistribution and use in source and binary forms, with or without
+    modification, are permitted provided that the following conditions are met:
+    1) Redistributions of source code must retain the above copyright
+    notice, this list of conditions and the following disclaimer.
+    2) Redistributions in binary form must reproduce the above copyright
+    notice, this list of conditions and the following disclaimer in the
+    documentation and/or other materials provided with the distribution.
+    3) Neither the name of the copyright holders nor the
+    names of its contributors may be used to endorse or promote products
+    derived from this software without specific prior written permission.
+    THIS SOFTWARE IS PROVIDED BY US ''AS IS'' AND ANY
+    EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+    WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+    DISCLAIMED. IN NO EVENT SHALL WE BE LIABLE FOR ANY
+    DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+    (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+    LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+    ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+    (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+    SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+    If you want to redistribute modifications, please consider that
+    scientific software is very special. Version control is crucial -
+    bugs must be traceable. We will be happy to consider code for
+    inclusion in the official distribution, but derived work should not
+    be called official thread_mpi. Details are found in the README & COPYING
+    files.
+  */
+ /* These functions are fallback definitions for when there are no native
+    variants for fetch-add, spinlock, etc., but there is a native
+    compare-and-swap. */
+ /* only define this if there were no separate acquire and release barriers */
+ #ifndef TMPI_HAVE_ACQ_REL_BARRIERS
+ /* if they're not defined explicitly, we just make full barriers out of both */
+ #define tMPI_Atomic_memory_barrier_acq tMPI_Atomic_memory_barrier
+ #define tMPI_Atomic_memory_barrier_rel tMPI_Atomic_memory_barrier
+ #endif
+ #ifndef TMPI_ATOMIC_HAVE_NATIVE_FETCH_ADD
+ /** Fallback atomic fetch-and-add, built on compare-and-swap.
+    Compiled only when the platform header did not define
+    TMPI_ATOMIC_HAVE_NATIVE_FETCH_ADD. Reads the current value and retries
+    tMPI_Atomic_cas() until replacing oldval by oldval+i succeeds.
+    \param a  Pointer to atomic type
+    \param i  Integer to add to the value of a
+    \return the value of a that was replaced (i.e. before the addition)
+  */
+ TMPI_EXPORT
+ static inline int tMPI_Atomic_fetch_add(tMPI_Atomic_t *a, int i)
+ {
+     int newval, oldval;
+     do
+     {
+         /* acquire barrier before every (re-)read of the current value */
+         tMPI_Atomic_memory_barrier_acq();
+         oldval = tMPI_Atomic_get(a);
+         newval = oldval + i;
+     }
+     /* retry when the CAS reports failure (a changed since it was read) */
+     while (!tMPI_Atomic_cas(a, oldval, newval));
+     tMPI_Atomic_memory_barrier_rel();
+     return oldval;
+ }
+ #endif /* TMPI_ATOMIC_HAVE_NATIVE_FETCH_ADD */
+ #ifndef TMPI_ATOMIC_HAVE_NATIVE_ADD_RETURN
+ /** Fallback atomic add-and-return.
+    Compiled only when the platform header did not define
+    TMPI_ATOMIC_HAVE_NATIVE_ADD_RETURN.
+    \param a  Pointer to atomic type
+    \param i  Integer to add to the value of a
+    \return the value returned by tMPI_Atomic_fetch_add() plus i, i.e. the
+            value of a after the addition
+  */
+ TMPI_EXPORT
+ static inline int tMPI_Atomic_add_return(tMPI_Atomic_t *a, int i)
+ {
+     /* implement in terms of fetch-add */
+     return tMPI_Atomic_fetch_add(a, i) + i;
+ }
+ #endif /* TMPI_ATOMIC_HAVE_NATIVE_ADD_RETURN */
+ /* only do this if there was no better solution */
+ #ifndef TMPI_ATOMIC_HAVE_NATIVE_SWAP
+ /** Fallback atomic swap, built on compare-and-swap.
+    Repeatedly reads a->value and tries to replace it by b with
+    tMPI_Atomic_cas() until the CAS succeeds. The value is read directly
+    from the struct member and no explicit memory barrier is issued here.
+    \param a  Pointer to atomic type
+    \param b  Value to store
+    \return the value of a that was replaced
+  */
+ TMPI_EXPORT
+ static inline int tMPI_Atomic_swap(tMPI_Atomic_t *a, int b)
+ {
+     int oldval;
+     do
+     {
+         oldval = (int)(a->value);
+     }
+     while (!tMPI_Atomic_cas(a, oldval, b));
+     return oldval;
+ }
+ /** Fallback atomic pointer swap, built on pointer compare-and-swap.
+    Same retry loop as tMPI_Atomic_swap(), using tMPI_Atomic_ptr_cas().
+    \param a  Pointer to atomic pointer type
+    \param b  Pointer value to store
+    \return the pointer value of a that was replaced
+  */
+ TMPI_EXPORT
+ static inline void *tMPI_Atomic_ptr_swap(tMPI_Atomic_ptr_t *a, void *b)
+ {
+     void *oldval;
+     do
+     {
+         oldval = (void*)(a->value);
+     }
+     while (!tMPI_Atomic_ptr_cas(a, oldval, b));
+     return oldval;
+ }
+ #endif /* TMPI_ATOMIC_HAVE_NATIVE_SWAP */
+ #ifndef TMPI_ATOMIC_HAVE_NATIVE_SPINLOCK
+ /** Fallback spinlock type, used when no native spinlock was provided
+    (TMPI_ATOMIC_HAVE_NATIVE_SPINLOCK undefined). The lock state lives in a
+    single atomic int; the fallback spinlock functions in this file store 0
+    for "unlocked" and 1 for "locked". */
+ typedef struct tMPI_Spinlock
+ {
+     tMPI_Atomic_t a; /*!< lock word: 0 = unlocked, 1 = locked */
+ }
+ tMPI_Spinlock_t;
+ /** Static initializer for tMPI_Spinlock_t: zero lock word (unlocked). */
+ #define TMPI_SPINLOCK_INITIALIZER   { 0 }
+ /** Initialize a fallback spinlock by setting its lock word to 0 (unlocked).
+    \param x  Spinlock to initialize
+  */
+ TMPI_EXPORT
+ static inline void tMPI_Spinlock_init(tMPI_Spinlock_t *x)
+ {
+     tMPI_Atomic_set(&(x->a), 0);
+ }
+ /** Acquire a fallback spinlock, busy-waiting until it is obtained.
+    \param x  Spinlock to acquire
+  */
+ TMPI_EXPORT
+ static inline void tMPI_Spinlock_lock(tMPI_Spinlock_t *x)
+ {
+     tMPI_Atomic_memory_barrier_acq();
+     do
+     {
+         /* spin on plain reads while the lock word is 1, so that the CAS
+            below is only attempted once the lock has been seen as free */
+         while (tMPI_Atomic_get(&(x->a)) == 1)
+         {
+             tMPI_Atomic_memory_barrier_acq();
+         }
+     }
+     /* try to change 0 -> 1; start over if the CAS reports failure */
+     while (!tMPI_Atomic_cas(&(x->a), 0, 1));
+     tMPI_Atomic_memory_barrier_acq();
+ }
+ /** Make a single attempt to acquire a fallback spinlock, without waiting.
+    \param x  Spinlock to try to acquire
+    \return the logical negation of the tMPI_Atomic_cas() result: 0 when the
+            0 -> 1 exchange succeeded (lock acquired), non-zero otherwise.
+            This assumes tMPI_Atomic_cas() returns non-zero on success.
+  */
+ TMPI_EXPORT
+ static inline int tMPI_Spinlock_trylock(tMPI_Spinlock_t *x)
+ {
+     int ret;
+     tMPI_Atomic_memory_barrier_acq();
+     ret = !tMPI_Atomic_cas(&(x->a), 0, 1);
+     return ret;
+ }
+ /** Release a fallback spinlock by storing 0 in its lock word. A release
+    barrier is issued both before and after the store.
+    \param x  Spinlock to release
+  */
+ TMPI_EXPORT
+ static inline void tMPI_Spinlock_unlock(tMPI_Spinlock_t *x)
+ {
+     tMPI_Atomic_memory_barrier_rel();
+     tMPI_Atomic_set(&(x->a), 0);
+     tMPI_Atomic_memory_barrier_rel();
+ }
+ /** Report whether a fallback spinlock is currently held; does not modify it.
+    \param x  Spinlock to inspect
+    \return 1 if the lock word is non-zero, 0 otherwise
+  */
+ TMPI_EXPORT
+ static inline int tMPI_Spinlock_islocked(const tMPI_Spinlock_t *x)
+ {
+     int ret;
+     /* NOTE(review): a release barrier precedes a load here; an acquire
+        barrier would be the conventional choice -- confirm this is
+        intended. */
+     tMPI_Atomic_memory_barrier_rel();
+     ret = (tMPI_Atomic_get(&(x->a)) != 0);
+     return ret;
+ }
+ /** Busy-wait until a fallback spinlock is observed to be unlocked. The lock
+    is not acquired by this call.
+    \param x  Spinlock to wait on
+  */
+ TMPI_EXPORT
+ static inline void tMPI_Spinlock_wait(tMPI_Spinlock_t *x)
+ {
+     /* empty body: all the work (barrier + read) is done in the condition */
+     do
+     {
+     }
+     while (tMPI_Spinlock_islocked(x));
+ }
+ #endif /* TMPI_ATOMIC_HAVE_NATIVE_SPINLOCK */
index 0000000000000000000000000000000000000000,05663387efd574a33a5fe3706d4750adc831b132..05663387efd574a33a5fe3706d4750adc831b132
mode 000000,100644..100644
--- /dev/null
@@@ -1,0 -1,88 +1,88 @@@
+ /*
+    This source code file is part of thread_mpi.
+    Written by Sander Pronk, Erik Lindahl, and possibly others.
+    Copyright (c) 2013, Sander Pronk, Erik Lindahl.
+    All rights reserved.
+    Redistribution and use in source and binary forms, with or without
+    modification, are permitted provided that the following conditions are met:
+    1) Redistributions of source code must retain the above copyright
+    notice, this list of conditions and the following disclaimer.
+    2) Redistributions in binary form must reproduce the above copyright
+    notice, this list of conditions and the following disclaimer in the
+    documentation and/or other materials provided with the distribution.
+    3) Neither the name of the copyright holders nor the
+    names of its contributors may be used to endorse or promote products
+    derived from this software without specific prior written permission.
+    THIS SOFTWARE IS PROVIDED BY US ''AS IS'' AND ANY
+    EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+    WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+    DISCLAIMED. IN NO EVENT SHALL WE BE LIABLE FOR ANY
+    DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+    (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+    LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+    ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+    (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+    SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+    If you want to redistribute modifications, please consider that
+    scientific software is very special. Version control is crucial -
+    bugs must be traceable. We will be happy to consider code for
+    inclusion in the official distribution, but derived work should not
+    be called official thread_mpi. Details are found in the README & COPYING
+    files.
+  */
+ /* Memory barriers, implemented with the `membar` instruction: a full
+    barrier plus separate acquire and release variants.
+    TMPI_HAVE_ACQ_REL_BARRIERS announces that the acquire/release variants
+    are defined explicitly rather than aliased to the full barrier.
+    NOTE(review): the asm statements have no "memory" clobber, so they are
+    presumably not compiler-level barriers on GCC-compatible compilers --
+    confirm for the target compiler.
+    NOTE(review): the acquire variant orders stores (#StoreStore|#StoreLoad);
+    acquire semantics are conventionally #LoadLoad|#LoadStore -- confirm
+    against the SPARC V9 memory-model documentation.
+    NOTE(review): each macro expands to a braced block, so `macro();` is a
+    block followed by an empty statement -- take care in unbraced if/else. */
+ #define tMPI_Atomic_memory_barrier() { asm ("membar   #StoreStore | #LoadStore | #LoadLoad | #StoreLoad "); }
+ #define tMPI_Atomic_memory_barrier_acq() { asm ("membar   #StoreStore | #StoreLoad ");  }
+ #define tMPI_Atomic_memory_barrier_rel() { asm ("membar   #LoadStore | #StoreStore ");  }
+ #define TMPI_HAVE_ACQ_REL_BARRIERS
+ /** Atomic integer type: a volatile int aligned to 64 bytes. */
+ typedef struct tMPI_Atomic
+ {
+     volatile int value __attribute__ ((aligned(64)));
+ }
+ tMPI_Atomic_t;
+ /** Atomic pointer type, aligned to 64 bytes.
+    NOTE(review): value is declared as a pointer to a volatile pointer
+    (`volatile char* volatile*`), while tMPI_Atomic_ptr_cas() below treats the
+    member itself as the pointer-sized word being swapped -- confirm that the
+    extra level of indirection in the declared type is intended. */
+ typedef struct tMPI_Atomic_ptr
+ {
+     volatile char* volatile* value __attribute__ ((aligned(64)));  /*!< Volatile, to avoid compiler aliasing */
+ }
+ tMPI_Atomic_ptr_t;
+ /* On sparc64, aligned 32-bit and 64-bit memory accesses are atomic */
+ /* Plain loads/stores of the value member; no barrier is implied by these
+    macros. tMPI_Atomic_get() casts the result to int. */
+ #define tMPI_Atomic_get(a)   (int)((a)->value)
+ #define tMPI_Atomic_set(a, i)  (((a)->value) = (i))
+ #define tMPI_Atomic_ptr_get(a)   ((a)->value)
+ #define tMPI_Atomic_ptr_set(a, i)  (((a)->value) = (i))
+ /* NOTE(review): this header defines no spinlock of its own, and the
+    fallback spinlock in derived.h defines the same initializer with the
+    same value -- this definition looks redundant; confirm before removing. */
+ #define TMPI_SPINLOCK_INITIALIZER   { 0 }
+ /* we just define the CAS operation. Fetch-and-add and spinlocks are
+    implemented through derived.h; this follows the recommendations of the
+    Sparc v9 programming specs. */
+ /** Atomic compare-and-swap on an int, using the `cas` instruction.
+    \param a       Pointer to atomic type
+    \param oldval  Value expected in a
+    \param newval  Value to store if a equals oldval
+    \return non-zero if the value read back from the instruction equals
+            oldval (the swap took place), 0 otherwise
+  */
+ static inline int tMPI_Atomic_cas(tMPI_Atomic_t *a, int oldval, int newval)
+ {
+     /* Operand %0 is both input (tied via "0") and output: newval goes in,
+        and, per the SPARC V9 definition of cas, the previous memory contents
+        come back in the same register.
+        NOTE(review): the asm is not marked volatile -- confirm the compiler
+        cannot drop it when the result is unused. */
+     asm ("cas [%2], %1, %0"
+          : "=&r" (newval)
+          : "r" (oldval), "r" (&(a->value)), "0" (newval)
+          : "memory");
+     return newval == oldval;
+ }
+ /** Atomic compare-and-swap on a pointer, using the `casx` instruction.
+    \param a       Pointer to atomic pointer type
+    \param oldval  Pointer value expected in a
+    \param newval  Pointer value to store if a equals oldval
+    \return non-zero if the value read back from the instruction equals
+            oldval (the swap took place), 0 otherwise
+  */
+ static inline int tMPI_Atomic_ptr_cas(tMPI_Atomic_ptr_t *a, void* oldval,
+                                       void* newval)
+ {
+     /* Same operand scheme as tMPI_Atomic_cas(): newval is tied to output
+        operand %0, which receives the previous memory contents (per the
+        SPARC V9 definition of casx). */
+     asm ("casx [%2], %1, %0         "
+          : "=&r" (newval)
+          : "r" (oldval), "r" (&(a->value)), "0" (newval)
+          : "memory");
+     return newval == oldval;
+ }
index 756f1bd792de96f3c1f1de9f10b32f43669ac713,39537f3f4f9e9d4a494578177339c6d3a36e0fde..39537f3f4f9e9d4a494578177339c6d3a36e0fde
@@@ -60,10 -60,6 +60,6 @@@ typedef struct tMPI_Atomic_pt
  tMPI_Atomic_ptr_t;
  
  
- #define TMPI_SPINLOCK_INITIALIZER   { 0 }
  /* for now we simply assume that int and void* assignments are atomic */
  #define tMPI_Atomic_get(a)  ((int)( (a)->value) )
  #define tMPI_Atomic_set(a, i)  (((a)->value) = (i))
index 982853ca426f3ee66d088f262da31e9205d67a3e,f3c3a174b0157e1129d4ca93fe72dceea2c628e9..f3c3a174b0157e1129d4ca93fe72dceea2c628e9
@@@ -55,8 -55,6 +55,6 @@@ typedef struct tMPI_Atomic_pt
  tMPI_Atomic_ptr_t;
  
  
- #define TMPI_SPINLOCK_INITIALIZER   { 0 }
  
  #define tMPI_Atomic_get(a)   ((a)->value)
  #define tMPI_Atomic_set(a, i)  (((a)->value) = (i))
@@@ -67,7 -65,7 +65,7 @@@
  
  
  #ifndef __INTEL_COMPILER
- #define TMPI_HAVE_SWAP
+ #define TMPI_ATOMIC_HAVE_NATIVE_SWAP
  /* xchg operations: */
  /* ia64 xchg */
  static inline int tMPI_Atomic_swap(tMPI_Atomic_t *a, int b)
@@@ -112,7 -110,6 +110,6 @@@ int _InterlockedCompareExchange(volatil
                                           void* comp);*/
  unsigned __int64 __fetchadd4_rel(unsigned int *addend, const int increment);
  /* ia64 memory barrier */
- /*#define tMPI_Atomic_memory_barrier() __memory_barrier()*/
  #define tMPI_Atomic_memory_barrier() __sync_synchronize()
  /* ia64 cmpxchg */
  #define tMPI_Atomic_cas(a, oldval, newval) \
  /* ia64 fetchadd, but it only works with increments +/- 1,4,8,16 */
  #define tMPI_ia64_fetchadd(a, inc)  __fetchadd4_rel(a, inc)
  
- #define TMPI_HAVE_SWAP
  #define tMPI_Atomic_swap(a, b) _InterlockedExchange( &((a)->value), (b))
  #define tMPI_Atomic_ptr_swap(a, b) _InterlockedExchangePointer( &((a)->value), (b))
+ #define TMPI_ATOMIC_HAVE_NATIVE_SWAP
  
  #elif defined __GNUC__
  
@@@ -213,6 -210,7 +210,7 @@@ static inline int tMPI_Atomic_add_retur
      }
      return (int)newval;
  }
+ #define TMPI_ATOMIC_HAVE_NATIVE_ADD_RETURN
  
  
  
@@@ -244,73 -242,7 +242,7 @@@ static inline int tMPI_Atomic_fetch_add
      }
      return (int)oldval;
  }
- typedef struct tMPI_Spinlock
- {
-     volatile unsigned int   lock; /*!< Volatile, to avoid compiler aliasing */
- }
- tMPI_Spinlock_t;
- static inline void tMPI_Spinlock_init(tMPI_Spinlock_t *x)
- {
-     x->lock = 0;
- }
- static inline void tMPI_Spinlock_lock(tMPI_Spinlock_t *x)
- {
-     tMPI_Atomic_t *a = (tMPI_Atomic_t *) x;
-     int            succeeded;
-     succeeded = tMPI_Atomic_cas(a, 0, 1);
-     if (!succeeded)
-     {
-         do
-         {
-             while (a->value != 0)
-             {
-                 tMPI_Atomic_memory_barrier();
-             }
-             succeeded = tMPI_Atomic_cas(a, 0, 1);
-         }
-         while (!succeeded);
-     }
- }
- static inline int tMPI_Spinlock_trylock(tMPI_Spinlock_t *x)
- {
-     return (tMPI_Atomic_cas( ((tMPI_Atomic_t *)x), 0, 1));
- }
- static inline void tMPI_Spinlock_unlock(tMPI_Spinlock_t *x)
- {
-     do
-     {
-         tMPI_Atomic_memory_barrier();
-         x->lock = 0;
-     }
-     while (0);
- }
- static inline int tMPI_Spinlock_islocked(const tMPI_Spinlock_t *x)
- {
-     return (x->lock != 0);
- }
- static inline void tMPI_Spinlock_wait(tMPI_Spinlock_t *x)
- {
-     do
-     {
-         tMPI_Atomic_memory_barrier();
-     }
-     while (tMPI_Spinlock_islocked(x));
- }
+ #define TMPI_ATOMIC_HAVE_NATIVE_FETCH_ADD
  
  #endif
  
index 4fe10a8dfc55f00c0c5df1759725f77d3e71aa40,f909cb91ec7026eb148e94acc47312eec73fae56..f909cb91ec7026eb148e94acc47312eec73fae56
  #define tMPI_Atomic_memory_barrier()  __sync_synchronize()
  
  
- static inline int tMPI_Atomic_add_return(tMPI_Atomic_t *a, volatile int i)
- {
-     return __sync_add_and_fetch( &(a->value), i);
- }
- static inline int tMPI_Atomic_fetch_add(tMPI_Atomic_t *a, volatile int i)
- {
-     return __sync_fetch_and_add( &(a->value), i);
- }
  
+ TMPI_EXPORT
  static inline int tMPI_Atomic_cas(tMPI_Atomic_t *a, int oldval, int newval)
  {
      return __sync_bool_compare_and_swap( &(a->value), oldval, newval);
  }
  
- #if 0
- /* these definitions are only used if there's no assembly versions for them:
-    they're inefficient because they use compare-and-swap instead of just
-    swap. */
- static inline int tMPI_Atomic_swap(tMPI_Atomic_t *a, int b)
- {
-     int oldval;
-     do
-     {
-         oldval = a->value;
-     }
-     while (__sync_val_compare_and_swap( &(a->value), oldval, b) != oldval);
-     return oldval;
- }
- static inline void* tMPI_Atomic_ptr_swap(tMPI_Atomic_ptr_t *a, void *b)
- {
-     void *oldval;
-     do
-     {
-         oldval = a->value;
-     }
-     while (__sync_val_compare_and_swap( &(a->value), oldval, b) != oldval);
-     return oldval;
- }
- #endif
+ TMPI_EXPORT
  static inline int tMPI_Atomic_ptr_cas(tMPI_Atomic_ptr_t* a, void *oldval,
                                        void *newval)
  {
                                            (size_t)newval) );
  #endif
  }
+ /** Atomic add-and-return, via the __sync_add_and_fetch builtin.
+    \param a  Pointer to atomic type
+    \param i  Integer to add to the value of a
+    \return the result of __sync_add_and_fetch (the value after the addition)
+  */
+ TMPI_EXPORT
+ static inline int tMPI_Atomic_add_return(tMPI_Atomic_t *a, volatile int i)
+ {
+     return __sync_add_and_fetch( &(a->value), i);
+ }
+ /* tell derived.h not to supply its CAS-based fallback */
+ #define TMPI_ATOMIC_HAVE_NATIVE_ADD_RETURN
+ /** Atomic fetch-and-add, via the __sync_fetch_and_add builtin.
+    \param a  Pointer to atomic type
+    \param i  Integer to add to the value of a
+    \return the result of __sync_fetch_and_add (the value before the addition)
+  */
+ TMPI_EXPORT
+ static inline int tMPI_Atomic_fetch_add(tMPI_Atomic_t *a, volatile int i)
+ {
+     return __sync_fetch_and_add( &(a->value), i);
+ }
+ /* tell derived.h not to supply its CAS-based fallback */
+ #define TMPI_ATOMIC_HAVE_NATIVE_FETCH_ADD
index 9123412a2546035d8aa3600a57b93abc38332fa6,928d770e5d55fb8ceae2fdb0a18148f41af523f7..928d770e5d55fb8ceae2fdb0a18148f41af523f7
@@@ -66,12 -66,11 +66,11 @@@ typedef struct tMPI_Spinloc
      volatile unsigned int lock;   /*!< Volatile, to avoid compiler aliasing */
  }
  tMPI_Spinlock_t;
+ #define TMPI_ATOMIC_HAVE_NATIVE_SPINLOCK
  
  
  #define TMPI_SPINLOCK_INITIALIZER   { 0 }
  
- #define TMPI_HAVE_SWAP
  #define tMPI_Atomic_get(a)        ((a)->value)
  #define tMPI_Atomic_set(a, i)     (((a)->value) = (i))
  
@@@ -90,7 -89,7 +89,7 @@@
  
  
  
- #define TMPI_HAVE_ASM_SWAP
+ #define TMPI_ATOMIC_HAVE_NATIVE_SWAP
  static inline int tMPI_Atomic_swap(tMPI_Atomic_t *a, int b)
  {
      int ret;
@@@ -183,6 -182,7 +182,7 @@@ static inline int tMPI_Atomic_ptr_cas(t
      return prev == oldval;
  }
  
+ #define TMPI_ATOMIC_HAVE_NATIVE_ADD_RETURN
  static inline int tMPI_Atomic_add_return(tMPI_Atomic_t *a, int i)
  {
      int t;
  
  
  
+ #define TMPI_ATOMIC_HAVE_NATIVE_FETCH_ADD
  static inline int tMPI_Atomic_fetch_add(tMPI_Atomic_t *a, int i)
  {
      int t;
index f7dbc67d7ad91fb440094d28fa5fe292aa8659bc,84a244d9004bbe700c37257c19c1279dadeb1430..84a244d9004bbe700c37257c19c1279dadeb1430
  
  typedef struct tMPI_Spinlock
  {
-     volatile unsigned int  lock /*__attribute__ ((aligned(64)))*/;
+     volatile unsigned int lock /*__attribute__ ((aligned(64)))*/;
  } tMPI_Spinlock_t;
  
  #define TMPI_SPINLOCK_INITIALIZER   { 0 }
  
+ #define TMPI_ATOMIC_HAVE_NATIVE_SPINLOCK
  
  
  static inline void tMPI_Spinlock_init(tMPI_Spinlock_t *x)
@@@ -83,18 -82,18 +82,18 @@@ static inline int tMPI_Spinlock_trylock
  }
  
  
- static inline void tMPI_Spinlock_unlock(tMPI_Spinlock_t *  x)
+ static inline void tMPI_Spinlock_unlock(tMPI_Spinlock_t *x)
  {
      __sync_lock_release(&(x->lock));
  }
  
- static inline int tMPI_Spinlock_islocked(const tMPI_Spinlock_t *  x)
+ static inline int tMPI_Spinlock_islocked(const tMPI_Spinlock_t *x)
  {
      __sync_synchronize();
      return ( x->lock == 1 );
  }
  
- static inline void tMPI_Spinlock_wait(tMPI_Spinlock_t *   x)
+ static inline void tMPI_Spinlock_wait(tMPI_Spinlock_t *x)
  {
      do
      {
index e6252a8893ce84c4b6c66222fa36fc469b9a3c2c,2fd496f6ebf8b306ecf8f65182d353f6be8ee4ef..2fd496f6ebf8b306ecf8f65182d353f6be8ee4ef
@@@ -79,6 -79,8 +79,8 @@@ typedef struct tMPI_Spinloc
  
  #define TMPI_SPINLOCK_INITIALIZER   { 0 }
  
+ #define TMPI_ATOMIC_HAVE_NATIVE_SPINLOCK
  
  
  /* these are guaranteed to be  atomic on x86 and x86_64 */
  
  #define tMPI_Atomic_memory_barrier() __asm__ __volatile__("sfence;" : : : "memory")
  
- static inline int tMPI_Atomic_add_return(tMPI_Atomic_t *a, int i)
+ #define TMPI_ATOMIC_HAVE_NATIVE_FETCH_ADD
+ static inline int tMPI_Atomic_fetch_add(tMPI_Atomic_t *a, int i)
  {
-     int __i;
-     __i = i;
-     __asm__ __volatile__("lock ; xaddl %0, %1;"
-                          : "=r" (i) : "m" (a->value), "0" (i) : "memory");
-     return i + __i;
+     volatile int res = i;
+     /* volatile because we read and write back to the same variable in the
+        asm section.  some compilers require this to be volatile */
+     __asm__ __volatile__("lock ; xaddl %0, %1;"      /* swap-add */
+                          : "=r" (res)                /* with register as
+                                                         output*/
+                          : "m" (a->value), "0" (res) /* and memory as input */
+                          : "memory");
+     return res;
  }
  
- static inline int tMPI_Atomic_fetch_add(tMPI_Atomic_t *a, int i)
+ #define TMPI_ATOMIC_HAVE_NATIVE_ADD_RETURN
+ static inline int tMPI_Atomic_add_return(tMPI_Atomic_t *a, int i)
  {
+     int          orig = i;
+     volatile int res  = i;
      __asm__ __volatile__("lock ; xaddl %0, %1;"
-                          : "=r" (i) : "m" (a->value), "0" (i) : "memory");
-     return i;
+                          : "=r" (res)
+                          : "m" (a->value), "0" (res)
+                          :  "memory");
+     return res + orig; /* then add again from the right value */
  }
  
  static inline int tMPI_Atomic_cas(tMPI_Atomic_t *a, int oldval, int newval)
  {
      int prev;
@@@ -160,7 -174,8 +174,8 @@@ static inline int tMPI_Atomic_ptr_cas(t
  
  #endif /* end of check for gcc intrinsics */
  
- #define TMPI_HAVE_SWAP
+ #define TMPI_ATOMIC_HAVE_NATIVE_SWAP
  /* do the swap fns; we told the intrinsics that we have them. */
  static inline int tMPI_Atomic_swap(tMPI_Atomic_t *a, int b)
  {
@@@ -176,11 -191,6 +191,6 @@@ static inline void *tMPI_Atomic_ptr_swa
  {
      void *volatile *ret = (void* volatile*)b;
  #ifndef __x86_64__
- /*    __asm__ __volatile__("\txchgl %0, %1;"
-                          :"=m"(a->value),"=q"(b)
-                          :"q"(b)
-                          :"memory");
-  */
      __asm__ __volatile__("\txchgl %0, %1;"
                           : "+r" (ret), "+m" (a->value)
                           :
@@@ -235,7 -245,7 +245,7 @@@ static inline void tMPI_Spinlock_lock(t
  
  
  
- static inline void tMPI_Spinlock_unlock(tMPI_Spinlock_t *  x)
+ static inline void tMPI_Spinlock_unlock(tMPI_Spinlock_t *x)
  {
      /* this is apparently all that is needed for unlocking a lock */
      __asm__ __volatile__(
index 8ab8d2498f9c46ecb81f8322c06089d20c918500,50bd7dc71bd95dd20dd0f072bfb5e61450e6dfbc..50bd7dc71bd95dd20dd0f072bfb5e61450e6dfbc
@@@ -36,8 -36,6 +36,6 @@@
   */
  
  
- /* Microsoft Visual C on x86, define taken from FFTW who got it from Morten Nissov */
  /* we need this for all the data types. We use WIN32_LEAN_AND_MEAN to avoid
        polluting the global namespace. */
  #define WIN32_LEAN_AND_MEAN
  
  typedef struct tMPI_Atomic
  {
-     LONG volatile      value;     /*!< Volatile, to avoid compiler aliasing */
+     LONG volatile value;          /*!< Volatile, to avoid compiler aliasing */
  } tMPI_Atomic_t;
  
  typedef struct tMPI_Atomic_ptr
  {
-     void* volatile      value;     /*!< Volatile, to avoid compiler aliasing */
+     void* volatile value;          /*!< Volatile, to avoid compiler aliasing */
  } tMPI_Atomic_ptr_t;
  
  typedef struct tMPI_Spinlock
  {
-     LONG volatile      lock;      /*!< Volatile, to avoid compiler aliasing */
+     LONG volatile lock;           /*!< Volatile, to avoid compiler aliasing */
  } tMPI_Spinlock_t;
  
- #define TMPI_SPINLOCK_INITIALIZER   { 0 }
+ #define TMPI_ATOMIC_HAVE_NATIVE_SPINLOCK
  
- #define TMPI_HAVE_SWAP
+ #define TMPI_SPINLOCK_INITIALIZER   { 0 }
  
  
  #define tMPI_Atomic_get(a)  ((a)->value)
  
  #define tMPI_Atomic_fetch_add(a, i)  \
      InterlockedExchangeAdd((LONG volatile *)(a), (LONG) (i))
+ #define TMPI_ATOMIC_HAVE_NATIVE_FETCH_ADD
  
  #define tMPI_Atomic_add_return(a, i)  \
      ( (i) + InterlockedExchangeAdd((LONG volatile *)(a), (LONG) (i)) )
+ #define TMPI_ATOMIC_HAVE_NATIVE_ADD_RETURN
  
  #define tMPI_Atomic_cas(a, oldval, newval) \
      (InterlockedCompareExchange((LONG volatile *)(a), (LONG) (newval), (LONG) (oldval)) == (LONG)oldval)
@@@ -94,6 -93,7 +93,7 @@@
      (InterlockedCompareExchangePointer(&((a)->value), (PVOID) (newval),  \
                                         (PVOID) (oldval)) == (PVOID)oldval)
  
+ #define TMPI_ATOMIC_HAVE_NATIVE_SWAP
  #define tMPI_Atomic_swap(a, b) \
      InterlockedExchange((LONG volatile *)(a), (LONG) (b))
  
  
  
  
- static inline void tMPI_Spinlock_init(tMPI_Spinlock_t *   x)
+ static inline void tMPI_Spinlock_init(tMPI_Spinlock_t *x)
  {
      x->lock = 0;
  }
      InterlockedCompareExchange((LONG volatile *)(x), 1, 0)
  
  
- static inline void tMPI_Spinlock_unlock(tMPI_Spinlock_t *   x)
+ static inline void tMPI_Spinlock_unlock(tMPI_Spinlock_t *x)
  {
      x->lock = 0;
  }
  
  
- static inline int tMPI_Spinlock_islocked(const tMPI_Spinlock_t *   x)
+ static inline int tMPI_Spinlock_islocked(const tMPI_Spinlock_t *x)
  {
      return (*(volatile signed char *)(&(x)->lock) != 0);
  }
  
  
- static inline void tMPI_Spinlock_wait(tMPI_Spinlock_t *   x)
+ static inline void tMPI_Spinlock_wait(tMPI_Spinlock_t *x)
  {
      while (tMPI_Spinlock_islocked(x))
      {
index 52c39081cdac265de2500009cc2f59a6f1ebd3d3,3f8024a702325e5c27be59aa2c11179f4714320f..3f8024a702325e5c27be59aa2c11179f4714320f
@@@ -64,10 -64,6 +64,6 @@@ typedef struct tMPI_Atomic_pt
  tMPI_Atomic_ptr_t;
  
  
- #define TMPI_SPINLOCK_INITIALIZER   { 0 }
  /* for now we simply assume that int and void* assignments are atomic */
  #define tMPI_Atomic_get(a)  ((int)( (a)->value) )
  #define tMPI_Atomic_set(a, i)  (((a)->value) = (i))
  #define tMPI_Atomic_ptr_get(a)  ((void*)((a)->value) )
  #define tMPI_Atomic_ptr_set(a, i)  (((a)->value) = (void*)(i))
  
- static inline int tMPI_Atomic_add_return(tMPI_Atomic_t *a, volatile int i)
- {
-     return (int) atomic_add_int_nv(&a->value, i);
- }
- static inline int tMPI_Atomic_fetch_add(tMPI_Atomic_t *a, volatile int i)
- {
-     return (int) atomic_add_int_nv(&a->value, i) - i;
- }
  
  static inline int tMPI_Atomic_cas(tMPI_Atomic_t *a, int oldval, int newval)
  {
@@@ -100,64 -86,14 +86,14 @@@ static inline int tMPI_Atomic_ptr_cas(t
      return atomic_cas_ptr(&(a->value), oldval, newval) == oldval;
  }
  
- typedef struct tMPI_Spinlock
- {
-     volatile unsigned long  lock;
- } tMPI_Spinlock_t;
- #define TMPI_SPINLOCK_INITIALIZER   { 0 }
- static inline unsigned long tas(volatile unsigned long *ptr)
- {
-     unsigned long result;
-     __asm__ __volatile__("          \
-             ldstub [%1], %0         "
-                          : "=r" (result)
-                          : "r" (ptr)
-                          : "memory");
-     return result;
- }
- static inline void tMPI_Spinlock_init(tMPI_Spinlock_t *x)
- {
-     x->lock = 0;
- }
- static inline void tMPI_Spinlock_lock(tMPI_Spinlock_t *x)
- {
-     do
-     {
-     }
-     while (tas(&(x->lock)) == 1);
- }
- static inline int tMPI_Spinlock_trylock(tMPI_Spinlock_t *x)
- {
-     return tas(&(x->lock));
- }
- static inline void tMPI_Spinlock_unlock(tMPI_Spinlock_t *  x)
- {
-     x->lock = 0;
- }
- static inline int tMPI_Spinlock_islocked(const tMPI_Spinlock_t *  x)
+ static inline int tMPI_Atomic_add_return(tMPI_Atomic_t *a, volatile int i)
  {
-     tMPI_Atomic_memory_barrier();
-     return ( x->lock == 1 );
+     return (int) atomic_add_int_nv(&a->value, i);
  }
+ #define TMPI_ATOMIC_HAVE_NATIVE_ADD_RETURN
  
- static inline void tMPI_Spinlock_wait(tMPI_Spinlock_t *   x)
+ static inline int tMPI_Atomic_fetch_add(tMPI_Atomic_t *a, volatile int i)
  {
-     do
-     {
-     }
-     while (x->lock == 1);
-     tMPI_Atomic_memory_barrier();
+     return (int) atomic_add_int_nv(&a->value, i) - i;
  }
+ #define TMPI_ATOMIC_HAVE_NATIVE_FETCH_ADD
index b08474a8f67acadc9110e55fde1f7a51ed53f1c1,0dd1f6753a6da2b1f00b1f93e785f95be3ac6e85..0dd1f6753a6da2b1f00b1f93e785f95be3ac6e85
@@@ -93,6 -93,7 +93,7 @@@ typedef struct tMPI_Spinloc
      volatile int lock __attribute__ ((aligned(64)));
  }
  tMPI_Spinlock_t;
+ #define TMPI_ATOMIC_HAVE_NATIVE_SPINLOCK
  
  
  
@@@ -189,6 -190,7 +190,7 @@@ static inline int tMPI_Atomic_add_retur
      return t;
  #endif
  }
+ #define TMPI_ATOMIC_HAVE_NATIVE_ADD_RETURN
  
  
  
@@@ -226,6 -228,7 +228,7 @@@ static inline int tMPI_Atomic_fetch_add
      return (t - i);
  #endif
  }
+ #define TMPI_ATOMIC_HAVE_NATIVE_FETCH_ADD
  
  
  static inline void tMPI_Spinlock_init(tMPI_Spinlock_t *x)
index 2c238774978648c1306704aa5d03a5266a285c85,058e3a8d9b604f1e1d13abb03d8e71ec009eabcc..058e3a8d9b604f1e1d13abb03d8e71ec009eabcc
@@@ -103,7 -103,7 +103,7 @@@ void tMPI_Event_signal(tMPI_Event *ev)
  #define tMPI_Event_signal(ev) \
      { \
          tMPI_Atomic_memory_barrier_rel(); \
-         tMPI_Atomic_add_return( &((ev)->sync), 1); \
+         tMPI_Atomic_fetch_add( &((ev)->sync), 1); \
      }
  #endif
  
index 4e026824cacc110372c6ff5dc94af93bfacefeaf,f30e8bf740e594b1f879e7f96479d67dbcb0df6c..f30e8bf740e594b1f879e7f96479d67dbcb0df6c
@@@ -100,7 -100,7 +100,7 @@@ int tMPI_Lock_trylock(tMPI_Lock_t *lock
   *  \param lock  Pointer to previously created lock.
   */
  TMPI_EXPORT
- int tMPI_Lock_islocked(const tMPI_Lock_t *lock);
+ int tMPI_Lock_islocked(tMPI_Lock_t *lock);
  
  
  
index d15ee357e044cd9caceabb9adb97ded31c418047,8d8d5fd4850387d94fc23b44b61476d53e8c848e..8d8d5fd4850387d94fc23b44b61476d53e8c848e
@@@ -103,7 -103,8 +103,8 @@@ typedef struct tMPI_Thread* tMPI_Thread
   */
  typedef struct
  {
-     tMPI_Atomic_t      initialized; /*!< Whether \a mutex has been initialized. */
+     tMPI_Atomic_t      initialized; /*!< Whether \a mutex has been
+                                        initialized. */
      struct tMPI_Mutex* mutex;       /*!< Actual mutex data structure. */
  }  tMPI_Thread_mutex_t;
  /*! \brief Static initializer for tMPI_Thread_mutex_t
   */
  typedef struct
  {
-     tMPI_Atomic_t           initialized; /*!< Whether \a key has been initialized. */
+     tMPI_Atomic_t           initialized; /*!< Whether \a key has been
+                                             initialized. */
      struct tMPI_Thread_key *key;         /*!< Actual key data structure. */
  } tMPI_Thread_key_t;
  
@@@ -178,8 -180,10 +180,10 @@@ typedef struc
   */
  typedef struct
  {
-     tMPI_Atomic_t            initialized; /*!< Whether \a condp has been initialized. */
-     struct tMPI_Thread_cond* condp;       /*!< Actual condition variable data structure. */
+     tMPI_Atomic_t            initialized; /*!< Whether \a condp has been
+                                              initialized. */
+     struct tMPI_Thread_cond* condp;       /*!< Actual condition variable data
+                                              structure. */
  } tMPI_Thread_cond_t;
  /*! \brief Static initializer for tMPI_Thread_cond_t
   *
@@@ -451,7 -455,7 +455,7 @@@ int tMPI_Thread_mutex_lock(tMPI_Thread_
   *  return code (usually meaning the mutex was already locked).
   *
   *  \param mtx  Pointer to the mutex to try and lock
-  *  \return 0 or a non-zero return error code.
+  *  \return 0 if locked, non-zero if not locked or an error occurred.
   */
  TMPI_EXPORT
  int tMPI_Thread_mutex_trylock(tMPI_Thread_mutex_t *mtx);
index 5ce8f9c0e14d3cc060e62b85e04534bdbfa0f577,d5f0e4bf4d09ac82c4015159ac9a7c1efd33d63e..d5f0e4bf4d09ac82c4015159ac9a7c1efd33d63e
@@@ -174,33 -174,36 +174,36 @@@ extern const tMPI_Datatype TMPI_POINTER
  /** Error codes */
  enum
  {
-     TMPI_SUCCESS = 0,               /*!< No error */
-     TMPI_ERR_MALLOC,                /*!< Out of memory */
-     TMPI_ERR_INIT,                  /*!< Initialization error */
-     TMPI_ERR_FINALIZE,              /*!< Finalize error */
-     TMPI_ERR_GROUP,                 /*!< Group error */
-     TMPI_ERR_COMM,                  /*!< Comm error */
-     TMPI_ERR_STATUS,                /*!< Status error */
-     TMPI_ERR_GROUP_RANK,            /*!< Group rank error */
-     TMPI_ERR_DIMS,
-     TMPI_ERR_COORDS,
-     TMPI_ERR_CART_CREATE_NPROCS,
-     TMPI_ERR_XFER_COUNTERPART,
-     TMPI_ERR_XFER_BUFSIZE,
-     TMPI_ERR_XFER_BUF_OVERLAP,
-     TMPI_ERR_SEND_DEST,
-     TMPI_ERR_RECV_SRC,
-     TMPI_ERR_BUF,
-     TMPI_ERR_MULTI_MISMATCH,
-     TMPI_ERR_OP_FN,
-     TMPI_ERR_ENVELOPES,
-     TMPI_ERR_REQUESTS,
-     TMPI_ERR_IN_STATUS,
-     TMPI_ERR_PROCNR,                /*!< Hardware processor number (such as for
-                                          thread affinity) error */
-     TMPI_FAILURE,
-     TMPI_ERR_UNKNOWN,
-     N_TMPI_ERR  /* this must be the last one */
+     TMPI_SUCCESS = 0,            /*!< No error */
+     TMPI_ERR_NO_MEM,             /*!< Out of memory */
+     TMPI_ERR_IO,                 /*!< I/O Error (used for system errors) */
+     TMPI_ERR_INIT,               /*!< Initialization error */
+     TMPI_ERR_FINALIZE,           /*!< Finalize error */
+     TMPI_ERR_GROUP,              /*!< Group error */
+     TMPI_ERR_COMM,               /*!< Comm error */
+     TMPI_ERR_STATUS,             /*!< Status error */
+     TMPI_ERR_GROUP_RANK,         /*!< Group rank error */
+     TMPI_ERR_DIMS,               /*!< Invalid topology dimensions */
+     TMPI_ERR_COORDS,             /*!< Invalid topology coordinates */
+     TMPI_ERR_CART_CREATE_NPROCS, /*!< Not enough processes for topology*/
+     TMPI_ERR_XFER_COUNTERPART,   /*!< Invalid counterpart for xfer */
+     TMPI_ERR_XFER_BUFSIZE,       /*!< buffer size too small*/
+     TMPI_ERR_XFER_BUF_OVERLAP,   /*!< buffer overlaps (thread error?)*/
+     TMPI_ERR_SEND_DEST,          /*!< Faulty send destination */
+     TMPI_ERR_RECV_SRC,           /*!< Faulty receive source */
+     TMPI_ERR_BUF,                /*!< Invalid buffer */
+     TMPI_ERR_MULTI_MISMATCH,     /*!< Comm not the same in collective call*/
+     TMPI_ERR_OP_FN,              /*!< Invalid reduce operator*/
+     TMPI_ERR_ENVELOPES,          /*!< out of envelopes (tMPI internal) */
+     TMPI_ERR_REQUESTS,           /*!< out of requests (tMPI internal) */
+     TMPI_ERR_COPY_NBUFFERS,      /*!< out of copy buffers (tMPI internal)*/
+     TMPI_ERR_COPY_BUFFER_SIZE,   /*!< copy buffer size err (tMPI internal)*/
+     TMPI_ERR_IN_STATUS,          /*!< error code in tMPI_Status */
+     TMPI_ERR_PROCNR,             /*!< Hardware processor number (such as for
+                                       thread affinity) error */
+     TMPI_FAILURE,                /*!< Transmission failure */
+     TMPI_ERR_UNKNOWN,            /*!< Unknown error */
+     N_TMPI_ERR                   /* this must be the last one */
  };
  
  /** Maximum length of error string for tMPI_Error_string() */
index 84866e8b8be962804f2ab99ccb8b845e9e32e605,0000000000000000000000000000000000000000..e5fb5c73dab3ba24bae76e35288f80da3b389b80
mode 100644,000000..100644
--- /dev/null
@@@ -1,399 -1,0 +1,399 @@@
-     F_DHDL_CON,
 +/*
 + *
 + *                This source code is part of
 + *
 + *                 G   R   O   M   A   C   S
 + *
 + *          GROningen MAchine for Chemical Simulations
 + *
 + *                        VERSION 3.2.0
 + * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
 + * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
 + * Copyright (c) 2001-2004, The GROMACS development team,
 + * check out http://www.gromacs.org for more information.
 +
 + * This program is free software; you can redistribute it and/or
 + * modify it under the terms of the GNU General Public License
 + * as published by the Free Software Foundation; either version 2
 + * of the License, or (at your option) any later version.
 + *
 + * If you want to redistribute modifications, please consider that
 + * scientific software is very special. Version control is crucial -
 + * bugs must be traceable. We will be happy to consider code for
 + * inclusion in the official distribution, but derived work must not
 + * be called official GROMACS. Details are found in the README & COPYING
 + * files - if they are missing, get the official version at www.gromacs.org.
 + *
 + * To help us fund GROMACS development, we humbly ask that you cite
 + * the papers on the package - you can find them in the top README file.
 + *
 + * For more info, check our website at http://www.gromacs.org
 + *
 + * And Hey:
 + * GRoups of Organic Molecules in ACtion for Science
 + */
 +
 +
 +#ifndef _idef_h
 +#define _idef_h
 +
 +#include "simple.h"
 +
 +#ifdef __cplusplus
 +extern "C" {
 +#endif
 +
 +
 +/* check kernel/toppush.c when you change these numbers */
 +#define MAXATOMLIST 6
 +#define MAXFORCEPARAM   12
 +#define NR_RBDIHS   6
 +#define NR_FOURDIHS     4
 +
 +typedef atom_id t_iatom;
 +
 +/* this MUST correspond to the
 +   t_interaction_function[F_NRE] in gmxlib/ifunc.c */
 +enum {
 +    F_BONDS,
 +    F_G96BONDS,
 +    F_MORSE,
 +    F_CUBICBONDS,
 +    F_CONNBONDS,
 +    F_HARMONIC,
 +    F_FENEBONDS,
 +    F_TABBONDS,
 +    F_TABBONDSNC,
 +    F_RESTRBONDS,
 +    F_ANGLES,
 +    F_G96ANGLES,
 +    F_LINEAR_ANGLES,
 +    F_CROSS_BOND_BONDS,
 +    F_CROSS_BOND_ANGLES,
 +    F_UREY_BRADLEY,
 +    F_QUARTIC_ANGLES,
 +    F_TABANGLES,
 +    F_PDIHS,
 +    F_RBDIHS,
 +    F_FOURDIHS,
 +    F_IDIHS,
 +    F_PIDIHS,
 +    F_TABDIHS,
 +    F_CMAP,
 +    F_GB12,
 +    F_GB13,
 +    F_GB14,
 +    F_GBPOL,
 +    F_NPSOLVATION,
 +    F_LJ14,
 +    F_COUL14,
 +    F_LJC14_Q,
 +    F_LJC_PAIRS_NB,
 +    F_LJ,
 +    F_BHAM,
 +    F_LJ_LR,
 +    F_BHAM_LR,
 +    F_DISPCORR,
 +    F_COUL_SR,
 +    F_COUL_LR,
 +    F_RF_EXCL,
 +    F_COUL_RECIP,
 +    F_DPD,
 +    F_POLARIZATION,
 +    F_WATER_POL,
 +    F_THOLE_POL,
 +    F_ANHARM_POL,
 +    F_POSRES,
 +    F_FBPOSRES,
 +    F_DISRES,
 +    F_DISRESVIOL,
 +    F_ORIRES,
 +    F_ORIRESDEV,
 +    F_ANGRES,
 +    F_ANGRESZ,
 +    F_DIHRES,
 +    F_DIHRESVIOL,
 +    F_CONSTR,
 +    F_CONSTRNC,
 +    F_SETTLE,
 +    F_VSITE2,
 +    F_VSITE3,
 +    F_VSITE3FD,
 +    F_VSITE3FAD,
 +    F_VSITE3OUT,
 +    F_VSITE4FD,
 +    F_VSITE4FDN,
 +    F_VSITEN,
 +    F_COM_PULL,
 +    F_EQM,
 +    F_EPOT,
 +    F_EKIN,
 +    F_ETOT,
 +    F_ECONSERVED,
 +    F_TEMP,
 +    F_VTEMP_NOLONGERUSED,
 +    F_PDISPCORR,
 +    F_PRES,
++    F_DVDL_CONSTR,
 +    F_DVDL,
 +    F_DKDL,
 +    F_DVDL_COUL,
 +    F_DVDL_VDW,
 +    F_DVDL_BONDED,
 +    F_DVDL_RESTRAINT,
 +    F_DVDL_TEMPERATURE, /* not calculated for now, but should just be the energy (NVT) or enthalpy (NPT), or 0 (NVE) */
 +    F_NRE               /* This number is for the total number of energies    */
 +};
 +
 +#define IS_RESTRAINT_TYPE(ifunc) (((ifunc == F_POSRES) || (ifunc == F_DISRES) || (ifunc == F_RESTRBONDS) || (ifunc == F_DISRESVIOL) || (ifunc == F_ORIRES) || (ifunc == F_ORIRESDEV) || (ifunc == F_ANGRES) || (ifunc == F_ANGRESZ) || (ifunc == F_DIHRES)))
 +
 +/* A macro for checking if ftype is an explicit pair-listed LJ or COULOMB
 + * interaction type:
 + * bonded LJ (usually 1-4), or special listed non-bonded for FEP.
 + */
 +#define IS_LISTED_LJ_C(ftype) ((ftype) >= F_LJ14 && (ftype) <= F_LJC_PAIRS_NB)
 +
 +typedef union
 +{
 +    /* Some parameters have A and B values for free energy calculations.
 +     * The B values are not used for regular simulations of course.
 +     * Free Energy for nonbondeds can be computed by changing the atom type.
 +     * The harmonic type is used for all harmonic potentials:
 +     * bonds, angles and improper dihedrals
 +     */
 +    struct {
 +        real a, b, c;
 +    } bham;
 +    struct {
 +        real rA, krA, rB, krB;
 +    } harmonic;
 +    struct {
 +        real klinA, aA, klinB, aB;
 +    } linangle;
 +    struct {
 +        real lowA, up1A, up2A, kA, lowB, up1B, up2B, kB;
 +    } restraint;
 +    /* No free energy supported for cubic bonds, FENE, WPOL or cross terms */
 +    struct {
 +        real b0, kb, kcub;
 +    } cubic;
 +    struct {
 +        real bm, kb;
 +    } fene;
 +    struct {
 +        real r1e, r2e, krr;
 +    } cross_bb;
 +    struct {
 +        real r1e, r2e, r3e, krt;
 +    } cross_ba;
 +    struct {
 +        real thetaA, kthetaA, r13A, kUBA, thetaB, kthetaB, r13B, kUBB;
 +    } u_b;
 +    struct {
 +        real theta, c[5];
 +    } qangle;
 +    struct {
 +        real alpha;
 +    } polarize;
 +    struct {
 +        real alpha, drcut, khyp;
 +    } anharm_polarize;
 +    struct {
 +        real al_x, al_y, al_z, rOH, rHH, rOD;
 +    } wpol;
 +    struct {
 +        real a, alpha1, alpha2, rfac;
 +    } thole;
 +    struct {
 +        real c6, c12;
 +    } lj;
 +    struct {
 +        real c6A, c12A, c6B, c12B;
 +    } lj14;
 +    struct {
 +        real fqq, qi, qj, c6, c12;
 +    } ljc14;
 +    struct {
 +        real qi, qj, c6, c12;
 +    } ljcnb;
 +    /* Proper dihedrals can not have different multiplicity when
 +     * doing free energy calculations, because the potential would not
 +     * be periodic anymore.
 +     */
 +    struct {
 +        real phiA, cpA; int mult; real phiB, cpB;
 +    } pdihs;
 +    struct {
 +        real dA, dB;
 +    } constr;
 +    /* Settle can not be used for Free energy calculations of water bond geometry.
 +     * Use shake (or lincs) instead if you have to change the water bonds.
 +     */
 +    struct {
 +        real doh, dhh;
 +    } settle;
 +    struct {
 +        real b0A, cbA, betaA, b0B, cbB, betaB;
 +    } morse;
 +    struct {
 +        real pos0A[DIM], fcA[DIM], pos0B[DIM], fcB[DIM];
 +    } posres;
 +    struct {
 +        real pos0[DIM], r, k; int geom;
 +    } fbposres;
 +    struct {
 +        real rbcA[NR_RBDIHS], rbcB[NR_RBDIHS];
 +    } rbdihs;
 +    struct {
 +        real a, b, c, d, e, f;
 +    } vsite;
 +    struct {
 +        int  n; real a;
 +    } vsiten;
 +    /* NOTE: npair is only set after reading the tpx file */
 +    struct {
 +        real low, up1, up2, kfac; int type, label, npair;
 +    } disres;
 +    struct {
 +        real phiA, dphiA, kfacA, phiB, dphiB, kfacB;
 +    } dihres;
 +    struct {
 +        int  ex, power, label; real c, obs, kfac;
 +    } orires;
 +    struct {
 +        int  table; real kA; real kB;
 +    } tab;
 +    struct {
 +        real sar, st, pi, gbr, bmlt;
 +    } gb;
 +    struct {
 +        int cmapA, cmapB;
 +    } cmap;
 +    struct {
 +        real buf[MAXFORCEPARAM];
 +    } generic;                                               /* Conversion */
 +} t_iparams;
 +
 +typedef int t_functype;
 +
 +/*
 + * The nonperturbed/perturbed interactions are now separated (sorted) in the
 + * ilist, such that the first 0..(nr_nonperturbed-1) ones are exactly that, and
 + * the remaining ones from nr_nonperturbed..(nr-1) are perturbed bonded
 + * interactions.
 + */
 +typedef struct
 +{
 +    int      nr;              /* Size (number of elements) of iatoms[] */
 +    int      nr_nonperturbed; /* Split point: entries 0..nr_nonperturbed-1 are non-perturbed, nr_nonperturbed..nr-1 are perturbed */
 +    t_iatom *iatoms;          /* Repeated records of a type index followed by the atoms of that interaction */
 +    int      nalloc;          /* presumably the allocated size of iatoms[] -- TODO confirm */
 +} t_ilist;
 +
 +/*
 + * The struct t_ilist defines a list of atoms with their interactions.
 + * General field description:
 + *   int nr
 + *    the size (nr elements) of the interactions array (iatoms[]).
 + *   t_iatom *iatoms
 + *  specifies which atoms are involved in an interaction of a certain
 + *       type. The layout of this array is as follows:
 + *
 + *      +-----+---+---+---+-----+---+---+-----+---+---+---+-----+---+---+...
 + *      |type1|at1|at2|at3|type2|at1|at2|type1|at1|at2|at3|type3|at1|at2|
 + *      +-----+---+---+---+-----+---+---+-----+---+---+---+-----+---+---+...
 + *
 + *  So for interaction type type1 3 atoms are needed, and for type2 and
 + *      type3 only 2. The type identifier is used to select the function to
 + *    calculate the interaction and its actual parameters. This type
 + *    identifier is an index in a params[] and functype[] array.
 + */
 +
 +typedef struct
 +{
 +    real *cmap; /* Has length 4*grid_spacing*grid_spacing, */
 +    /* there are 4 entries for each cmap type (V,dVdx,dVdy,d2dVdxdy) */
 +} cmapdata_t;
 +
 +typedef struct
 +{
 +    int         ngrid;        /* Number of allocated cmap (cmapdata_t ) grids */
 +    int         grid_spacing; /* Grid spacing */
 +    cmapdata_t *cmapdata;     /* Pointer to grid with actual, pre-interpolated data */
 +} gmx_cmap_t;
 +
 +
 +typedef struct
 +{
 +    int         ntypes;
 +    int         atnr;
 +    t_functype *functype;
 +    t_iparams  *iparams;
 +    double      reppow;    /* The repulsion power for VdW: C12*r^-reppow   */
 +    real        fudgeQQ;   /* The scaling factor for Coulomb 1-4: f*q1*q2  */
 +    gmx_cmap_t  cmap_grid; /* The dihedral correction maps                 */
 +} gmx_ffparams_t;
 +
 +enum {
 +    ilsortUNKNOWN, ilsortNO_FE, ilsortFE_UNSORTED, ilsortFE_SORTED
 +};
 +
 +typedef struct
 +{
 +    int         ntypes;    /* Number of elements in functype[] and iparams[] */
 +    int         atnr;      /* Number of atom types */
 +    t_functype *functype;  /* Length ntypes: which function each force type uses */
 +    t_iparams  *iparams;   /* Length ntypes: parameters of each interaction type */
 +    real        fudgeQQ;   /* NOTE(review): gmx_ffparams_t documents its fudgeQQ as the Coulomb 1-4 scaling factor; presumably the same here -- confirm */
 +    gmx_cmap_t  cmap_grid; /* Grid for the dihedral pair correction maps */
 +    t_iparams  *iparams_posres, *iparams_fbposres;              /* Parameters used for position restraints only */
 +    int         iparams_posres_nalloc, iparams_fbposres_nalloc; /* presumably allocated sizes of the two arrays above -- TODO confirm */
 +
 +    t_ilist     il[F_NRE]; /* One interaction list per function type; some may have 0 entries */
 +    int         ilsort;    /* presumably one of the ilsort* enum values -- TODO confirm */
 +} t_idef;
 +
 +/*
 + * The struct t_idef defines all the interactions for the complete
 + * simulation. The structure is setup in such a way that the multinode
 + * version of the program  can use it as easy as the single node version.
 + * General field description:
 + *   int ntypes
 + *    defines the number of elements in functype[] and iparams[].
 + *   int nodeid
 + *      the node id (if parallel machines)
 + *   int atnr
 + *      the number of atomtypes
 + *   t_functype *functype
 + *    array of length ntypes, defines for every force type what type of
 + *      function to use. Every "bond" with the same function but different
 + *    force parameters is a different force type. The type identifier in the
 + *    forceatoms[] array is an index in this array.
 + *   t_iparams *iparams
 + *    array of length ntypes, defines the parameters for every interaction
 + *      type. The type identifier in the actual interaction list
 + *      (ilist[ftype].iatoms[]) is an index in this array.
 + *   gmx_cmap_t cmap_grid
 + *      the grid for the dihedral pair correction maps.
 + *   t_iparams *iparams_posres, *iparams_fbposres
 + *    defines the parameters for position restraints only.
 + *      Position restraints are the only interactions that have different
 + *      parameters (reference positions) for different molecules
 + *      of the same type. ilist[F_POSRES].iatoms[] is an index in this array.
 + *   t_ilist il[F_NRE]
 + *      The list of interactions for each type. Note that some,
 + *      such as LJ and COUL will have 0 entries.
 + */
 +
 +typedef struct {
 +    int   n;      /* n+1 is the number of points */
 +    real  scale;  /* distance between two points */
 +    real *data;   /* the actual table data, per point there are 4 numbers */
 +} bondedtable_t;
 +
 +#ifdef __cplusplus
 +}
 +#endif
 +
 +
 +#endif
index 40a1c5d22e81a9ae67cdf1bfd20edb2b2ac0deea,0000000000000000000000000000000000000000..278c8de8f388246eed1a626ea9393b764ceee60a
mode 100644,000000..100644
--- /dev/null
@@@ -1,954 -1,0 +1,956 @@@
 +/* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
 + *
 + *
 + *                This source code is part of
 + *
 + *                 G   R   O   M   A   C   S
 + *
 + *          GROningen MAchine for Chemical Simulations
 + *
 + *                        VERSION 3.2.0
 + * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
 + * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
 + * Copyright (c) 2001-2004, The GROMACS development team,
 + * check out http://www.gromacs.org for more information.
 +
 + * This program is free software; you can redistribute it and/or
 + * modify it under the terms of the GNU General Public License
 + * as published by the Free Software Foundation; either version 2
 + * of the License, or (at your option) any later version.
 + *
 + * If you want to redistribute modifications, please consider that
 + * scientific software is very special. Version control is crucial -
 + * bugs must be traceable. We will be happy to consider code for
 + * inclusion in the official distribution, but derived work must not
 + * be called official GROMACS. Details are found in the README & COPYING
 + * files - if they are missing, get the official version at www.gromacs.org.
 + *
 + * To help us fund GROMACS development, we humbly ask that you cite
 + * the papers on the package - you can find them in the top README file.
 + *
 + * For more info, check our website at http://www.gromacs.org
 + *
 + * And Hey:
 + * GROwing Monsters And Cloning Shrimps
 + */
 +#ifdef HAVE_CONFIG_H
 +#include <config.h>
 +#endif
 +
 +#include <math.h>
 +#include <string.h>
 +#include <assert.h>
 +#include "sysstuff.h"
 +#include "typedefs.h"
 +#include "macros.h"
 +#include "smalloc.h"
 +#include "macros.h"
 +#include "physics.h"
 +#include "force.h"
 +#include "nonbonded.h"
 +#include "names.h"
 +#include "network.h"
 +#include "pbc.h"
 +#include "ns.h"
 +#include "nrnb.h"
 +#include "bondf.h"
 +#include "mshift.h"
 +#include "txtdump.h"
 +#include "coulomb.h"
 +#include "pme.h"
 +#include "mdrun.h"
 +#include "domdec.h"
 +#include "partdec.h"
 +#include "qmmm.h"
 +#include "gmx_omp_nthreads.h"
 +
 +
 +void ns(FILE              *fp,
 +        t_forcerec        *fr,
 +        rvec               x[],
 +        matrix             box,
 +        gmx_groups_t      *groups,
 +        t_grpopts         *opts,
 +        gmx_localtop_t    *top,
 +        t_mdatoms         *md,
 +        t_commrec         *cr,
 +        t_nrnb            *nrnb,
 +        real              *lambda,
 +        real              *dvdlambda,
 +        gmx_grppairener_t *grppener,
 +        gmx_bool           bFillGrid,
 +        gmx_bool           bDoLongRangeNS)
 +{
 +    char   *ptr;
 +    int     nsearch;
 +
 +
 +    if (!fr->ns.nblist_initialized)
 +    {
 +        init_neighbor_list(fp, fr, md->homenr);
 +    }
 +
 +    if (fr->bTwinRange)
 +    {
 +        fr->nlr = 0;
 +    }
 +
 +    nsearch = search_neighbours(fp, fr, x, box, top, groups, cr, nrnb, md,
 +                                lambda, dvdlambda, grppener,
 +                                bFillGrid, bDoLongRangeNS, TRUE);
 +    if (debug)
 +    {
 +        fprintf(debug, "nsearch = %d\n", nsearch);
 +    }
 +
 +    /* Check whether we have to do dynamic load balancing */
 +    /*if ((nsb->nstDlb > 0) && (mod(step,nsb->nstDlb) == 0))
 +       count_nb(cr,nsb,&(top->blocks[ebCGS]),nns,fr->nlr,
 +       &(top->idef),opts->ngener);
 +     */
 +    if (fr->ns.dump_nl > 0)
 +    {
 +        dump_nblist(fp, cr, fr, fr->ns.dump_nl);
 +    }
 +}
 +
 +static void reduce_thread_forces(int n, rvec *f,
 +                                 tensor vir,
 +                                 real *Vcorr,
 +                                 int efpt_ind, real *dvdl,
 +                                 int nthreads, f_thread_t *f_t)
 +{
 +    int t, i;
 +
 +    /* This reduction can run over any number of threads */
 +#pragma omp parallel for num_threads(gmx_omp_nthreads_get(emntBonded)) private(t) schedule(static)
 +    for (i = 0; i < n; i++)
 +    {
 +        for (t = 1; t < nthreads; t++)
 +        {
 +            rvec_inc(f[i], f_t[t].f[i]);
 +        }
 +    }
 +    for (t = 1; t < nthreads; t++)
 +    {
 +        *Vcorr += f_t[t].Vcorr;
 +        *dvdl  += f_t[t].dvdl[efpt_ind];
 +        m_add(vir, f_t[t].vir, vir);
 +    }
 +}
 +
 +void do_force_lowlevel(FILE       *fplog,   gmx_large_int_t step,
 +                       t_forcerec *fr,      t_inputrec *ir,
 +                       t_idef     *idef,    t_commrec  *cr,
 +                       t_nrnb     *nrnb,    gmx_wallcycle_t wcycle,
 +                       t_mdatoms  *md,
 +                       t_grpopts  *opts,
 +                       rvec       x[],      history_t  *hist,
 +                       rvec       f[],
 +                       rvec       f_longrange[],
 +                       gmx_enerdata_t *enerd,
 +                       t_fcdata   *fcd,
 +                       gmx_mtop_t     *mtop,
 +                       gmx_localtop_t *top,
 +                       gmx_genborn_t *born,
 +                       t_atomtypes *atype,
 +                       gmx_bool       bBornRadii,
 +                       matrix     box,
 +                       t_lambda   *fepvals,
 +                       real       *lambda,
 +                       t_graph    *graph,
 +                       t_blocka   *excl,
 +                       rvec       mu_tot[],
 +                       int        flags,
 +                       float      *cycles_pme)
 +{
 +    int         i, j, status;
 +    int         donb_flags;
 +    gmx_bool    bDoEpot, bSepDVDL, bSB;
 +    int         pme_flags;
 +    matrix      boxs;
 +    rvec        box_size;
 +    real        Vsr, Vlr, Vcorr = 0;
 +    t_pbc       pbc;
 +    real        dvdgb;
 +    char        buf[22];
 +    double      clam_i, vlam_i;
 +    real        dvdl_dum[efptNR], dvdl, dvdl_nb[efptNR], lam_i[efptNR];
 +    real        dvdlsum;
 +
 +#ifdef GMX_MPI
 +    double  t0 = 0.0, t1, t2, t3; /* time measurement for coarse load balancing */
 +#endif
 +
 +#define PRINT_SEPDVDL(s, v, dvdlambda) if (bSepDVDL) {fprintf(fplog, sepdvdlformat, s, v, dvdlambda); }
 +
 +
 +    set_pbc(&pbc, fr->ePBC, box);
 +
 +    /* reset free energy components */
 +    for (i = 0; i < efptNR; i++)
 +    {
 +        dvdl_nb[i]  = 0;
 +        dvdl_dum[i] = 0;
 +    }
 +
 +    /* Reset box */
 +    for (i = 0; (i < DIM); i++)
 +    {
 +        box_size[i] = box[i][i];
 +    }
 +
 +    bSepDVDL = (fr->bSepDVDL && do_per_step(step, ir->nstlog));
 +    debug_gmx();
 +
 +    /* do QMMM first if requested */
 +    if (fr->bQMMM)
 +    {
 +        enerd->term[F_EQM] = calculate_QMMM(cr, x, f, fr, md);
 +    }
 +
 +    if (bSepDVDL)
 +    {
 +        fprintf(fplog, "Step %s: non-bonded V and dVdl for node %d:\n",
 +                gmx_step_str(step, buf), cr->nodeid);
 +    }
 +
 +    /* Call the short range functions all in one go. */
 +
 +#ifdef GMX_MPI
 +    /*#define TAKETIME ((cr->npmenodes) && (fr->timesteps < 12))*/
 +#define TAKETIME FALSE
 +    if (TAKETIME)
 +    {
 +        MPI_Barrier(cr->mpi_comm_mygroup);
 +        t0 = MPI_Wtime();
 +    }
 +#endif
 +
 +    if (ir->nwall)
 +    {
 +        /* foreign lambda component for walls */
 +        dvdl = do_walls(ir, fr, box, md, x, f, lambda[efptVDW],
 +                        enerd->grpp.ener[egLJSR], nrnb);
 +        PRINT_SEPDVDL("Walls", 0.0, dvdl);
 +        enerd->dvdl_lin[efptVDW] += dvdl;
 +    }
 +
 +    /* If doing GB, reset dvda and calculate the Born radii */
 +    if (ir->implicit_solvent)
 +    {
 +        wallcycle_sub_start(wcycle, ewcsNONBONDED);
 +
 +        for (i = 0; i < born->nr; i++)
 +        {
 +            fr->dvda[i] = 0;
 +        }
 +
 +        if (bBornRadii)
 +        {
 +            calc_gb_rad(cr, fr, ir, top, atype, x, &(fr->gblist), born, md, nrnb);
 +        }
 +
 +        wallcycle_sub_stop(wcycle, ewcsNONBONDED);
 +    }
 +
 +    where();
 +    /* We only do non-bonded calculation with group scheme here, the verlet
 +     * calls are done from do_force_cutsVERLET(). */
 +    if (fr->cutoff_scheme == ecutsGROUP && (flags & GMX_FORCE_NONBONDED))
 +    {
 +        donb_flags = 0;
 +        /* Add short-range interactions */
 +        donb_flags |= GMX_NONBONDED_DO_SR;
 +
 +        if (flags & GMX_FORCE_FORCES)
 +        {
 +            donb_flags |= GMX_NONBONDED_DO_FORCE;
 +        }
 +        if (flags & GMX_FORCE_ENERGY)
 +        {
 +            donb_flags |= GMX_NONBONDED_DO_POTENTIAL;
 +        }
 +        if (flags & GMX_FORCE_DO_LR)
 +        {
 +            donb_flags |= GMX_NONBONDED_DO_LR;
 +        }
 +
 +        wallcycle_sub_start(wcycle, ewcsNONBONDED);
 +        do_nonbonded(cr, fr, x, f, f_longrange, md, excl,
 +                     &enerd->grpp, box_size, nrnb,
 +                     lambda, dvdl_nb, -1, -1, donb_flags);
 +
 +        /* If we do foreign lambda and we have soft-core interactions
 +         * we have to recalculate the (non-linear) energies contributions.
 +         */
 +        if (fepvals->n_lambda > 0 && (flags & GMX_FORCE_DHDL) && fepvals->sc_alpha != 0)
 +        {
 +            for (i = 0; i < enerd->n_lambda; i++)
 +            {
 +                for (j = 0; j < efptNR; j++)
 +                {
 +                    lam_i[j] = (i == 0 ? lambda[j] : fepvals->all_lambda[j][i-1]);
 +                }
 +                reset_foreign_enerdata(enerd);
 +                do_nonbonded(cr, fr, x, f, f_longrange, md, excl,
 +                             &(enerd->foreign_grpp), box_size, nrnb,
 +                             lam_i, dvdl_dum, -1, -1,
 +                             (donb_flags & ~GMX_NONBONDED_DO_FORCE) | GMX_NONBONDED_DO_FOREIGNLAMBDA);
 +                sum_epot(&ir->opts, &(enerd->foreign_grpp), enerd->foreign_term);
 +                enerd->enerpart_lambda[i] += enerd->foreign_term[F_EPOT];
 +            }
 +        }
 +        wallcycle_sub_stop(wcycle, ewcsNONBONDED);
 +        where();
 +    }
 +
 +    /* If we are doing GB, calculate bonded forces and apply corrections
 +     * to the solvation forces */
 +    /* MRS: Eventually, may need to include free energy contribution here! */
 +    if (ir->implicit_solvent)
 +    {
 +        wallcycle_sub_start(wcycle, ewcsBONDED);
 +        calc_gb_forces(cr, md, born, top, atype, x, f, fr, idef,
 +                       ir->gb_algorithm, ir->sa_algorithm, nrnb, bBornRadii, &pbc, graph, enerd);
 +        wallcycle_sub_stop(wcycle, ewcsBONDED);
 +    }
 +
 +#ifdef GMX_MPI
 +    if (TAKETIME)
 +    {
 +        t1          = MPI_Wtime();
 +        fr->t_fnbf += t1-t0;
 +    }
 +#endif
 +
 +    if (fepvals->sc_alpha != 0)
 +    {
 +        enerd->dvdl_nonlin[efptVDW] += dvdl_nb[efptVDW];
 +    }
 +    else
 +    {
 +        enerd->dvdl_lin[efptVDW] += dvdl_nb[efptVDW];
 +    }
 +
 +    if (fepvals->sc_alpha != 0)
 +
 +    /* even though coulomb part is linear, we already added it, because we
 +       need to go through the vdw calculation anyway */
 +    {
 +        enerd->dvdl_nonlin[efptCOUL] += dvdl_nb[efptCOUL];
 +    }
 +    else
 +    {
 +        enerd->dvdl_lin[efptCOUL] += dvdl_nb[efptCOUL];
 +    }
 +
 +    Vsr = 0;
 +    if (bSepDVDL)
 +    {
 +        for (i = 0; i < enerd->grpp.nener; i++)
 +        {
 +            Vsr +=
 +                (fr->bBHAM ?
 +                 enerd->grpp.ener[egBHAMSR][i] :
 +                 enerd->grpp.ener[egLJSR][i])
 +                + enerd->grpp.ener[egCOULSR][i] + enerd->grpp.ener[egGB][i];
 +        }
 +        dvdlsum = dvdl_nb[efptVDW] + dvdl_nb[efptCOUL];
 +        PRINT_SEPDVDL("VdW and Coulomb SR particle-p.", Vsr, dvdlsum);
 +    }
 +    debug_gmx();
 +
 +
 +    if (debug)
 +    {
 +        pr_rvecs(debug, 0, "fshift after SR", fr->fshift, SHIFTS);
 +    }
 +
 +    /* Shift the coordinates. Must be done before bonded forces and PPPM,
 +     * but is also necessary for SHAKE and update, therefore it can NOT
 +     * go when no bonded forces have to be evaluated.
 +     */
 +
 +    /* Here sometimes we would not need to shift with NBFonly,
 +     * but we do so anyhow for consistency of the returned coordinates.
 +     */
 +    if (graph)
 +    {
 +        shift_self(graph, box, x);
 +        if (TRICLINIC(box))
 +        {
 +            inc_nrnb(nrnb, eNR_SHIFTX, 2*graph->nnodes);
 +        }
 +        else
 +        {
 +            inc_nrnb(nrnb, eNR_SHIFTX, graph->nnodes);
 +        }
 +    }
 +    /* Check whether we need to do bondeds or correct for exclusions */
 +    if (fr->bMolPBC &&
 +        ((flags & GMX_FORCE_BONDED)
 +         || EEL_RF(fr->eeltype) || EEL_FULL(fr->eeltype)))
 +    {
 +        /* Since all atoms are in the rectangular or triclinic unit-cell,
 +         * only single box vector shifts (2 in x) are required.
 +         */
 +        set_pbc_dd(&pbc, fr->ePBC, cr->dd, TRUE, box);
 +    }
 +    debug_gmx();
 +
 +    if (flags & GMX_FORCE_BONDED)
 +    {
 +        wallcycle_sub_start(wcycle, ewcsBONDED);
 +        calc_bonds(fplog, cr->ms,
 +                   idef, x, hist, f, fr, &pbc, graph, enerd, nrnb, lambda, md, fcd,
 +                   DOMAINDECOMP(cr) ? cr->dd->gatindex : NULL, atype, born,
 +                   flags,
 +                   fr->bSepDVDL && do_per_step(step, ir->nstlog), step);
 +
 +        /* Check if we have to determine energy differences
 +         * at foreign lambda's.
 +         */
 +        if (fepvals->n_lambda > 0 && (flags & GMX_FORCE_DHDL) &&
 +            idef->ilsort != ilsortNO_FE)
 +        {
 +            if (idef->ilsort != ilsortFE_SORTED)
 +            {
 +                gmx_incons("The bonded interactions are not sorted for free energy");
 +            }
 +            for (i = 0; i < enerd->n_lambda; i++)
 +            {
 +                reset_foreign_enerdata(enerd);
 +                for (j = 0; j < efptNR; j++)
 +                {
 +                    lam_i[j] = (i == 0 ? lambda[j] : fepvals->all_lambda[j][i-1]);
 +                }
 +                calc_bonds_lambda(fplog, idef, x, fr, &pbc, graph, &(enerd->foreign_grpp), enerd->foreign_term, nrnb, lam_i, md,
 +                                  fcd, DOMAINDECOMP(cr) ? cr->dd->gatindex : NULL);
 +                sum_epot(&ir->opts, &(enerd->foreign_grpp), enerd->foreign_term);
 +                enerd->enerpart_lambda[i] += enerd->foreign_term[F_EPOT];
 +            }
 +        }
 +        debug_gmx();
 +
 +        wallcycle_sub_stop(wcycle, ewcsBONDED);
 +    }
 +
 +    where();
 +
 +    *cycles_pme = 0;
 +    if (EEL_FULL(fr->eeltype))
 +    {
 +        bSB = (ir->nwall == 2);
 +        if (bSB)
 +        {
 +            copy_mat(box, boxs);
 +            svmul(ir->wall_ewald_zfac, boxs[ZZ], boxs[ZZ]);
 +            box_size[ZZ] *= ir->wall_ewald_zfac;
 +        }
 +
 +        clear_mat(fr->vir_el_recip);
 +
 +        if (fr->bEwald)
 +        {
 +            Vcorr = 0;
 +            dvdl  = 0;
 +
 +            /* With the Verlet scheme exclusion forces are calculated
 +             * in the non-bonded kernel.
 +             */
 +            /* The TPI molecule does not have exclusions with the rest
 +             * of the system and no intra-molecular PME grid contributions
 +             * will be calculated in gmx_pme_calc_energy.
 +             */
 +            if ((ir->cutoff_scheme == ecutsGROUP && fr->n_tpi == 0) ||
 +                ir->ewald_geometry != eewg3D ||
 +                ir->epsilon_surface != 0)
 +            {
 +                int nthreads, t;
 +
 +                wallcycle_sub_start(wcycle, ewcsEWALD_CORRECTION);
 +
 +                if (fr->n_tpi > 0)
 +                {
 +                    gmx_fatal(FARGS, "TPI with PME currently only works in a 3D geometry with tin-foil boundary conditions");
 +                }
 +
 +                nthreads = gmx_omp_nthreads_get(emntBonded);
 +#pragma omp parallel for num_threads(nthreads) schedule(static)
 +                for (t = 0; t < nthreads; t++)
 +                {
 +                    int     s, e, i;
 +                    rvec   *fnv;
 +                    tensor *vir;
 +                    real   *Vcorrt, *dvdlt;
 +                    if (t == 0)
 +                    {
 +                        fnv    = fr->f_novirsum;
 +                        vir    = &fr->vir_el_recip;
 +                        Vcorrt = &Vcorr;
 +                        dvdlt  = &dvdl;
 +                    }
 +                    else
 +                    {
 +                        fnv    = fr->f_t[t].f;
 +                        vir    = &fr->f_t[t].vir;
 +                        Vcorrt = &fr->f_t[t].Vcorr;
 +                        dvdlt  = &fr->f_t[t].dvdl[efptCOUL];
 +                        for (i = 0; i < fr->natoms_force; i++)
 +                        {
 +                            clear_rvec(fnv[i]);
 +                        }
 +                        clear_mat(*vir);
 +                    }
 +                    *dvdlt  = 0;
 +                    *Vcorrt =
 +                        ewald_LRcorrection(fplog,
 +                                           fr->excl_load[t], fr->excl_load[t+1],
 +                                           cr, t, fr,
 +                                           md->chargeA,
 +                                           md->nChargePerturbed ? md->chargeB : NULL,
 +                                           ir->cutoff_scheme != ecutsVERLET,
 +                                           excl, x, bSB ? boxs : box, mu_tot,
 +                                           ir->ewald_geometry,
 +                                           ir->epsilon_surface,
 +                                           fnv, *vir,
 +                                           lambda[efptCOUL], dvdlt);
 +                }
 +                if (nthreads > 1)
 +                {
 +                    reduce_thread_forces(fr->natoms_force, fr->f_novirsum,
 +                                         fr->vir_el_recip,
 +                                         &Vcorr, efptCOUL, &dvdl,
 +                                         nthreads, fr->f_t);
 +                }
 +
 +                wallcycle_sub_stop(wcycle, ewcsEWALD_CORRECTION);
 +            }
 +
 +            if (fr->n_tpi == 0)
 +            {
 +                Vcorr += ewald_charge_correction(cr, fr, lambda[efptCOUL], box,
 +                                                 &dvdl, fr->vir_el_recip);
 +            }
 +
 +            PRINT_SEPDVDL("Ewald excl./charge/dip. corr.", Vcorr, dvdl);
 +            enerd->dvdl_lin[efptCOUL] += dvdl;
 +        }
 +
 +        status = 0;
 +        Vlr    = 0;
 +        dvdl   = 0;
 +        switch (fr->eeltype)
 +        {
 +            case eelPME:
 +            case eelPMESWITCH:
 +            case eelPMEUSER:
 +            case eelPMEUSERSWITCH:
 +            case eelP3M_AD:
 +                if (cr->duty & DUTY_PME)
 +                {
 +                    assert(fr->n_tpi >= 0);
 +                    if (fr->n_tpi == 0 || (flags & GMX_FORCE_STATECHANGED))
 +                    {
 +                        pme_flags = GMX_PME_SPREAD_Q | GMX_PME_SOLVE;
 +                        if (flags & GMX_FORCE_FORCES)
 +                        {
 +                            pme_flags |= GMX_PME_CALC_F;
 +                        }
 +                        if (flags & (GMX_FORCE_VIRIAL | GMX_FORCE_ENERGY))
 +                        {
 +                            pme_flags |= GMX_PME_CALC_ENER_VIR;
 +                        }
 +                        if (fr->n_tpi > 0)
 +                        {
 +                            /* We don't calculate f, but we do want the potential */
 +                            pme_flags |= GMX_PME_CALC_POT;
 +                        }
 +                        wallcycle_start(wcycle, ewcPMEMESH);
 +                        status = gmx_pme_do(fr->pmedata,
 +                                            md->start, md->homenr - fr->n_tpi,
 +                                            x, fr->f_novirsum,
 +                                            md->chargeA, md->chargeB,
 +                                            bSB ? boxs : box, cr,
 +                                            DOMAINDECOMP(cr) ? dd_pme_maxshift_x(cr->dd) : 0,
 +                                            DOMAINDECOMP(cr) ? dd_pme_maxshift_y(cr->dd) : 0,
 +                                            nrnb, wcycle,
 +                                            fr->vir_el_recip, fr->ewaldcoeff,
 +                                            &Vlr, lambda[efptCOUL], &dvdl,
 +                                            pme_flags);
 +                        *cycles_pme = wallcycle_stop(wcycle, ewcPMEMESH);
 +
 +                        /* We should try to do as little computation after
 +                         * this as possible, because parallel PME synchronizes
 +                         * the nodes, so we want all load imbalance of the rest
 +                         * of the force calculation to be before the PME call.
 +                         * DD load balancing is done on the whole time of
 +                         * the force call (without PME).
 +                         */
 +                    }
 +                    if (fr->n_tpi > 0)
 +                    {
 +                        /* Determine the PME grid energy of the test molecule
 +                         * with the PME grid potential of the other charges.
 +                         */
 +                        gmx_pme_calc_energy(fr->pmedata, fr->n_tpi,
 +                                            x + md->homenr - fr->n_tpi,
 +                                            md->chargeA + md->homenr - fr->n_tpi,
 +                                            &Vlr);
 +                    }
 +                    PRINT_SEPDVDL("PME mesh", Vlr, dvdl);
 +                }
 +                break;
 +            case eelEWALD:
 +                Vlr = do_ewald(fplog, FALSE, ir, x, fr->f_novirsum,
 +                               md->chargeA, md->chargeB,
 +                               box_size, cr, md->homenr,
 +                               fr->vir_el_recip, fr->ewaldcoeff,
 +                               lambda[efptCOUL], &dvdl, fr->ewald_table);
 +                PRINT_SEPDVDL("Ewald long-range", Vlr, dvdl);
 +                break;
 +            default:
 +                gmx_fatal(FARGS, "No such electrostatics method implemented %s",
 +                          eel_names[fr->eeltype]);
 +        }
 +        if (status != 0)
 +        {
 +            gmx_fatal(FARGS, "Error %d in long range electrostatics routine %s",
 +                      status, EELTYPE(fr->eeltype));
 +        }
 +        /* Note that with separate PME nodes we get the real energies later */
 +        enerd->dvdl_lin[efptCOUL] += dvdl;
 +        enerd->term[F_COUL_RECIP]  = Vlr + Vcorr;
 +        if (debug)
 +        {
 +            fprintf(debug, "Vlr = %g, Vcorr = %g, Vlr_corr = %g\n",
 +                    Vlr, Vcorr, enerd->term[F_COUL_RECIP]);
 +            pr_rvecs(debug, 0, "vir_el_recip after corr", fr->vir_el_recip, DIM);
 +            pr_rvecs(debug, 0, "fshift after LR Corrections", fr->fshift, SHIFTS);
 +        }
 +    }
 +    else
 +    {
 +        if (EEL_RF(fr->eeltype))
 +        {
 +            /* With the Verlet scheme exclusion forces are calculated
 +             * in the non-bonded kernel.
 +             */
 +            if (ir->cutoff_scheme != ecutsVERLET && fr->eeltype != eelRF_NEC)
 +            {
 +                dvdl                   = 0;
 +                enerd->term[F_RF_EXCL] =
 +                    RF_excl_correction(fplog, fr, graph, md, excl, x, f,
 +                                       fr->fshift, &pbc, lambda[efptCOUL], &dvdl);
 +            }
 +
 +            enerd->dvdl_lin[efptCOUL] += dvdl;
 +            PRINT_SEPDVDL("RF exclusion correction",
 +                          enerd->term[F_RF_EXCL], dvdl);
 +        }
 +    }
 +    where();
 +    debug_gmx();
 +
 +    if (debug)
 +    {
 +        print_nrnb(debug, nrnb);
 +    }
 +    debug_gmx();
 +
 +#ifdef GMX_MPI
 +    if (TAKETIME)
 +    {
 +        t2 = MPI_Wtime();
 +        MPI_Barrier(cr->mpi_comm_mygroup);
 +        t3          = MPI_Wtime();
 +        fr->t_wait += t3-t2;
 +        if (fr->timesteps == 11)
 +        {
 +            fprintf(stderr, "* PP load balancing info: node %d, step %s, rel wait time=%3.0f%% , load string value: %7.2f\n",
 +                    cr->nodeid, gmx_step_str(fr->timesteps, buf),
 +                    100*fr->t_wait/(fr->t_wait+fr->t_fnbf),
 +                    (fr->t_fnbf+fr->t_wait)/fr->t_fnbf);
 +        }
 +        fr->timesteps++;
 +    }
 +#endif
 +
 +    if (debug)
 +    {
 +        pr_rvecs(debug, 0, "fshift after bondeds", fr->fshift, SHIFTS);
 +    }
 +
 +}
 +
 +void init_enerdata(int ngener, int n_lambda, gmx_enerdata_t *enerd)
 +{
 +    int i, n2;
 +
 +    for (i = 0; i < F_NRE; i++)
 +    {
 +        enerd->term[i]         = 0;
 +        enerd->foreign_term[i] = 0;
 +    }
 +
 +
 +    for (i = 0; i < efptNR; i++)
 +    {
 +        enerd->dvdl_lin[i]     = 0;
 +        enerd->dvdl_nonlin[i]  = 0;
 +    }
 +
 +    n2 = ngener*ngener;
 +    if (debug)
 +    {
 +        fprintf(debug, "Creating %d sized group matrix for energies\n", n2);
 +    }
 +    enerd->grpp.nener         = n2;
 +    enerd->foreign_grpp.nener = n2;
 +    for (i = 0; (i < egNR); i++)
 +    {
 +        snew(enerd->grpp.ener[i], n2);
 +        snew(enerd->foreign_grpp.ener[i], n2);
 +    }
 +
 +    if (n_lambda)
 +    {
 +        enerd->n_lambda = 1 + n_lambda;
 +        snew(enerd->enerpart_lambda, enerd->n_lambda);
 +    }
 +    else
 +    {
 +        enerd->n_lambda = 0;
 +    }
 +}
 +
 +void destroy_enerdata(gmx_enerdata_t *enerd)
 +{
 +    int i;
 +
 +    for (i = 0; (i < egNR); i++)
 +    {
 +        sfree(enerd->grpp.ener[i]);
 +    }
 +
 +    for (i = 0; (i < egNR); i++)
 +    {
 +        sfree(enerd->foreign_grpp.ener[i]);
 +    }
 +
 +    if (enerd->n_lambda)
 +    {
 +        sfree(enerd->enerpart_lambda);
 +    }
 +}
 +
 +static real sum_v(int n, real v[])
 +{
 +    real t;
 +    int  i;
 +
 +    t = 0.0;
 +    for (i = 0; (i < n); i++)
 +    {
 +        t = t + v[i];
 +    }
 +
 +    return t;
 +}
 +
 +void sum_epot(t_grpopts *opts, gmx_grppairener_t *grpp, real *epot)
 +{
 +    int i;
 +
 +    /* Accumulate energies */
 +    epot[F_COUL_SR]  = sum_v(grpp->nener, grpp->ener[egCOULSR]);
 +    epot[F_LJ]       = sum_v(grpp->nener, grpp->ener[egLJSR]);
 +    epot[F_LJ14]     = sum_v(grpp->nener, grpp->ener[egLJ14]);
 +    epot[F_COUL14]   = sum_v(grpp->nener, grpp->ener[egCOUL14]);
 +    epot[F_COUL_LR]  = sum_v(grpp->nener, grpp->ener[egCOULLR]);
 +    epot[F_LJ_LR]    = sum_v(grpp->nener, grpp->ener[egLJLR]);
 +    /* We have already added 1-2,1-3, and 1-4 terms to F_GBPOL */
 +    epot[F_GBPOL]   += sum_v(grpp->nener, grpp->ener[egGB]);
 +
 +/* lattice part of LR doesnt belong to any group
 + * and has been added earlier
 + */
 +    epot[F_BHAM]     = sum_v(grpp->nener, grpp->ener[egBHAMSR]);
 +    epot[F_BHAM_LR]  = sum_v(grpp->nener, grpp->ener[egBHAMLR]);
 +
 +    epot[F_EPOT] = 0;
 +    for (i = 0; (i < F_EPOT); i++)
 +    {
 +        if (i != F_DISRESVIOL && i != F_ORIRESDEV)
 +        {
 +            epot[F_EPOT] += epot[i];
 +        }
 +    }
 +}
 +
 +void sum_dhdl(gmx_enerdata_t *enerd, real *lambda, t_lambda *fepvals)
 +{
 +    int    i, j, index;
 +    double dlam;
 +
 +    enerd->dvdl_lin[efptVDW] += enerd->term[F_DVDL_VDW];  /* include dispersion correction */
 +    enerd->term[F_DVDL]       = 0.0;
 +    for (i = 0; i < efptNR; i++)
 +    {
 +        if (fepvals->separate_dvdl[i])
 +        {
 +            /* could this be done more readably/compactly? */
 +            switch (i)
 +            {
 +                case (efptMASS):
 +                    index = F_DKDL;
 +                    break;
 +                case (efptCOUL):
 +                    index = F_DVDL_COUL;
 +                    break;
 +                case (efptVDW):
 +                    index = F_DVDL_VDW;
 +                    break;
 +                case (efptBONDED):
 +                    index = F_DVDL_BONDED;
 +                    break;
 +                case (efptRESTRAINT):
 +                    index = F_DVDL_RESTRAINT;
 +                    break;
 +                default:
 +                    index = F_DVDL;
 +                    break;
 +            }
 +            enerd->term[index] = enerd->dvdl_lin[i] + enerd->dvdl_nonlin[i];
 +            if (debug)
 +            {
 +                fprintf(debug, "dvdl-%s[%2d]: %f: non-linear %f + linear %f\n",
 +                        efpt_names[i], i, enerd->term[index], enerd->dvdl_nonlin[i], enerd->dvdl_lin[i]);
 +            }
 +        }
 +        else
 +        {
 +            enerd->term[F_DVDL] += enerd->dvdl_lin[i] + enerd->dvdl_nonlin[i];
 +            if (debug)
 +            {
 +                fprintf(debug, "dvd-%sl[%2d]: %f: non-linear %f + linear %f\n",
 +                        efpt_names[0], i, enerd->term[F_DVDL], enerd->dvdl_nonlin[i], enerd->dvdl_lin[i]);
 +            }
 +        }
 +    }
 +
 +    /* Notes on the foreign lambda free energy difference evaluation:
 +     * Adding the potential and ekin terms that depend linearly on lambda
 +     * as delta lam * dvdl to the energy differences is exact.
 +     * For the constraints this is not exact, but we have no other option
 +     * without literally changing the lengths and reevaluating the energies at each step.
 +     * (try to remedy this post 4.6 - MRS)
 +     * For the non-bonded LR term we assume that the soft-core (if present)
 +     * no longer affects the energy beyond the short-range cut-off,
 +     * which is a very good approximation (except for exotic settings).
 +     * (investigate how to overcome this post 4.6 - MRS)
 +     */
++    enerd->term[F_DVDL_BONDED] += enerd->term[F_DVDL_CONSTR];
++    enerd->term[F_DVDL_CONSTR] = 0;
 +
 +    for (i = 0; i < fepvals->n_lambda; i++)
 +    {                                         /* note we are iterating over fepvals here!
 +                                                 For the current lam, dlam = 0 automatically,
 +                                                 so we don't need to add anything to the
 +                                                 enerd->enerpart_lambda[0] */
 +
 +        /* we don't need to worry about dvdl_lin contributions to dE at
 +           current lambda, because the contributions to the current
 +           lambda are automatically zeroed */
 +
 +        for (j = 0; j < efptNR; j++)
 +        {
 +            /* Note that this loop is over all dhdl components, not just the separated ones */
 +            dlam = (fepvals->all_lambda[j][i]-lambda[j]);
 +            enerd->enerpart_lambda[i+1] += dlam*enerd->dvdl_lin[j];
 +            if (debug)
 +            {
 +                fprintf(debug, "enerdiff lam %g: (%15s), non-linear %f linear %f*%f\n",
 +                        fepvals->all_lambda[j][i], efpt_names[j],
 +                        (enerd->enerpart_lambda[i+1] - enerd->enerpart_lambda[0]),
 +                        dlam, enerd->dvdl_lin[j]);
 +            }
 +        }
 +    }
 +}
 +
 +
 +void reset_foreign_enerdata(gmx_enerdata_t *enerd)
 +{
 +    int  i, j;
 +
 +    /* First reset all foreign energy components.  Foreign energies always called on
 +       neighbor search steps */
 +    for (i = 0; (i < egNR); i++)
 +    {
 +        for (j = 0; (j < enerd->grpp.nener); j++)
 +        {
 +            enerd->foreign_grpp.ener[i][j] = 0.0;
 +        }
 +    }
 +
 +    /* potential energy components */
 +    for (i = 0; (i <= F_EPOT); i++)
 +    {
 +        enerd->foreign_term[i] = 0.0;
 +    }
 +}
 +
 +void reset_enerdata(t_grpopts *opts,
 +                    t_forcerec *fr, gmx_bool bNS,
 +                    gmx_enerdata_t *enerd,
 +                    gmx_bool bMaster)
 +{
 +    gmx_bool bKeepLR;
 +    int      i, j;
 +
 +    /* First reset all energy components, except for the long range terms
 +     * on the master at non neighbor search steps, since the long range
 +     * terms have already been summed at the last neighbor search step.
 +     */
 +    bKeepLR = (fr->bTwinRange && !bNS);
 +    for (i = 0; (i < egNR); i++)
 +    {
 +        if (!(bKeepLR && bMaster && (i == egCOULLR || i == egLJLR)))
 +        {
 +            for (j = 0; (j < enerd->grpp.nener); j++)
 +            {
 +                enerd->grpp.ener[i][j] = 0.0;
 +            }
 +        }
 +    }
 +    for (i = 0; i < efptNR; i++)
 +    {
 +        enerd->dvdl_lin[i]    = 0.0;
 +        enerd->dvdl_nonlin[i] = 0.0;
 +    }
 +
 +    /* Normal potential energy components */
 +    for (i = 0; (i <= F_EPOT); i++)
 +    {
 +        enerd->term[i] = 0.0;
 +    }
 +    /* Initialize the dVdlambda term with the long range contribution */
 +    /* Initialize the dvdl term with the long range contribution */
 +    enerd->term[F_DVDL]            = 0.0;
 +    enerd->term[F_DVDL_COUL]       = 0.0;
 +    enerd->term[F_DVDL_VDW]        = 0.0;
 +    enerd->term[F_DVDL_BONDED]     = 0.0;
 +    enerd->term[F_DVDL_RESTRAINT]  = 0.0;
 +    enerd->term[F_DKDL]            = 0.0;
 +    if (enerd->n_lambda > 0)
 +    {
 +        for (i = 0; i < enerd->n_lambda; i++)
 +        {
 +            enerd->enerpart_lambda[i] = 0.0;
 +        }
 +    }
 +    /* reset foreign energy data - separate function since we also call it elsewhere */
 +    reset_foreign_enerdata(enerd);
 +}
index 359ec5f8e538ea6b209a5dea348705fb6828e7b7,0000000000000000000000000000000000000000..cd99ec2a0f7f0859337fc889201bacba55536208
mode 100644,000000..100644
--- /dev/null
@@@ -1,2864 -1,0 +1,2864 @@@
-     real dvdlambda;
 +/* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
 + *
 + *
 + *                This source code is part of
 + *
 + *                 G   R   O   M   A   C   S
 + *
 + *          GROningen MAchine for Chemical Simulations
 + *
 + *                        VERSION 3.2.0
 + * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
 + * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
 + * Copyright (c) 2001-2004, The GROMACS development team,
 + * check out http://www.gromacs.org for more information.
 +
 + * This program is free software; you can redistribute it and/or
 + * modify it under the terms of the GNU General Public License
 + * as published by the Free Software Foundation; either version 2
 + * of the License, or (at your option) any later version.
 + *
 + * If you want to redistribute modifications, please consider that
 + * scientific software is very special. Version control is crucial -
 + * bugs must be traceable. We will be happy to consider code for
 + * inclusion in the official distribution, but derived work must not
 + * be called official GROMACS. Details are found in the README & COPYING
 + * files - if they are missing, get the official version at www.gromacs.org.
 + *
 + * To help us fund GROMACS development, we humbly ask that you cite
 + * the papers on the package - you can find them in the top README file.
 + *
 + * For more info, check our website at http://www.gromacs.org
 + *
 + * And Hey:
 + * GROwing Monsters And Cloning Shrimps
 + */
 +#ifdef HAVE_CONFIG_H
 +#include <config.h>
 +#endif
 +
 +#include <string.h>
 +#include <time.h>
 +#include <math.h>
 +#include "sysstuff.h"
 +#include "string2.h"
 +#include "network.h"
 +#include "confio.h"
 +#include "copyrite.h"
 +#include "smalloc.h"
 +#include "nrnb.h"
 +#include "main.h"
 +#include "force.h"
 +#include "macros.h"
 +#include "random.h"
 +#include "names.h"
 +#include "gmx_fatal.h"
 +#include "txtdump.h"
 +#include "typedefs.h"
 +#include "update.h"
 +#include "constr.h"
 +#include "vec.h"
 +#include "statutil.h"
 +#include "tgroup.h"
 +#include "mdebin.h"
 +#include "vsite.h"
 +#include "force.h"
 +#include "mdrun.h"
 +#include "md_support.h"
 +#include "domdec.h"
 +#include "partdec.h"
 +#include "trnio.h"
 +#include "mdatoms.h"
 +#include "ns.h"
 +#include "gmx_wallcycle.h"
 +#include "mtop_util.h"
 +#include "gmxfio.h"
 +#include "pme.h"
 +#include "bondf.h"
 +#include "gmx_omp_nthreads.h"
 +
 +
 +#include "gromacs/linearalgebra/mtxio.h"
 +#include "gromacs/linearalgebra/sparsematrix.h"
 +
 +/* State carried by the energy minimizers in this file: a simulation state
 + * together with the forces and the force statistics derived from them.
 + */
 +typedef struct {
 +    /* Simulation state; x, box and lambda are the members used in init_em */
 +    t_state  s;
 +    /* Force array, the input of get_state_f_norm_max */
 +    rvec    *f;
 +    /* Presumably the potential energy of this state - not set in this part of the file */
 +    real     epot;
 +    /* Square root of the summed squared forces, set by get_state_f_norm_max */
 +    real     fnorm;
 +    /* Magnitude of the largest single-atom force, set by get_state_f_norm_max */
 +    real     fmax;
 +    /* Index of the atom with the largest force (-1 when none was found) */
 +    int      a_fmax;
 +} em_state_t;
 +
 +static em_state_t *init_em_state()
 +{
 +    em_state_t *ems;
 +
 +    snew(ems, 1);
 +
 +    /* does this need to be here?  Should the array be declared differently (staticaly)in the state definition? */
 +    snew(ems->s.lambda, efptNR);
 +
 +    return ems;
 +}
 +
 +static void print_em_start(FILE *fplog, t_commrec *cr, gmx_runtime_t *runtime,
 +                           gmx_wallcycle_t wcycle,
 +                           const char *name)
 +{
 +    char buf[STRLEN];
 +
 +    runtime_start(runtime);
 +
 +    sprintf(buf, "Started %s", name);
 +    print_date_and_time(fplog, cr->nodeid, buf, NULL);
 +
 +    wallcycle_start(wcycle, ewcRUN);
 +}
 +static void em_time_end(FILE *fplog, t_commrec *cr, gmx_runtime_t *runtime,
 +                        gmx_wallcycle_t wcycle)
 +{
 +    wallcycle_stop(wcycle, ewcRUN);
 +
 +    runtime_end(runtime);
 +}
 +
 +static void sp_header(FILE *out, const char *minimizer, real ftol, int nsteps)
 +{
 +    fprintf(out, "\n");
 +    fprintf(out, "%s:\n", minimizer);
 +    fprintf(out, "   Tolerance (Fmax)   = %12.5e\n", ftol);
 +    fprintf(out, "   Number of steps    = %12d\n", nsteps);
 +}
 +
 +static void warn_step(FILE *fp, real ftol, gmx_bool bLastStep, gmx_bool bConstrain)
 +{
 +    char buffer[2048];
 +    if (bLastStep)
 +    {
 +        sprintf(buffer,
 +                "\nEnergy minimization reached the maximum number"
 +                "of steps before the forces reached the requested"
 +                "precision Fmax < %g.\n", ftol);
 +    }
 +    else
 +    {
 +        sprintf(buffer,
 +                "\nEnergy minimization has stopped, but the forces have"
 +                "not converged to the requested precision Fmax < %g (which"
 +                "may not be possible for your system). It stopped"
 +                "because the algorithm tried to make a new step whose size"
 +                "was too small, or there was no change in the energy since"
 +                "last step. Either way, we regard the minimization as"
 +                "converged to within the available machine precision,"
 +                "given your starting configuration and EM parameters.\n%s%s",
 +                ftol,
 +                sizeof(real) < sizeof(double) ?
 +                "\nDouble precision normally gives you higher accuracy, but"
 +                "this is often not needed for preparing to run molecular"
 +                "dynamics.\n" :
 +                "",
 +                bConstrain ?
 +                "You might need to increase your constraint accuracy, or turn\n"
 +                "off constraints altogether (set constraints = none in mdp file)\n" :
 +                "");
 +    }
 +    fputs(wrap_lines(buffer, 78, 0, FALSE), fp);
 +}
 +
 +
 +
 +static void print_converged(FILE *fp, const char *alg, real ftol,
 +                            gmx_large_int_t count, gmx_bool bDone, gmx_large_int_t nsteps,
 +                            real epot, real fmax, int nfmax, real fnorm)
 +{
 +    char buf[STEPSTRSIZE];
 +
 +    if (bDone)
 +    {
 +        fprintf(fp, "\n%s converged to Fmax < %g in %s steps\n",
 +                alg, ftol, gmx_step_str(count, buf));
 +    }
 +    else if (count < nsteps)
 +    {
 +        fprintf(fp, "\n%s converged to machine precision in %s steps,\n"
 +                "but did not reach the requested Fmax < %g.\n",
 +                alg, gmx_step_str(count, buf), ftol);
 +    }
 +    else
 +    {
 +        fprintf(fp, "\n%s did not converge to Fmax < %g in %s steps.\n",
 +                alg, ftol, gmx_step_str(count, buf));
 +    }
 +
 +#ifdef GMX_DOUBLE
 +    fprintf(fp, "Potential Energy  = %21.14e\n", epot);
 +    fprintf(fp, "Maximum force     = %21.14e on atom %d\n", fmax, nfmax+1);
 +    fprintf(fp, "Norm of force     = %21.14e\n", fnorm);
 +#else
 +    fprintf(fp, "Potential Energy  = %14.7e\n", epot);
 +    fprintf(fp, "Maximum force     = %14.7e on atom %d\n", fmax, nfmax+1);
 +    fprintf(fp, "Norm of force     = %14.7e\n", fnorm);
 +#endif
 +}
 +
 +/* Compute the norm of the force array f over the home atoms and locate the
 + * atom with the largest force.
 + *
 + * cr      - communication record; selects the parallel reduction and the
 + *           local-to-global index translation
 + * opts    - supplies nFreeze[group][dim], the frozen dimensions per group
 + * mdatoms - supplies the home atom range (start, homenr) and, optionally,
 + *           the per-atom freeze group in cFREEZE
 + * f       - forces, indexed by local atom
 + * fnorm   - if not NULL, receives the square root of the summed squared
 + *           force components
 + * fmax    - if not NULL, receives the magnitude of the largest atom force
 + * a_fmax  - if not NULL, receives the index of that atom, or -1 when no
 + *           atom has a non-zero (unfrozen) force
 + */
 +static void get_f_norm_max(t_commrec *cr,
 +                           t_grpopts *opts, t_mdatoms *mdatoms, rvec *f,
 +                           real *fnorm, real *fmax, int *a_fmax)
 +{
 +    double fnorm2, *sum;
 +    real   fmax2, fmax2_0, fam;
 +    int    la_max, a_max, start, end, i, m, gf;
 +
 +    /* This routine finds the largest force and returns it.
 +     * On parallel machines the global max is taken.
 +     */
 +    fnorm2 = 0;
 +    fmax2  = 0;
 +    la_max = -1;
 +    gf     = 0;
 +    start  = mdatoms->start;
 +    end    = mdatoms->homenr + start;
 +    if (mdatoms->cFREEZE)
 +    {
 +        /* Freeze groups present: frozen dimensions do not contribute */
 +        for (i = start; i < end; i++)
 +        {
 +            gf  = mdatoms->cFREEZE[i];
 +            fam = 0;
 +            for (m = 0; m < DIM; m++)
 +            {
 +                if (!opts->nFreeze[gf][m])
 +                {
 +                    fam += sqr(f[i][m]);
 +                }
 +            }
 +            fnorm2 += fam;
 +            if (fam > fmax2)
 +            {
 +                fmax2  = fam;
 +                la_max = i;
 +            }
 +        }
 +    }
 +    else
 +    {
 +        /* No freeze groups: use the full squared norm of each force */
 +        for (i = start; i < end; i++)
 +        {
 +            fam     = norm2(f[i]);
 +            fnorm2 += fam;
 +            if (fam > fmax2)
 +            {
 +                fmax2  = fam;
 +                la_max = i;
 +            }
 +        }
 +    }
 +
 +    /* With domain decomposition, translate the local index to a global one */
 +    if (la_max >= 0 && DOMAINDECOMP(cr))
 +    {
 +        a_max = cr->dd->gatindex[la_max];
 +    }
 +    else
 +    {
 +        a_max = la_max;
 +    }
 +    if (PAR(cr))
 +    {
 +        /* Layout of sum: for node n, slot 2n holds its largest squared
 +         * force and slot 2n+1 the matching atom index; the last slot
 +         * holds the squared norm.  snew presumably zero-fills the array
 +         * and gmx_sumd presumably adds it element-wise over all nodes, so
 +         * that afterwards every node sees every node's pair and the
 +         * total squared norm - confirm against their definitions.
 +         */
 +        snew(sum, 2*cr->nnodes+1);
 +        sum[2*cr->nodeid]   = fmax2;
 +        sum[2*cr->nodeid+1] = a_max;
 +        sum[2*cr->nnodes]   = fnorm2;
 +        gmx_sumd(2*cr->nnodes+1, sum, cr);
 +        fnorm2 = sum[2*cr->nnodes];
 +        /* Determine the global maximum */
 +        for (i = 0; i < cr->nnodes; i++)
 +        {
 +            if (sum[2*i] > fmax2)
 +            {
 +                fmax2 = sum[2*i];
 +                /* The index travelled as a double; round it back to int */
 +                a_max = (int)(sum[2*i+1] + 0.5);
 +            }
 +        }
 +        sfree(sum);
 +    }
 +
 +    /* Every output is optional */
 +    if (fnorm)
 +    {
 +        *fnorm = sqrt(fnorm2);
 +    }
 +    if (fmax)
 +    {
 +        *fmax  = sqrt(fmax2);
 +    }
 +    if (a_fmax)
 +    {
 +        *a_fmax = a_max;
 +    }
 +}
 +
 +static void get_state_f_norm_max(t_commrec *cr,
 +                                 t_grpopts *opts, t_mdatoms *mdatoms,
 +                                 em_state_t *ems)
 +{
 +    get_f_norm_max(cr, opts, mdatoms, ems->f, &ems->fnorm, &ems->fmax, &ems->a_fmax);
 +}
 +
 +/* Common set-up for the energy minimizers: prepare the local state and
 + * topology, constrain the starting coordinates when requested, and create
 + * the output and energy bookkeeping structures.
 + *
 + * Outputs set here: *top, *f, *f_global, *graph, *gstat (parallel runs
 + * only), *outf, *enerd and, when mdebin is not NULL, *mdebin.  ems
 + * receives the local state and force array, mu_tot is cleared and
 + * fr->shift_vec is recomputed from the box.
 + */
 +void init_em(FILE *fplog, const char *title,
 +             t_commrec *cr, t_inputrec *ir,
 +             t_state *state_global, gmx_mtop_t *top_global,
 +             em_state_t *ems, gmx_localtop_t **top,
 +             rvec **f, rvec **f_global,
 +             t_nrnb *nrnb, rvec mu_tot,
 +             t_forcerec *fr, gmx_enerdata_t **enerd,
 +             t_graph **graph, t_mdatoms *mdatoms, gmx_global_stat_t *gstat,
 +             gmx_vsite_t *vsite, gmx_constr_t constr,
 +             int nfile, const t_filenm fnm[],
 +             gmx_mdoutf_t **outf, t_mdebin **mdebin)
 +{
 +    int  start, homenr, i;
-             dvdlambda = 0;
++    real dvdl_constr;
 +
 +    if (fplog)
 +    {
 +        fprintf(fplog, "Initiating %s\n", title);
 +    }
 +
 +    state_global->ngtc = 0;
 +
 +    /* Initialize lambda variables */
 +    initialize_lambdas(fplog, ir, &(state_global->fep_state), state_global->lambda, NULL);
 +
 +    init_nrnb(nrnb);
 +
 +    if (DOMAINDECOMP(cr))
 +    {
 +        /* Domain decomposition: local topology and state are created
 +         * empty and then filled by partitioning the global system.
 +         */
 +        *top = dd_init_local_top(top_global);
 +
 +        dd_init_local_state(cr->dd, state_global, &ems->s);
 +
 +        *f = NULL;
 +
 +        /* Distribute the charge groups over the nodes from the master node */
 +        dd_partition_system(fplog, ir->init_step, cr, TRUE, 1,
 +                            state_global, top_global, ir,
 +                            &ems->s, &ems->f, mdatoms, *top,
 +                            fr, vsite, NULL, constr,
 +                            nrnb, NULL, FALSE);
 +        dd_store_state(cr->dd, &ems->s);
 +
 +        /* A global force array is only allocated when nstfout is non-zero */
 +        if (ir->nstfout)
 +        {
 +            snew(*f_global, top_global->natoms);
 +        }
 +        else
 +        {
 +            *f_global = NULL;
 +        }
 +        *graph = NULL;
 +    }
 +    else
 +    {
 +        snew(*f, top_global->natoms);
 +
 +        /* Just copy the state */
 +        /* The struct copy shares the pointers of state_global; x is then
 +         * replaced by a separate array so the coordinates are not shared.
 +         */
 +        ems->s = *state_global;
 +        snew(ems->s.x, ems->s.nalloc);
 +        snew(ems->f, ems->s.nalloc);
 +        for (i = 0; i < state_global->natoms; i++)
 +        {
 +            copy_rvec(state_global->x[i], ems->s.x[i]);
 +        }
 +        copy_mat(state_global->box, ems->s.box);
 +
 +        if (PAR(cr) && ir->eI != eiNM)
 +        {
 +            /* Initialize the particle decomposition and split the topology */
 +            *top = split_system(fplog, top_global, ir, cr);
 +
 +            pd_cg_range(cr, &fr->cg0, &fr->hcg);
 +        }
 +        else
 +        {
 +            *top = gmx_mtop_generate_local_top(top_global, ir);
 +        }
 +        /* Without domain decomposition *f doubles as the global force array */
 +        *f_global = *f;
 +
 +        forcerec_set_excl_load(fr, *top, cr);
 +
 +        init_bonded_thread_force_reduction(fr, &(*top)->idef);
 +
 +        /* A graph is only made with PBC and when fr->bMolPBC is not set */
 +        if (ir->ePBC != epbcNONE && !fr->bMolPBC)
 +        {
 +            *graph = mk_graph(fplog, &((*top)->idef), 0, top_global->natoms, FALSE, FALSE);
 +        }
 +        else
 +        {
 +            *graph = NULL;
 +        }
 +
 +        if (PARTDECOMP(cr))
 +        {
 +            pd_at_range(cr, &start, &homenr);
 +            /* pd_at_range apparently returns the end of the range in
 +             * homenr; subtracting start turns it into a count.
 +             */
 +            homenr -= start;
 +        }
 +        else
 +        {
 +            start  = 0;
 +            homenr = top_global->natoms;
 +        }
 +        atoms2md(top_global, ir, 0, NULL, start, homenr, mdatoms);
 +        update_mdatoms(mdatoms, state_global->lambda[efptFEP]);
 +
 +        if (vsite)
 +        {
 +            set_vsite_top(vsite, *top, mdatoms, cr);
 +        }
 +    }
 +
 +    if (constr)
 +    {
 +        /* SHAKE is rejected as soon as the topology holds any F_CONSTR interaction */
 +        if (ir->eConstrAlg == econtSHAKE &&
 +            gmx_mtop_ftype_count(top_global, F_CONSTR) > 0)
 +        {
 +            gmx_fatal(FARGS, "Can not do energy minimization with %s, use %s\n",
 +                      econstr_names[econtSHAKE], econstr_names[econtLINCS]);
 +        }
 +
 +        if (!DOMAINDECOMP(cr))
 +        {
 +            set_constraints(constr, *top, ir, mdatoms, cr);
 +        }
 +
 +        if (!ir->bContinuation)
 +        {
 +            /* Constrain the starting coordinates */
-                       ems->s.lambda[efptFEP], &dvdlambda,
++            dvdl_constr = 0;
 +            constrain(PAR(cr) ? NULL : fplog, TRUE, TRUE, constr, &(*top)->idef,
 +                      ir, NULL, cr, -1, 0, mdatoms,
 +                      ems->s.x, ems->s.x, NULL, fr->bMolPBC, ems->s.box,
-     real     dvdlambda;
++                      ems->s.lambda[efptFEP], &dvdl_constr,
 +                      NULL, NULL, nrnb, econqCoord, FALSE, 0, 0);
 +        }
 +    }
 +
 +    if (PAR(cr))
 +    {
 +        *gstat = global_stat_init(ir);
 +    }
 +
 +    *outf = init_mdoutf(nfile, fnm, 0, cr, ir, NULL);
 +
 +    snew(*enerd, 1);
 +    init_enerdata(top_global->groups.grps[egcENER].nr, ir->fepvals->n_lambda,
 +                  *enerd);
 +
 +    if (mdebin != NULL)
 +    {
 +        /* Init bin for energy stuff */
 +        *mdebin = init_mdebin((*outf)->fp_ene, top_global, ir, NULL);
 +    }
 +
 +    clear_rvec(mu_tot);
 +    calc_shifts(ems->s.box, fr->shift_vec);
 +}
 +
 +static void finish_em(FILE *fplog, t_commrec *cr, gmx_mdoutf_t *outf,
 +                      gmx_runtime_t *runtime, gmx_wallcycle_t wcycle)
 +{
 +    if (!(cr->duty & DUTY_PME))
 +    {
 +        /* Tell the PME only node to finish */
 +        gmx_pme_send_finish(cr);
 +    }
 +
 +    done_mdoutf(outf);
 +
 +    em_time_end(fplog, cr, runtime, wcycle);
 +}
 +
 +static void swap_em_state(em_state_t *ems1, em_state_t *ems2)
 +{
 +    em_state_t tmp;
 +
 +    tmp   = *ems1;
 +    *ems1 = *ems2;
 +    *ems2 = tmp;
 +}
 +
 +static void copy_em_coords(em_state_t *ems, t_state *state)
 +{
 +    int i;
 +
 +    for (i = 0; (i < state->natoms); i++)
 +    {
 +        copy_rvec(ems->s.x[i], state->x[i]);
 +    }
 +}
 +
 +static void write_em_traj(FILE *fplog, t_commrec *cr,
 +                          gmx_mdoutf_t *outf,
 +                          gmx_bool bX, gmx_bool bF, const char *confout,
 +                          gmx_mtop_t *top_global,
 +                          t_inputrec *ir, gmx_large_int_t step,
 +                          em_state_t *state,
 +                          t_state *state_global, rvec *f_global)
 +{
 +    int mdof_flags;
 +
 +    if ((bX || bF || confout != NULL) && !DOMAINDECOMP(cr))
 +    {
 +        copy_em_coords(state, state_global);
 +        f_global = state->f;
 +    }
 +
 +    mdof_flags = 0;
 +    if (bX)
 +    {
 +        mdof_flags |= MDOF_X;
 +    }
 +    if (bF)
 +    {
 +        mdof_flags |= MDOF_F;
 +    }
 +    write_traj(fplog, cr, outf, mdof_flags,
 +               top_global, step, (double)step,
 +               &state->s, state_global, state->f, f_global, NULL, NULL);
 +
 +    if (confout != NULL && MASTER(cr))
 +    {
 +        if (ir->ePBC != epbcNONE && !ir->bPeriodicMols && DOMAINDECOMP(cr))
 +        {
 +            /* Make molecules whole only for confout writing */
 +            do_pbc_mtop(fplog, ir->ePBC, state_global->box, top_global,
 +                        state_global->x);
 +        }
 +
 +        write_sto_conf_mtop(confout,
 +                            *top_global->name, top_global,
 +                            state_global->x, NULL, ir->ePBC, state_global->box);
 +    }
 +}
 +
 +static void do_em_step(t_commrec *cr, t_inputrec *ir, t_mdatoms *md,
 +                       gmx_bool bMolPBC,
 +                       em_state_t *ems1, real a, rvec *f, em_state_t *ems2,
 +                       gmx_constr_t constr, gmx_localtop_t *top,
 +                       t_nrnb *nrnb, gmx_wallcycle_t wcycle,
 +                       gmx_large_int_t count)
 +
 +{
 +    t_state *s1, *s2;
 +    int      i;
 +    int      start, end;
 +    rvec    *x1, *x2;
-         dvdlambda = 0;
++    real     dvdl_constr;
 +
 +    s1 = &ems1->s;
 +    s2 = &ems2->s;
 +
 +    if (DOMAINDECOMP(cr) && s1->ddp_count != cr->dd->ddp_count)
 +    {
 +        gmx_incons("state mismatch in do_em_step");
 +    }
 +
 +    s2->flags = s1->flags;
 +
 +    if (s2->nalloc != s1->nalloc)
 +    {
 +        s2->nalloc = s1->nalloc;
 +        srenew(s2->x, s1->nalloc);
 +        srenew(ems2->f,  s1->nalloc);
 +        if (s2->flags & (1<<estCGP))
 +        {
 +            srenew(s2->cg_p,  s1->nalloc);
 +        }
 +    }
 +
 +    s2->natoms = s1->natoms;
 +    copy_mat(s1->box, s2->box);
 +    /* Copy free energy state */
 +    for (i = 0; i < efptNR; i++)
 +    {
 +        s2->lambda[i] = s1->lambda[i];
 +    }
 +    copy_mat(s1->box, s2->box);
 +
 +    start = md->start;
 +    end   = md->start + md->homenr;
 +
 +    x1 = s1->x;
 +    x2 = s2->x;
 +
 +#pragma omp parallel num_threads(gmx_omp_nthreads_get(emntUpdate))
 +    {
 +        int gf, i, m;
 +
 +        gf = 0;
 +#pragma omp for schedule(static) nowait
 +        for (i = start; i < end; i++)
 +        {
 +            if (md->cFREEZE)
 +            {
 +                gf = md->cFREEZE[i];
 +            }
 +            for (m = 0; m < DIM; m++)
 +            {
 +                if (ir->opts.nFreeze[gf][m])
 +                {
 +                    x2[i][m] = x1[i][m];
 +                }
 +                else
 +                {
 +                    x2[i][m] = x1[i][m] + a*f[i][m];
 +                }
 +            }
 +        }
 +
 +        if (s2->flags & (1<<estCGP))
 +        {
 +            /* Copy the CG p vector */
 +            x1 = s1->cg_p;
 +            x2 = s2->cg_p;
 +#pragma omp for schedule(static) nowait
 +            for (i = start; i < end; i++)
 +            {
 +                copy_rvec(x1[i], x2[i]);
 +            }
 +        }
 +
 +        if (DOMAINDECOMP(cr))
 +        {
 +            s2->ddp_count = s1->ddp_count;
 +            if (s2->cg_gl_nalloc < s1->cg_gl_nalloc)
 +            {
 +#pragma omp barrier
 +                s2->cg_gl_nalloc = s1->cg_gl_nalloc;
 +                srenew(s2->cg_gl, s2->cg_gl_nalloc);
 +#pragma omp barrier
 +            }
 +            s2->ncg_gl = s1->ncg_gl;
 +#pragma omp for schedule(static) nowait
 +            for (i = 0; i < s2->ncg_gl; i++)
 +            {
 +                s2->cg_gl[i] = s1->cg_gl[i];
 +            }
 +            s2->ddp_count_cg_gl = s1->ddp_count_cg_gl;
 +        }
 +    }
 +
 +    if (constr)
 +    {
 +        wallcycle_start(wcycle, ewcCONSTR);
-                   s2->lambda[efptBONDED], &dvdlambda,
++        dvdl_constr = 0;
 +        constrain(NULL, TRUE, TRUE, constr, &top->idef,
 +                  ir, NULL, cr, count, 0, md,
 +                  s1->x, s2->x, NULL, bMolPBC, s2->box,
-     real     dvdlambda, prescorr, enercorr, dvdlcorr;
++                  s2->lambda[efptBONDED], &dvdl_constr,
 +                  NULL, NULL, nrnb, econqCoord, FALSE, 0, 0);
 +        wallcycle_stop(wcycle, ewcCONSTR);
 +    }
 +}
 +
 +static void em_dd_partition_system(FILE *fplog, int step, t_commrec *cr,
 +                                   gmx_mtop_t *top_global, t_inputrec *ir,
 +                                   em_state_t *ems, gmx_localtop_t *top,
 +                                   t_mdatoms *mdatoms, t_forcerec *fr,
 +                                   gmx_vsite_t *vsite, gmx_constr_t constr,
 +                                   t_nrnb *nrnb, gmx_wallcycle_t wcycle)
 +{
 +    /* Repartition the domain decomposition for the EM state ems:
 +     * dd_partition_system() is handed the state (&ems->s) and its force
 +     * array (&ems->f), after which dd_store_state() is called on that
 +     * state. Both calls are bracketed by wallcycle_start/stop for the
 +     * ewcDOMDEC counter.
 +     */
 +    wallcycle_start(wcycle, ewcDOMDEC);
 +    dd_partition_system(fplog, step, cr, FALSE, 1,
 +                        NULL, top_global, ir,
 +                        &ems->s, &ems->f,
 +                        mdatoms, top, fr, vsite, NULL, constr,
 +                        nrnb, wcycle, FALSE);
 +    dd_store_state(cr->dd, &ems->s);
 +    wallcycle_stop(wcycle, ewcDOMDEC);
 +}
 +
 +static void evaluate_energy(FILE *fplog, gmx_bool bVerbose, t_commrec *cr,
 +                            t_state *state_global, gmx_mtop_t *top_global,
 +                            em_state_t *ems, gmx_localtop_t *top,
 +                            t_inputrec *inputrec,
 +                            t_nrnb *nrnb, gmx_wallcycle_t wcycle,
 +                            gmx_global_stat_t gstat,
 +                            gmx_vsite_t *vsite, gmx_constr_t constr,
 +                            t_fcdata *fcd,
 +                            t_graph *graph, t_mdatoms *mdatoms,
 +                            t_forcerec *fr, rvec mu_tot,
 +                            gmx_enerdata_t *enerd, tensor vir, tensor pres,
 +                            gmx_large_int_t count, gmx_bool bFirst)
 +{   /* Evaluate forces and energies for the EM state ems.
 +     *
 +     * Steps, in order: decide whether neighbor searching is needed (bNS),
 +     * construct virtual sites, repartition the domain decomposition when
 +     * bNS is set, call do_force(), call global_stat() in parallel runs
 +     * (except for eiNM), add the dispersion correction terms, store the
 +     * potential energy in ems->epot, project the constraint components out
 +     * of the force, compute the pressure, call sum_dhdl() and, for energy
 +     * minimization integrators, call get_state_f_norm_max() on ems.
 +     *
 +     * Results are written to ems (forces, epot), enerd, vir and pres.
 +     * NOTE(review): bVerbose and state_global are not referenced in the body.
 +     */
 +    real     t;
 +    gmx_bool bNS;
 +    int      nabnsb;
 +    tensor   force_vir, shake_vir, ekin;
-         dvdlambda = 0;
++    real     dvdl_constr, prescorr, enercorr, dvdlcorr;
 +    real     terminate = 0;
 +
 +    /* Set the time to the initial time, the time does not change during EM */
 +    t = inputrec->init_t;
 +
 +    if (bFirst ||
 +        (DOMAINDECOMP(cr) && ems->s.ddp_count < cr->dd->ddp_count))
 +    {
 +        /* This is the first state or an old state used before the last ns */
 +        bNS = TRUE;
 +    }
 +    else
 +    {
 +        bNS = FALSE;
 +        if (inputrec->nstlist > 0)
 +        {
 +            bNS = TRUE; /* any positive nstlist requests a search on every call */
 +        }
 +        else if (inputrec->nstlist == -1)
 +        {
 +            nabnsb = natoms_beyond_ns_buffer(inputrec, fr, &top->cgs, NULL, ems->s.x);
 +            if (PAR(cr))
 +            {
 +                gmx_sumi(1, &nabnsb, cr);
 +            }
 +            bNS = (nabnsb > 0); /* search only when the count is non-zero */
 +        }
 +    }
 +
 +    if (vsite)
 +    {
 +        construct_vsites(fplog, vsite, ems->s.x, nrnb, 1, NULL,
 +                         top->idef.iparams, top->idef.il,
 +                         fr->ePBC, fr->bMolPBC, graph, cr, ems->s.box);
 +    }
 +
 +    if (DOMAINDECOMP(cr))
 +    {
 +        if (bNS)
 +        {
 +            /* Repartition the domain decomposition */
 +            em_dd_partition_system(fplog, count, cr, top_global, inputrec,
 +                                   ems, top, mdatoms, fr, vsite, constr,
 +                                   nrnb, wcycle);
 +        }
 +    }
 +
 +    /* Calc force & energy on new trial position  */
 +    /* do_force always puts the charge groups in the box and shifts again
 +     * We do not unshift, so molecules are always whole in congrad.c
 +     */
 +    do_force(fplog, cr, inputrec,
 +             count, nrnb, wcycle, top, top_global, &top_global->groups,
 +             ems->s.box, ems->s.x, &ems->s.hist,
 +             ems->f, force_vir, mdatoms, enerd, fcd,
 +             ems->s.lambda, graph, fr, vsite, mu_tot, t, NULL, NULL, TRUE,
 +             GMX_FORCE_STATECHANGED | GMX_FORCE_ALLFORCES |
 +             GMX_FORCE_VIRIAL | GMX_FORCE_ENERGY |
 +             (bNS ? GMX_FORCE_NS | GMX_FORCE_DO_LR : 0));
 +
 +    /* Clear the shake virial and pressure; shake_vir is passed to
 +     * global_stat() and constrain() below, pres to calc_dispcorr() and
 +     * calc_pres().
 +     */
 +    clear_mat(shake_vir);
 +    clear_mat(pres);
 +
 +    /* Communicate stuff when parallel */
 +    if (PAR(cr) && inputrec->eI != eiNM)
 +    {
 +        wallcycle_start(wcycle, ewcMoveE);
 +
 +        global_stat(fplog, gstat, cr, enerd, force_vir, shake_vir, mu_tot,
 +                    inputrec, NULL, NULL, NULL, 1, &terminate,
 +                    top_global, &ems->s, FALSE,
 +                    CGLO_ENERGY |
 +                    CGLO_PRESSURE |
 +                    CGLO_CONSTRAINT |
 +                    CGLO_FIRSTITERATE);
 +
 +        wallcycle_stop(wcycle, ewcMoveE);
 +    }
 +
 +    /* Calculate long range corrections to pressure and energy */
 +    calc_dispcorr(fplog, inputrec, fr, count, top_global->natoms, ems->s.box, ems->s.lambda[efptVDW],
 +                  pres, force_vir, &prescorr, &enercorr, &dvdlcorr);
 +    enerd->term[F_DISPCORR] = enercorr;
 +    enerd->term[F_EPOT]    += enercorr;
 +    enerd->term[F_PRES]    += prescorr;
 +    enerd->term[F_DVDL]    += dvdlcorr;
 +
 +    ems->epot = enerd->term[F_EPOT]; /* potential energy including the correction added above */
 +
 +    if (constr)
 +    {
 +        /* Project out the constraint components of the force */
 +        wallcycle_start(wcycle, ewcCONSTR);
-                   ems->s.lambda[efptBONDED], &dvdlambda,
++        dvdl_constr = 0;
 +        constrain(NULL, FALSE, FALSE, constr, &top->idef,
 +                  inputrec, NULL, cr, count, 0, mdatoms,
 +                  ems->s.x, ems->f, ems->f, fr->bMolPBC, ems->s.box,
-             fprintf(fplog, sepdvdlformat, "Constraints", t, dvdlambda);
++                  ems->s.lambda[efptBONDED], &dvdl_constr,
 +                  NULL, &shake_vir, nrnb, econqForceDispl, FALSE, 0, 0);
 +        if (fr->bSepDVDL && fplog)
 +        {
-         enerd->term[F_DVDL_BONDED] += dvdlambda;
++            fprintf(fplog, sepdvdlformat, "Constraints", t, dvdl_constr);
 +        }
-     real              ustep, dvdlambda, fnormn;
++        enerd->term[F_DVDL_CONSTR] += dvdl_constr;
 +        m_add(force_vir, shake_vir, vir); /* total virial = force virial + constraint virial */
 +        wallcycle_stop(wcycle, ewcCONSTR);
 +    }
 +    else
 +    {
 +        copy_mat(force_vir, vir); /* no constraints: the virial is the force virial */
 +    }
 +
 +    clear_mat(ekin); /* ekin is cleared before being passed to calc_pres() */
 +    enerd->term[F_PRES] = /* overwrites the F_PRES value incremented by prescorr above */
 +        calc_pres(fr->ePBC, inputrec->nwall, ems->s.box, ekin, vir, pres);
 +
 +    sum_dhdl(enerd, ems->s.lambda, inputrec->fepvals);
 +
 +    if (EI_ENERGY_MINIMIZATION(inputrec->eI))
 +    {
 +        get_state_f_norm_max(cr, &(inputrec->opts), mdatoms, ems);
 +    }
 +}
 +
 +static double reorder_partsum(t_commrec *cr, t_grpopts *opts, t_mdatoms *mdatoms,
 +                              gmx_mtop_t *mtop,
 +                              em_state_t *s_min, em_state_t *s_b)
 +{
 +    rvec          *fm, *fb, *fmg;
 +    t_block       *cgs_gl;
 +    int            ncg, *cg_gl, *index, c, cg, i, a0, a1, a, gf, m;
 +    double         partsum;
 +    unsigned char *grpnrFREEZE;
 +
 +    if (debug)
 +    {
 +        fprintf(debug, "Doing reorder_partsum\n");
 +    }
 +
 +    fm = s_min->f;
 +    fb = s_b->f;
 +
 +    cgs_gl = dd_charge_groups_global(cr->dd);
 +    index  = cgs_gl->index;
 +
 +    /* Collect fm in a global vector fmg.
 +     * This conflicts with the spirit of domain decomposition,
 +     * but to fully optimize this a much more complicated algorithm is required.
 +     */
 +    snew(fmg, mtop->natoms);
 +
 +    ncg   = s_min->s.ncg_gl;
 +    cg_gl = s_min->s.cg_gl;
 +    i     = 0;
 +    for (c = 0; c < ncg; c++)
 +    {
 +        cg = cg_gl[c];
 +        a0 = index[cg];
 +        a1 = index[cg+1];
 +        for (a = a0; a < a1; a++)
 +        {
 +            copy_rvec(fm[i], fmg[a]);
 +            i++;
 +        }
 +    }
 +    gmx_sum(mtop->natoms*3, fmg[0], cr);
 +
 +    /* Now we will determine the part of the sum for the cgs in state s_b */
 +    ncg         = s_b->s.ncg_gl;
 +    cg_gl       = s_b->s.cg_gl;
 +    partsum     = 0;
 +    i           = 0;
 +    gf          = 0;
 +    grpnrFREEZE = mtop->groups.grpnr[egcFREEZE];
 +    for (c = 0; c < ncg; c++)
 +    {
 +        cg = cg_gl[c];
 +        a0 = index[cg];
 +        a1 = index[cg+1];
 +        for (a = a0; a < a1; a++)
 +        {
 +            if (mdatoms->cFREEZE && grpnrFREEZE)
 +            {
 +                gf = grpnrFREEZE[i];
 +            }
 +            for (m = 0; m < DIM; m++)
 +            {
 +                if (!opts->nFreeze[gf][m])
 +                {
 +                    partsum += (fb[i][m] - fmg[a][m])*fb[i][m];
 +                }
 +            }
 +            i++;
 +        }
 +    }
 +
 +    sfree(fmg);
 +
 +    return partsum;
 +}
 +
 +static real pr_beta(t_commrec *cr, t_grpopts *opts, t_mdatoms *mdatoms,
 +                    gmx_mtop_t *mtop,
 +                    em_state_t *s_min, em_state_t *s_b)
 +{
 +    rvec  *fm, *fb;
 +    double sum;
 +    int    gf, i, m;
 +
 +    /* This is just the classical Polak-Ribiere calculation of beta;
 +     * it looks a bit complicated since we take freeze groups into account,
 +     * and might have to sum it in parallel runs.
 +     */
 +
 +    if (!DOMAINDECOMP(cr) ||
 +        (s_min->s.ddp_count == cr->dd->ddp_count &&
 +         s_b->s.ddp_count   == cr->dd->ddp_count))
 +    {
 +        fm  = s_min->f;
 +        fb  = s_b->f;
 +        sum = 0;
 +        gf  = 0;
 +        /* This part of code can be incorrect with DD,
 +         * since the atom ordering in s_b and s_min might differ.
 +         */
 +        for (i = mdatoms->start; i < mdatoms->start+mdatoms->homenr; i++)
 +        {
 +            if (mdatoms->cFREEZE)
 +            {
 +                gf = mdatoms->cFREEZE[i];
 +            }
 +            for (m = 0; m < DIM; m++)
 +            {
 +                if (!opts->nFreeze[gf][m])
 +                {
 +                    sum += (fb[i][m] - fm[i][m])*fb[i][m];
 +                }
 +            }
 +        }
 +    }
 +    else
 +    {
 +        /* We need to reorder cgs while summing */
 +        sum = reorder_partsum(cr, opts, mdatoms, mtop, s_min, s_b);
 +    }
 +    if (PAR(cr))
 +    {
 +        gmx_sumd(1, &sum, cr);
 +    }
 +
 +    return sum/sqr(s_min->fnorm);
 +}
 +
 +double do_cg(FILE *fplog, t_commrec *cr,
 +             int nfile, const t_filenm fnm[],
 +             const output_env_t oenv, gmx_bool bVerbose, gmx_bool bCompact,
 +             int nstglobalcomm,
 +             gmx_vsite_t *vsite, gmx_constr_t constr,
 +             int stepout,
 +             t_inputrec *inputrec,
 +             gmx_mtop_t *top_global, t_fcdata *fcd,
 +             t_state *state_global,
 +             t_mdatoms *mdatoms,
 +             t_nrnb *nrnb, gmx_wallcycle_t wcycle,
 +             gmx_edsam_t ed,
 +             t_forcerec *fr,
 +             int repl_ex_nst, int repl_ex_nex, int repl_ex_seed,
 +             gmx_membed_t membed,
 +             real cpt_period, real max_hours,
 +             const char *deviceOptions,
 +             unsigned long Flags,
 +             gmx_runtime_t *runtime)
 +{
 +    const char       *CG = "Polak-Ribiere Conjugate Gradients";
 +
 +    em_state_t       *s_min, *s_a, *s_b, *s_c;
 +    gmx_localtop_t   *top;
 +    gmx_enerdata_t   *enerd;
 +    rvec             *f;
 +    gmx_global_stat_t gstat;
 +    t_graph          *graph;
 +    rvec             *f_global, *p, *sf, *sfm;
 +    double            gpa, gpb, gpc, tmp, sum[2], minstep;
 +    real              fnormn;
 +    real              stepsize;
 +    real              a, b, c, beta = 0.0;
 +    real              epot_repl = 0;
 +    real              pnorm;
 +    t_mdebin         *mdebin;
 +    gmx_bool          converged, foundlower;
 +    rvec              mu_tot;
 +    gmx_bool          do_log = FALSE, do_ene = FALSE, do_x, do_f;
 +    tensor            vir, pres;
 +    int               number_steps, neval = 0, nstcg = inputrec->nstcgsteep;
 +    gmx_mdoutf_t     *outf;
 +    int               i, m, gf, step, nminstep;
 +    real              terminate = 0;
 +
 +    step = 0;
 +
 +    s_min = init_em_state();
 +    s_a   = init_em_state();
 +    s_b   = init_em_state();
 +    s_c   = init_em_state();
 +
 +    /* Init em and store the local state in s_min */
 +    init_em(fplog, CG, cr, inputrec,
 +            state_global, top_global, s_min, &top, &f, &f_global,
 +            nrnb, mu_tot, fr, &enerd, &graph, mdatoms, &gstat, vsite, constr,
 +            nfile, fnm, &outf, &mdebin);
 +
 +    /* Print to log file */
 +    print_em_start(fplog, cr, runtime, wcycle, CG);
 +
 +    /* Max number of steps */
 +    number_steps = inputrec->nsteps;
 +
 +    if (MASTER(cr))
 +    {
 +        sp_header(stderr, CG, inputrec->em_tol, number_steps);
 +    }
 +    if (fplog)
 +    {
 +        sp_header(fplog, CG, inputrec->em_tol, number_steps);
 +    }
 +
 +    /* Call the force routine and some auxiliary (neighboursearching etc.) */
 +    /* do_force always puts the charge groups in the box and shifts again
 +     * We do not unshift, so molecules are always whole in congrad.c
 +     */
 +    evaluate_energy(fplog, bVerbose, cr,
 +                    state_global, top_global, s_min, top,
 +                    inputrec, nrnb, wcycle, gstat,
 +                    vsite, constr, fcd, graph, mdatoms, fr,
 +                    mu_tot, enerd, vir, pres, -1, TRUE);
 +    where();
 +
 +    if (MASTER(cr))
 +    {
 +        /* Copy stuff to the energy bin for easy printing etc. */
 +        upd_mdebin(mdebin, FALSE, FALSE, (double)step,
 +                   mdatoms->tmass, enerd, &s_min->s, inputrec->fepvals, inputrec->expandedvals, s_min->s.box,
 +                   NULL, NULL, vir, pres, NULL, mu_tot, constr);
 +
 +        print_ebin_header(fplog, step, step, s_min->s.lambda[efptFEP]);
 +        print_ebin(outf->fp_ene, TRUE, FALSE, FALSE, fplog, step, step, eprNORMAL,
 +                   TRUE, mdebin, fcd, &(top_global->groups), &(inputrec->opts));
 +    }
 +    where();
 +
 +    /* Estimate/guess the initial stepsize */
 +    stepsize = inputrec->em_stepsize/s_min->fnorm;
 +
 +    if (MASTER(cr))
 +    {
 +        fprintf(stderr, "   F-max             = %12.5e on atom %d\n",
 +                s_min->fmax, s_min->a_fmax+1);
 +        fprintf(stderr, "   F-Norm            = %12.5e\n",
 +                s_min->fnorm/sqrt(state_global->natoms));
 +        fprintf(stderr, "\n");
 +        /* and copy to the log file too... */
 +        fprintf(fplog, "   F-max             = %12.5e on atom %d\n",
 +                s_min->fmax, s_min->a_fmax+1);
 +        fprintf(fplog, "   F-Norm            = %12.5e\n",
 +                s_min->fnorm/sqrt(state_global->natoms));
 +        fprintf(fplog, "\n");
 +    }
 +    /* Start the loop over CG steps.
 +     * Each successful step is counted, and we continue until
 +     * we either converge or reach the max number of steps.
 +     */
 +    converged = FALSE;
 +    for (step = 0; (number_steps < 0 || (number_steps >= 0 && step <= number_steps)) && !converged; step++)
 +    {
 +
 +        /* start taking steps in a new direction
 +         * First time we enter the routine, beta=0, and the direction is
 +         * simply the negative gradient.
 +         */
 +
 +        /* Calculate the new direction in p, and the gradient in this direction, gpa */
 +        p   = s_min->s.cg_p;
 +        sf  = s_min->f;
 +        gpa = 0;
 +        gf  = 0;
 +        for (i = mdatoms->start; i < mdatoms->start+mdatoms->homenr; i++)
 +        {
 +            if (mdatoms->cFREEZE)
 +            {
 +                gf = mdatoms->cFREEZE[i];
 +            }
 +            for (m = 0; m < DIM; m++)
 +            {
 +                if (!inputrec->opts.nFreeze[gf][m])
 +                {
 +                    p[i][m] = sf[i][m] + beta*p[i][m];
 +                    gpa    -= p[i][m]*sf[i][m];
 +                    /* f is negative gradient, thus the sign */
 +                }
 +                else
 +                {
 +                    p[i][m] = 0;
 +                }
 +            }
 +        }
 +
 +        /* Sum the gradient along the line across CPUs */
 +        if (PAR(cr))
 +        {
 +            gmx_sumd(1, &gpa, cr);
 +        }
 +
 +        /* Calculate the norm of the search vector */
 +        get_f_norm_max(cr, &(inputrec->opts), mdatoms, p, &pnorm, NULL, NULL);
 +
 +        /* Just in case stepsize reaches zero due to numerical precision... */
 +        if (stepsize <= 0)
 +        {
 +            stepsize = inputrec->em_stepsize/pnorm;
 +        }
 +
 +        /*
 +         * Double check the value of the derivative in the search direction.
 +         * If it is positive it must be due to the old information in the
 +         * CG formula, so just remove that and start over with beta=0.
 +         * This corresponds to a steepest descent step.
 +         */
 +        if (gpa > 0)
 +        {
 +            beta = 0;
 +            step--;   /* Don't count this step since we are restarting */
 +            continue; /* Go back to the beginning of the big for-loop */
 +        }
 +
 +        /* Calculate minimum allowed stepsize, before the average (norm)
 +         * relative change in coordinate is smaller than precision
 +         */
 +        minstep = 0;
 +        for (i = mdatoms->start; i < mdatoms->start+mdatoms->homenr; i++)
 +        {
 +            for (m = 0; m < DIM; m++)
 +            {
 +                tmp = fabs(s_min->s.x[i][m]);
 +                if (tmp < 1.0)
 +                {
 +                    tmp = 1.0;
 +                }
 +                tmp      = p[i][m]/tmp;
 +                minstep += tmp*tmp;
 +            }
 +        }
 +        /* Add up from all CPUs */
 +        if (PAR(cr))
 +        {
 +            gmx_sumd(1, &minstep, cr);
 +        }
 +
 +        minstep = GMX_REAL_EPS/sqrt(minstep/(3*state_global->natoms));
 +
 +        if (stepsize < minstep)
 +        {
 +            converged = TRUE;
 +            break;
 +        }
 +
 +        /* Write coordinates if necessary */
 +        do_x = do_per_step(step, inputrec->nstxout);
 +        do_f = do_per_step(step, inputrec->nstfout);
 +
 +        write_em_traj(fplog, cr, outf, do_x, do_f, NULL,
 +                      top_global, inputrec, step,
 +                      s_min, state_global, f_global);
 +
 +        /* Take a step downhill.
 +         * In theory, we should minimize the function along this direction.
 +         * That is quite possible, but it turns out to take 5-10 function evaluations
 +         * for each line. However, we don't really need to find the exact minimum -
 +         * it is much better to start a new CG step in a modified direction as soon
 +         * as we are close to it. This will save a lot of energy evaluations.
 +         *
 +         * In practice, we just try to take a single step.
 +         * If it worked (i.e. lowered the energy), we increase the stepsize but
 +         * then continue straight to the next CG step without trying to find any minimum.
 +         * If it didn't work (higher energy), there must be a minimum somewhere between
 +         * the old position and the new one.
 +         *
 +         * Due to the finite numerical accuracy, it turns out that it is a good idea
 +         * to even accept a SMALL increase in energy, if the derivative is still downhill.
 +         * This leads to lower final energies in the tests I've done. / Erik
 +         */
 +        s_a->epot = s_min->epot;
 +        a         = 0.0;
 +        c         = a + stepsize; /* reference position along line is zero */
 +
 +        if (DOMAINDECOMP(cr) && s_min->s.ddp_count < cr->dd->ddp_count)
 +        {
 +            em_dd_partition_system(fplog, step, cr, top_global, inputrec,
 +                                   s_min, top, mdatoms, fr, vsite, constr,
 +                                   nrnb, wcycle);
 +        }
 +
 +        /* Take a trial step (new coords in s_c) */
 +        do_em_step(cr, inputrec, mdatoms, fr->bMolPBC, s_min, c, s_min->s.cg_p, s_c,
 +                   constr, top, nrnb, wcycle, -1);
 +
 +        neval++;
 +        /* Calculate energy for the trial step */
 +        evaluate_energy(fplog, bVerbose, cr,
 +                        state_global, top_global, s_c, top,
 +                        inputrec, nrnb, wcycle, gstat,
 +                        vsite, constr, fcd, graph, mdatoms, fr,
 +                        mu_tot, enerd, vir, pres, -1, FALSE);
 +
 +        /* Calc derivative along line */
 +        p   = s_c->s.cg_p;
 +        sf  = s_c->f;
 +        gpc = 0;
 +        for (i = mdatoms->start; i < mdatoms->start+mdatoms->homenr; i++)
 +        {
 +            for (m = 0; m < DIM; m++)
 +            {
 +                gpc -= p[i][m]*sf[i][m]; /* f is negative gradient, thus the sign */
 +            }
 +        }
 +        /* Sum the gradient along the line across CPUs */
 +        if (PAR(cr))
 +        {
 +            gmx_sumd(1, &gpc, cr);
 +        }
 +
 +        /* This is the max amount of increase in energy we tolerate */
 +        tmp = sqrt(GMX_REAL_EPS)*fabs(s_a->epot);
 +
 +        /* Accept the step if the energy is lower, or if it is not significantly higher
 +         * and the line derivative is still negative.
 +         */
 +        if (s_c->epot < s_a->epot || (gpc < 0 && s_c->epot < (s_a->epot + tmp)))
 +        {
 +            foundlower = TRUE;
 +            /* Great, we found a better energy. Increase step for next iteration
 +             * if we are still going down, decrease it otherwise
 +             */
 +            if (gpc < 0)
 +            {
 +                stepsize *= 1.618034; /* The golden section */
 +            }
 +            else
 +            {
 +                stepsize *= 0.618034; /* 1/golden section */
 +            }
 +        }
 +        else
 +        {
 +            /* New energy is the same or higher. We will have to do some work
 +             * to find a smaller value in the interval. Take smaller step next time!
 +             */
 +            foundlower = FALSE;
 +            stepsize  *= 0.618034;
 +        }
 +
 +
 +
 +
 +        /* OK, if we didn't find a lower value we will have to locate one now - there must
 +         * be one in the interval [a=0,c].
 +         * The same thing is valid here, though: Don't spend dozens of iterations to find
 +         * the line minimum. We try to interpolate based on the derivative at the endpoints,
 +         * and only continue until we find a lower value. In most cases this means 1-2 iterations.
 +         *
 +         * I also have a safeguard for potentially really pathological functions so we never
 +         * take more than 20 steps before we give up ...
 +         *
 +         * If we already found a lower value we just skip this step and continue to the update.
 +         */
 +        if (!foundlower)
 +        {
 +            nminstep = 0;
 +
 +            do
 +            {
 +                /* Select a new trial point.
 +                 * If the derivatives at points a & c have different sign we interpolate to zero,
 +                 * otherwise just do a bisection.
 +                 */
 +                if (gpa < 0 && gpc > 0)
 +                {
 +                    b = a + gpa*(a-c)/(gpc-gpa);
 +                }
 +                else
 +                {
 +                    b = 0.5*(a+c);
 +                }
 +
 +                /* safeguard if interpolation close to machine accuracy causes errors:
 +                 * never go outside the interval
 +                 */
 +                if (b <= a || b >= c)
 +                {
 +                    b = 0.5*(a+c);
 +                }
 +
 +                if (DOMAINDECOMP(cr) && s_min->s.ddp_count != cr->dd->ddp_count)
 +                {
 +                    /* Reload the old state */
 +                    em_dd_partition_system(fplog, -1, cr, top_global, inputrec,
 +                                           s_min, top, mdatoms, fr, vsite, constr,
 +                                           nrnb, wcycle);
 +                }
 +
 +                /* Take a trial step to this new point - new coords in s_b */
 +                do_em_step(cr, inputrec, mdatoms, fr->bMolPBC, s_min, b, s_min->s.cg_p, s_b,
 +                           constr, top, nrnb, wcycle, -1);
 +
 +                neval++;
 +                /* Calculate energy for the trial step */
 +                evaluate_energy(fplog, bVerbose, cr,
 +                                state_global, top_global, s_b, top,
 +                                inputrec, nrnb, wcycle, gstat,
 +                                vsite, constr, fcd, graph, mdatoms, fr,
 +                                mu_tot, enerd, vir, pres, -1, FALSE);
 +
 +                /* p does not change within a step, but since the domain decomposition
 +                 * might change, we have to use cg_p of s_b here.
 +                 */
 +                p   = s_b->s.cg_p;
 +                sf  = s_b->f;
 +                gpb = 0;
 +                for (i = mdatoms->start; i < mdatoms->start+mdatoms->homenr; i++)
 +                {
 +                    for (m = 0; m < DIM; m++)
 +                    {
 +                        gpb -= p[i][m]*sf[i][m]; /* f is negative gradient, thus the sign */
 +                    }
 +                }
 +                /* Sum the gradient along the line across CPUs */
 +                if (PAR(cr))
 +                {
 +                    gmx_sumd(1, &gpb, cr);
 +                }
 +
 +                if (debug)
 +                {
 +                    fprintf(debug, "CGE: EpotA %f EpotB %f EpotC %f gpb %f\n",
 +                            s_a->epot, s_b->epot, s_c->epot, gpb);
 +                }
 +
 +                epot_repl = s_b->epot;
 +
 +                /* Keep one of the intervals based on the value of the derivative at the new point */
 +                if (gpb > 0)
 +                {
 +                    /* Replace c endpoint with b */
 +                    swap_em_state(s_b, s_c);
 +                    c   = b;
 +                    gpc = gpb;
 +                }
 +                else
 +                {
 +                    /* Replace a endpoint with b */
 +                    swap_em_state(s_b, s_a);
 +                    a   = b;
 +                    gpa = gpb;
 +                }
 +
 +                /*
 +                 * Stop search as soon as we find a value smaller than the endpoints.
 +                 * Never run more than 20 steps, no matter what.
 +                 */
 +                nminstep++;
 +            }
 +            while ((epot_repl > s_a->epot || epot_repl > s_c->epot) &&
 +                   (nminstep < 20));
 +
 +            if (fabs(epot_repl - s_min->epot) < fabs(s_min->epot)*GMX_REAL_EPS ||
 +                nminstep >= 20)
 +            {
 +                /* OK. We couldn't find a significantly lower energy.
 +                 * If beta==0 this was steepest descent, and then we give up.
 +                 * If not, set beta=0 and restart with steepest descent before quitting.
 +                 */
 +                if (beta == 0.0)
 +                {
 +                    /* Converged */
 +                    converged = TRUE;
 +                    break;
 +                }
 +                else
 +                {
 +                    /* Reset memory before giving up */
 +                    beta = 0.0;
 +                    continue;
 +                }
 +            }
 +
 +            /* Select min energy state of A & C, put the best in B.
 +             */
 +            if (s_c->epot < s_a->epot)
 +            {
 +                if (debug)
 +                {
 +                    fprintf(debug, "CGE: C (%f) is lower than A (%f), moving C to B\n",
 +                            s_c->epot, s_a->epot);
 +                }
 +                swap_em_state(s_b, s_c);
 +                gpb = gpc;
 +                b   = c;
 +            }
 +            else
 +            {
 +                if (debug)
 +                {
 +                    fprintf(debug, "CGE: A (%f) is lower than C (%f), moving A to B\n",
 +                            s_a->epot, s_c->epot);
 +                }
 +                swap_em_state(s_b, s_a);
 +                gpb = gpa;
 +                b   = a;
 +            }
 +
 +        }
 +        else
 +        {
 +            if (debug)
 +            {
 +                fprintf(debug, "CGE: Found a lower energy %f, moving C to B\n",
 +                        s_c->epot);
 +            }
 +            swap_em_state(s_b, s_c);
 +            gpb = gpc;
 +            b   = c;
 +        }
 +
 +        /* new search direction */
 +        /* beta = 0 means forget all memory and restart with steepest descents. */
 +        if (nstcg && ((step % nstcg) == 0))
 +        {
 +            beta = 0.0;
 +        }
 +        else
 +        {
 +            /* s_min->fnorm cannot be zero, because then we would have converged
 +             * and broken out.
 +             */
 +
 +            /* Polak-Ribiere update.
 +             * Change to fnorm2/fnorm2_old for Fletcher-Reeves
 +             */
 +            beta = pr_beta(cr, &inputrec->opts, mdatoms, top_global, s_min, s_b);
 +        }
 +        /* Limit beta to prevent oscillations */
 +        if (fabs(beta) > 5.0)
 +        {
 +            beta = 0.0;
 +        }
 +
 +
 +        /* update positions */
 +        swap_em_state(s_min, s_b);
 +        gpa = gpb;
 +
 +        /* Print it if necessary */
 +        if (MASTER(cr))
 +        {
 +            if (bVerbose)
 +            {
 +                fprintf(stderr, "\rStep %d, Epot=%12.6e, Fnorm=%9.3e, Fmax=%9.3e (atom %d)\n",
 +                        step, s_min->epot, s_min->fnorm/sqrt(state_global->natoms),
 +                        s_min->fmax, s_min->a_fmax+1);
 +            }
 +            /* Store the new (lower) energies */
 +            upd_mdebin(mdebin, FALSE, FALSE, (double)step,
 +                       mdatoms->tmass, enerd, &s_min->s, inputrec->fepvals, inputrec->expandedvals, s_min->s.box,
 +                       NULL, NULL, vir, pres, NULL, mu_tot, constr);
 +
 +            do_log = do_per_step(step, inputrec->nstlog);
 +            do_ene = do_per_step(step, inputrec->nstenergy);
 +            if (do_log)
 +            {
 +                print_ebin_header(fplog, step, step, s_min->s.lambda[efptFEP]);
 +            }
 +            print_ebin(outf->fp_ene, do_ene, FALSE, FALSE,
 +                       do_log ? fplog : NULL, step, step, eprNORMAL,
 +                       TRUE, mdebin, fcd, &(top_global->groups), &(inputrec->opts));
 +        }
 +
 +        /* Stop when the maximum force lies below tolerance.
 +         * If we have reached machine precision, converged is already set to true.
 +         */
 +        converged = converged || (s_min->fmax < inputrec->em_tol);
 +
 +    } /* End of the loop */
 +
 +    if (converged)
 +    {
 +        step--; /* we never took that last step in this case */
 +
 +    }
 +    if (s_min->fmax > inputrec->em_tol)
 +    {
 +        if (MASTER(cr))
 +        {
 +            warn_step(stderr, inputrec->em_tol, step-1 == number_steps, FALSE);
 +            warn_step(fplog, inputrec->em_tol, step-1 == number_steps, FALSE);
 +        }
 +        converged = FALSE;
 +    }
 +
 +    if (MASTER(cr))
 +    {
 +        /* If we printed energy and/or logfile last step (which was the last step)
 +         * we don't have to do it again, but otherwise print the final values.
 +         */
 +        if (!do_log)
 +        {
 +            /* Write final value to log since we didn't do anything the last step */
 +            print_ebin_header(fplog, step, step, s_min->s.lambda[efptFEP]);
 +        }
 +        if (!do_ene || !do_log)
 +        {
 +            /* Write final energy file entries */
 +            print_ebin(outf->fp_ene, !do_ene, FALSE, FALSE,
 +                       !do_log ? fplog : NULL, step, step, eprNORMAL,
 +                       TRUE, mdebin, fcd, &(top_global->groups), &(inputrec->opts));
 +        }
 +    }
 +
 +    /* Print some stuff... */
 +    if (MASTER(cr))
 +    {
 +        fprintf(stderr, "\nwriting lowest energy coordinates.\n");
 +    }
 +
 +    /* IMPORTANT!
 +     * For accurate normal mode calculation it is imperative that we
 +     * store the last conformation into the full precision binary trajectory.
 +     *
 +     * However, we should only do it if we did NOT already write this step
 +     * above (which we did if do_x or do_f was true).
 +     */
 +    do_x = !do_per_step(step, inputrec->nstxout);
 +    do_f = (inputrec->nstfout > 0 && !do_per_step(step, inputrec->nstfout));
 +
 +    write_em_traj(fplog, cr, outf, do_x, do_f, ftp2fn(efSTO, nfile, fnm),
 +                  top_global, inputrec, step,
 +                  s_min, state_global, f_global);
 +
 +    fnormn = s_min->fnorm/sqrt(state_global->natoms);
 +
 +    if (MASTER(cr))
 +    {
 +        print_converged(stderr, CG, inputrec->em_tol, step, converged, number_steps,
 +                        s_min->epot, s_min->fmax, s_min->a_fmax, fnormn);
 +        print_converged(fplog, CG, inputrec->em_tol, step, converged, number_steps,
 +                        s_min->epot, s_min->fmax, s_min->a_fmax, fnormn);
 +
 +        fprintf(fplog, "\nPerformed %d energy evaluations in total.\n", neval);
 +    }
 +
 +    finish_em(fplog, cr, outf, runtime, wcycle);
 +
 +    /* To print the actual number of steps we needed somewhere */
 +    runtime->nsteps_done = step;
 +
 +    return 0;
 +} /* That's all folks */
 +
 +
 +double do_lbfgs(FILE *fplog, t_commrec *cr,
 +                int nfile, const t_filenm fnm[],
 +                const output_env_t oenv, gmx_bool bVerbose, gmx_bool bCompact,
 +                int nstglobalcomm,
 +                gmx_vsite_t *vsite, gmx_constr_t constr,
 +                int stepout,
 +                t_inputrec *inputrec,
 +                gmx_mtop_t *top_global, t_fcdata *fcd,
 +                t_state *state,
 +                t_mdatoms *mdatoms,
 +                t_nrnb *nrnb, gmx_wallcycle_t wcycle,
 +                gmx_edsam_t ed,
 +                t_forcerec *fr,
 +                int repl_ex_nst, int repl_ex_nex, int repl_ex_seed,
 +                gmx_membed_t membed,
 +                real cpt_period, real max_hours,
 +                const char *deviceOptions,
 +                unsigned long Flags,
 +                gmx_runtime_t *runtime)
 +{
 +    static const char *LBFGS = "Low-Memory BFGS Minimizer";
 +    em_state_t         ems;
 +    gmx_localtop_t    *top;
 +    gmx_enerdata_t    *enerd;
 +    rvec              *f;
 +    gmx_global_stat_t  gstat;
 +    t_graph           *graph;
 +    rvec              *f_global;
 +    int                ncorr, nmaxcorr, point, cp, neval, nminstep;
 +    double             stepsize, gpa, gpb, gpc, tmp, minstep;
 +    real              *rho, *alpha, *ff, *xx, *p, *s, *lastx, *lastf, **dx, **dg;
 +    real              *xa, *xb, *xc, *fa, *fb, *fc, *xtmp, *ftmp;
 +    real               a, b, c, maxdelta, delta;
 +    real               diag, Epot0, Epot, EpotA, EpotB, EpotC;
 +    real               dgdx, dgdg, sq, yr, beta;
 +    t_mdebin          *mdebin;
 +    gmx_bool           converged, first;
 +    rvec               mu_tot;
 +    real               fnorm, fmax;
 +    gmx_bool           do_log, do_ene, do_x, do_f, foundlower, *frozen;
 +    tensor             vir, pres;
 +    int                start, end, number_steps;
 +    gmx_mdoutf_t      *outf;
 +    int                i, k, m, n, nfmax, gf, step;
 +    int                mdof_flags;
 +    /* not used */
 +    real               terminate;
 +
 +    if (PAR(cr))
 +    {
 +        gmx_fatal(FARGS, "Cannot do parallel L-BFGS Minimization - yet.\n");
 +    }
 +
 +    if (NULL != constr)
 +    {
 +        gmx_fatal(FARGS, "The combination of constraints and L-BFGS minimization is not implemented. Either do not use constraints, or use another minimizer (e.g. steepest descent).");
 +    }
 +
 +    n        = 3*state->natoms;
 +    nmaxcorr = inputrec->nbfgscorr;
 +
 +    /* Allocate memory */
 +    /* Use pointers to real so we dont have to loop over both atoms and
 +     * dimensions all the time...
 +     * x/f are allocated as rvec *, so make new x0/f0 pointers-to-real
 +     * that point to the same memory.
 +     */
 +    snew(xa, n);
 +    snew(xb, n);
 +    snew(xc, n);
 +    snew(fa, n);
 +    snew(fb, n);
 +    snew(fc, n);
 +    snew(frozen, n);
 +
 +    snew(p, n);
 +    snew(lastx, n);
 +    snew(lastf, n);
 +    snew(rho, nmaxcorr);
 +    snew(alpha, nmaxcorr);
 +
 +    snew(dx, nmaxcorr);
 +    for (i = 0; i < nmaxcorr; i++)
 +    {
 +        snew(dx[i], n);
 +    }
 +
 +    snew(dg, nmaxcorr);
 +    for (i = 0; i < nmaxcorr; i++)
 +    {
 +        snew(dg[i], n);
 +    }
 +
 +    step  = 0;
 +    neval = 0;
 +
 +    /* Init em */
 +    init_em(fplog, LBFGS, cr, inputrec,
 +            state, top_global, &ems, &top, &f, &f_global,
 +            nrnb, mu_tot, fr, &enerd, &graph, mdatoms, &gstat, vsite, constr,
 +            nfile, fnm, &outf, &mdebin);
 +    /* Do_lbfgs is not completely updated like do_steep and do_cg,
 +     * so we free some memory again.
 +     */
 +    sfree(ems.s.x);
 +    sfree(ems.f);
 +
 +    xx = (real *)state->x;
 +    ff = (real *)f;
 +
 +    start = mdatoms->start;
 +    end   = mdatoms->homenr + start;
 +
 +    /* Print to log file */
 +    print_em_start(fplog, cr, runtime, wcycle, LBFGS);
 +
 +    do_log = do_ene = do_x = do_f = TRUE;
 +
 +    /* Max number of steps */
 +    number_steps = inputrec->nsteps;
 +
 +    /* Create a 3*natoms index to tell whether each degree of freedom is frozen */
 +    gf = 0;
 +    for (i = start; i < end; i++)
 +    {
 +        if (mdatoms->cFREEZE)
 +        {
 +            gf = mdatoms->cFREEZE[i];
 +        }
 +        for (m = 0; m < DIM; m++)
 +        {
 +            frozen[3*i+m] = inputrec->opts.nFreeze[gf][m];
 +        }
 +    }
 +    if (MASTER(cr))
 +    {
 +        sp_header(stderr, LBFGS, inputrec->em_tol, number_steps);
 +    }
 +    if (fplog)
 +    {
 +        sp_header(fplog, LBFGS, inputrec->em_tol, number_steps);
 +    }
 +
 +    if (vsite)
 +    {
 +        construct_vsites(fplog, vsite, state->x, nrnb, 1, NULL,
 +                         top->idef.iparams, top->idef.il,
 +                         fr->ePBC, fr->bMolPBC, graph, cr, state->box);
 +    }
 +
 +    /* Call the force routine and some auxiliary (neighboursearching etc.) */
 +    /* do_force always puts the charge groups in the box and shifts again
 +     * We do not unshift, so molecules are always whole
 +     */
 +    neval++;
 +    ems.s.x = state->x;
 +    ems.f   = f;
 +    evaluate_energy(fplog, bVerbose, cr,
 +                    state, top_global, &ems, top,
 +                    inputrec, nrnb, wcycle, gstat,
 +                    vsite, constr, fcd, graph, mdatoms, fr,
 +                    mu_tot, enerd, vir, pres, -1, TRUE);
 +    where();
 +
 +    if (MASTER(cr))
 +    {
 +        /* Copy stuff to the energy bin for easy printing etc. */
 +        upd_mdebin(mdebin, FALSE, FALSE, (double)step,
 +                   mdatoms->tmass, enerd, state, inputrec->fepvals, inputrec->expandedvals, state->box,
 +                   NULL, NULL, vir, pres, NULL, mu_tot, constr);
 +
 +        print_ebin_header(fplog, step, step, state->lambda[efptFEP]);
 +        print_ebin(outf->fp_ene, TRUE, FALSE, FALSE, fplog, step, step, eprNORMAL,
 +                   TRUE, mdebin, fcd, &(top_global->groups), &(inputrec->opts));
 +    }
 +    where();
 +
 +    /* This is the starting energy */
 +    Epot = enerd->term[F_EPOT];
 +
 +    fnorm = ems.fnorm;
 +    fmax  = ems.fmax;
 +    nfmax = ems.a_fmax;
 +
 +    /* Set the initial step.
 +     * since it will be multiplied by the non-normalized search direction
 +     * vector (force vector the first time), we scale it by the
 +     * norm of the force.
 +     */
 +
 +    if (MASTER(cr))
 +    {
 +        fprintf(stderr, "Using %d BFGS correction steps.\n\n", nmaxcorr);
 +        fprintf(stderr, "   F-max             = %12.5e on atom %d\n", fmax, nfmax+1);
 +        fprintf(stderr, "   F-Norm            = %12.5e\n", fnorm/sqrt(state->natoms));
 +        fprintf(stderr, "\n");
 +        /* and copy to the log file too... */
 +        fprintf(fplog, "Using %d BFGS correction steps.\n\n", nmaxcorr);
 +        fprintf(fplog, "   F-max             = %12.5e on atom %d\n", fmax, nfmax+1);
 +        fprintf(fplog, "   F-Norm            = %12.5e\n", fnorm/sqrt(state->natoms));
 +        fprintf(fplog, "\n");
 +    }
 +
 +    point = 0;
 +    for (i = 0; i < n; i++)
 +    {
 +        if (!frozen[i])
 +        {
 +            dx[point][i] = ff[i]; /* Initial search direction */
 +        }
 +        else
 +        {
 +            dx[point][i] = 0;
 +        }
 +    }
 +
 +    stepsize  = 1.0/fnorm;
 +    converged = FALSE;
 +
 +    /* Start the loop over BFGS steps.
 +     * Each successful step is counted, and we continue until
 +     * we either converge or reach the max number of steps.
 +     */
 +
 +    ncorr = 0;
 +
 +    /* Set the gradient from the force */
 +    converged = FALSE;
 +    for (step = 0; (number_steps < 0 || (number_steps >= 0 && step <= number_steps)) && !converged; step++)
 +    {
 +
 +        /* Write coordinates if necessary */
 +        do_x = do_per_step(step, inputrec->nstxout);
 +        do_f = do_per_step(step, inputrec->nstfout);
 +
 +        mdof_flags = 0;
 +        if (do_x)
 +        {
 +            mdof_flags |= MDOF_X;
 +        }
 +
 +        if (do_f)
 +        {
 +            mdof_flags |= MDOF_F;
 +        }
 +
 +        write_traj(fplog, cr, outf, mdof_flags,
 +                   top_global, step, (real)step, state, state, f, f, NULL, NULL);
 +
 +        /* Do the linesearching in the direction dx[point][0..(n-1)] */
 +
 +        /* pointer to current direction - point=0 first time here */
 +        s = dx[point];
 +
 +        /* calculate line gradient */
 +        for (gpa = 0, i = 0; i < n; i++)
 +        {
 +            gpa -= s[i]*ff[i];
 +        }
 +
 +        /* Calculate minimum allowed stepsize, before the average (norm)
 +         * relative change in coordinate is smaller than precision
 +         */
 +        for (minstep = 0, i = 0; i < n; i++)
 +        {
 +            tmp = fabs(xx[i]);
 +            if (tmp < 1.0)
 +            {
 +                tmp = 1.0;
 +            }
 +            tmp      = s[i]/tmp;
 +            minstep += tmp*tmp;
 +        }
 +        minstep = GMX_REAL_EPS/sqrt(minstep/n);
 +
 +        if (stepsize < minstep)
 +        {
 +            converged = TRUE;
 +            break;
 +        }
 +
 +        /* Store old forces and coordinates */
 +        for (i = 0; i < n; i++)
 +        {
 +            lastx[i] = xx[i];
 +            lastf[i] = ff[i];
 +        }
 +        Epot0 = Epot;
 +
 +        first = TRUE;
 +
 +        for (i = 0; i < n; i++)
 +        {
 +            xa[i] = xx[i];
 +        }
 +
 +        /* Take a step downhill.
 +         * In theory, we should minimize the function along this direction.
 +         * That is quite possible, but it turns out to take 5-10 function evaluations
 +         * for each line. However, we dont really need to find the exact minimum -
 +         * it is much better to start a new BFGS step in a modified direction as soon
 +         * as we are close to it. This will save a lot of energy evaluations.
 +         *
 +         * In practice, we just try to take a single step.
 +         * If it worked (i.e. lowered the energy), we increase the stepsize but
 +         * the continue straight to the next BFGS step without trying to find any minimum.
 +         * If it didn't work (higher energy), there must be a minimum somewhere between
 +         * the old position and the new one.
 +         *
 +         * Due to the finite numerical accuracy, it turns out that it is a good idea
 +         * to even accept a SMALL increase in energy, if the derivative is still downhill.
 +         * This leads to lower final energies in the tests I've done. / Erik
 +         */
 +        foundlower = FALSE;
 +        EpotA      = Epot0;
 +        a          = 0.0;
 +        c          = a + stepsize; /* reference position along line is zero */
 +
 +        /* Check stepsize first. We do not allow displacements
 +         * larger than emstep.
 +         */
 +        do
 +        {
 +            c        = a + stepsize;
 +            maxdelta = 0;
 +            for (i = 0; i < n; i++)
 +            {
 +                delta = c*s[i];
 +                if (delta > maxdelta)
 +                {
 +                    maxdelta = delta;
 +                }
 +            }
 +            if (maxdelta > inputrec->em_stepsize)
 +            {
 +                stepsize *= 0.1;
 +            }
 +        }
 +        while (maxdelta > inputrec->em_stepsize);
 +
 +        /* Take a trial step */
 +        for (i = 0; i < n; i++)
 +        {
 +            xc[i] = lastx[i] + c*s[i];
 +        }
 +
 +        neval++;
 +        /* Calculate energy for the trial step */
 +        ems.s.x = (rvec *)xc;
 +        ems.f   = (rvec *)fc;
 +        evaluate_energy(fplog, bVerbose, cr,
 +                        state, top_global, &ems, top,
 +                        inputrec, nrnb, wcycle, gstat,
 +                        vsite, constr, fcd, graph, mdatoms, fr,
 +                        mu_tot, enerd, vir, pres, step, FALSE);
 +        EpotC = ems.epot;
 +
 +        /* Calc derivative along line */
 +        for (gpc = 0, i = 0; i < n; i++)
 +        {
 +            gpc -= s[i]*fc[i]; /* f is negative gradient, thus the sign */
 +        }
 +        /* Sum the gradient along the line across CPUs */
 +        if (PAR(cr))
 +        {
 +            gmx_sumd(1, &gpc, cr);
 +        }
 +
 +        /* This is the max amount of increase in energy we tolerate */
 +        tmp = sqrt(GMX_REAL_EPS)*fabs(EpotA);
 +
 +        /* Accept the step if the energy is lower, or if it is not significantly higher
 +         * and the line derivative is still negative.
 +         */
 +        if (EpotC < EpotA || (gpc < 0 && EpotC < (EpotA+tmp)))
 +        {
 +            foundlower = TRUE;
 +            /* Great, we found a better energy. Increase step for next iteration
 +             * if we are still going down, decrease it otherwise
 +             */
 +            if (gpc < 0)
 +            {
 +                stepsize *= 1.618034; /* The golden section */
 +            }
 +            else
 +            {
 +                stepsize *= 0.618034; /* 1/golden section */
 +            }
 +        }
 +        else
 +        {
 +            /* New energy is the same or higher. We will have to do some work
 +             * to find a smaller value in the interval. Take smaller step next time!
 +             */
 +            foundlower = FALSE;
 +            stepsize  *= 0.618034;
 +        }
 +
 +        /* OK, if we didn't find a lower value we will have to locate one now - there must
 +         * be one in the interval [a=0,c].
 +         * The same thing is valid here, though: Don't spend dozens of iterations to find
 +         * the line minimum. We try to interpolate based on the derivative at the endpoints,
 +         * and only continue until we find a lower value. In most cases this means 1-2 iterations.
 +         *
 +         * I also have a safeguard for potentially really patological functions so we never
 +         * take more than 20 steps before we give up ...
 +         *
 +         * If we already found a lower value we just skip this step and continue to the update.
 +         */
 +
 +        if (!foundlower)
 +        {
 +
 +            nminstep = 0;
 +            do
 +            {
 +                /* Select a new trial point.
 +                 * If the derivatives at points a & c have different sign we interpolate to zero,
 +                 * otherwise just do a bisection.
 +                 */
 +
 +                if (gpa < 0 && gpc > 0)
 +                {
 +                    b = a + gpa*(a-c)/(gpc-gpa);
 +                }
 +                else
 +                {
 +                    b = 0.5*(a+c);
 +                }
 +
 +                /* safeguard if interpolation close to machine accuracy causes errors:
 +                 * never go outside the interval
 +                 */
 +                if (b <= a || b >= c)
 +                {
 +                    b = 0.5*(a+c);
 +                }
 +
 +                /* Take a trial step */
 +                for (i = 0; i < n; i++)
 +                {
 +                    xb[i] = lastx[i] + b*s[i];
 +                }
 +
 +                neval++;
 +                /* Calculate energy for the trial step */
 +                ems.s.x = (rvec *)xb;
 +                ems.f   = (rvec *)fb;
 +                evaluate_energy(fplog, bVerbose, cr,
 +                                state, top_global, &ems, top,
 +                                inputrec, nrnb, wcycle, gstat,
 +                                vsite, constr, fcd, graph, mdatoms, fr,
 +                                mu_tot, enerd, vir, pres, step, FALSE);
 +                EpotB = ems.epot;
 +
 +                fnorm = ems.fnorm;
 +
 +                for (gpb = 0, i = 0; i < n; i++)
 +                {
 +                    gpb -= s[i]*fb[i]; /* f is negative gradient, thus the sign */
 +
 +                }
 +                /* Sum the gradient along the line across CPUs */
 +                if (PAR(cr))
 +                {
 +                    gmx_sumd(1, &gpb, cr);
 +                }
 +
 +                /* Keep one of the intervals based on the value of the derivative at the new point */
 +                if (gpb > 0)
 +                {
 +                    /* Replace c endpoint with b */
 +                    EpotC = EpotB;
 +                    c     = b;
 +                    gpc   = gpb;
 +                    /* swap coord pointers b/c */
 +                    xtmp = xb;
 +                    ftmp = fb;
 +                    xb   = xc;
 +                    fb   = fc;
 +                    xc   = xtmp;
 +                    fc   = ftmp;
 +                }
 +                else
 +                {
 +                    /* Replace a endpoint with b */
 +                    EpotA = EpotB;
 +                    a     = b;
 +                    gpa   = gpb;
 +                    /* swap coord pointers a/b */
 +                    xtmp = xb;
 +                    ftmp = fb;
 +                    xb   = xa;
 +                    fb   = fa;
 +                    xa   = xtmp;
 +                    fa   = ftmp;
 +                }
 +
 +                /*
 +                 * Stop search as soon as we find a value smaller than the endpoints,
 +                 * or if the tolerance is below machine precision.
 +                 * Never run more than 20 steps, no matter what.
 +                 */
 +                nminstep++;
 +            }
 +            while ((EpotB > EpotA || EpotB > EpotC) && (nminstep < 20));
 +
 +            if (fabs(EpotB-Epot0) < GMX_REAL_EPS || nminstep >= 20)
 +            {
 +                /* OK. We couldn't find a significantly lower energy.
 +                 * If ncorr==0 this was steepest descent, and then we give up.
 +                 * If not, reset memory to restart as steepest descent before quitting.
 +                 */
 +                if (ncorr == 0)
 +                {
 +                    /* Converged */
 +                    converged = TRUE;
 +                    break;
 +                }
 +                else
 +                {
 +                    /* Reset memory */
 +                    ncorr = 0;
 +                    /* Search in gradient direction */
 +                    for (i = 0; i < n; i++)
 +                    {
 +                        dx[point][i] = ff[i];
 +                    }
 +                    /* Reset stepsize */
 +                    stepsize = 1.0/fnorm;
 +                    continue;
 +                }
 +            }
 +
 +            /* Select min energy state of A & C, put the best in xx/ff/Epot
 +             */
 +            if (EpotC < EpotA)
 +            {
 +                Epot = EpotC;
 +                /* Use state C */
 +                for (i = 0; i < n; i++)
 +                {
 +                    xx[i] = xc[i];
 +                    ff[i] = fc[i];
 +                }
 +                stepsize = c;
 +            }
 +            else
 +            {
 +                Epot = EpotA;
 +                /* Use state A */
 +                for (i = 0; i < n; i++)
 +                {
 +                    xx[i] = xa[i];
 +                    ff[i] = fa[i];
 +                }
 +                stepsize = a;
 +            }
 +
 +        }
 +        else
 +        {
 +            /* found lower */
 +            Epot = EpotC;
 +            /* Use state C */
 +            for (i = 0; i < n; i++)
 +            {
 +                xx[i] = xc[i];
 +                ff[i] = fc[i];
 +            }
 +            stepsize = c;
 +        }
 +
 +        /* Update the memory information, and calculate a new
 +         * approximation of the inverse hessian
 +         */
 +
 +        /* Have new data in Epot, xx, ff */
 +        if (ncorr < nmaxcorr)
 +        {
 +            ncorr++;
 +        }
 +
 +        for (i = 0; i < n; i++)
 +        {
 +            dg[point][i]  = lastf[i]-ff[i];
 +            dx[point][i] *= stepsize;
 +        }
 +
 +        dgdg = 0;
 +        dgdx = 0;
 +        for (i = 0; i < n; i++)
 +        {
 +            dgdg += dg[point][i]*dg[point][i];
 +            dgdx += dg[point][i]*dx[point][i];
 +        }
 +
 +        diag = dgdx/dgdg;
 +
 +        rho[point] = 1.0/dgdx;
 +        point++;
 +
 +        if (point >= nmaxcorr)
 +        {
 +            point = 0;
 +        }
 +
 +        /* Update */
 +        for (i = 0; i < n; i++)
 +        {
 +            p[i] = ff[i];
 +        }
 +
 +        cp = point;
 +
 +        /* Recursive update. First go back over the memory points */
 +        for (k = 0; k < ncorr; k++)
 +        {
 +            cp--;
 +            if (cp < 0)
 +            {
 +                cp = ncorr-1;
 +            }
 +
 +            sq = 0;
 +            for (i = 0; i < n; i++)
 +            {
 +                sq += dx[cp][i]*p[i];
 +            }
 +
 +            alpha[cp] = rho[cp]*sq;
 +
 +            for (i = 0; i < n; i++)
 +            {
 +                p[i] -= alpha[cp]*dg[cp][i];
 +            }
 +        }
 +
 +        for (i = 0; i < n; i++)
 +        {
 +            p[i] *= diag;
 +        }
 +
 +        /* And then go forward again */
 +        for (k = 0; k < ncorr; k++)
 +        {
 +            yr = 0;
 +            for (i = 0; i < n; i++)
 +            {
 +                yr += p[i]*dg[cp][i];
 +            }
 +
 +            beta = rho[cp]*yr;
 +            beta = alpha[cp]-beta;
 +
 +            for (i = 0; i < n; i++)
 +            {
 +                p[i] += beta*dx[cp][i];
 +            }
 +
 +            cp++;
 +            if (cp >= ncorr)
 +            {
 +                cp = 0;
 +            }
 +        }
 +
 +        for (i = 0; i < n; i++)
 +        {
 +            if (!frozen[i])
 +            {
 +                dx[point][i] = p[i];
 +            }
 +            else
 +            {
 +                dx[point][i] = 0;
 +            }
 +        }
 +
 +        stepsize = 1.0;
 +
 +        /* Test whether the convergence criterion is met */
 +        get_f_norm_max(cr, &(inputrec->opts), mdatoms, f, &fnorm, &fmax, &nfmax);
 +
 +        /* Print it if necessary */
 +        if (MASTER(cr))
 +        {
 +            if (bVerbose)
 +            {
 +                fprintf(stderr, "\rStep %d, Epot=%12.6e, Fnorm=%9.3e, Fmax=%9.3e (atom %d)\n",
 +                        step, Epot, fnorm/sqrt(state->natoms), fmax, nfmax+1);
 +            }
 +            /* Store the new (lower) energies */
 +            upd_mdebin(mdebin, FALSE, FALSE, (double)step,
 +                       mdatoms->tmass, enerd, state, inputrec->fepvals, inputrec->expandedvals, state->box,
 +                       NULL, NULL, vir, pres, NULL, mu_tot, constr);
 +            do_log = do_per_step(step, inputrec->nstlog);
 +            do_ene = do_per_step(step, inputrec->nstenergy);
 +            if (do_log)
 +            {
 +                print_ebin_header(fplog, step, step, state->lambda[efptFEP]);
 +            }
 +            print_ebin(outf->fp_ene, do_ene, FALSE, FALSE,
 +                       do_log ? fplog : NULL, step, step, eprNORMAL,
 +                       TRUE, mdebin, fcd, &(top_global->groups), &(inputrec->opts));
 +        }
 +
 +        /* Stop when the maximum force lies below tolerance.
 +         * If we have reached machine precision, converged is already set to true.
 +         */
 +
 +        converged = converged || (fmax < inputrec->em_tol);
 +
 +    } /* End of the loop */
 +
 +    if (converged)
 +    {
 +        step--; /* we never took that last step in this case */
 +
 +    }
 +    if (fmax > inputrec->em_tol)
 +    {
 +        if (MASTER(cr))
 +        {
 +            warn_step(stderr, inputrec->em_tol, step-1 == number_steps, FALSE);
 +            warn_step(fplog, inputrec->em_tol, step-1 == number_steps, FALSE);
 +        }
 +        converged = FALSE;
 +    }
 +
 +    /* If we printed energy and/or logfile last step (which was the last step)
 +     * we don't have to do it again, but otherwise print the final values.
 +     */
 +    if (!do_log) /* Write final value to log since we didn't do anythin last step */
 +    {
 +        print_ebin_header(fplog, step, step, state->lambda[efptFEP]);
 +    }
 +    if (!do_ene || !do_log) /* Write final energy file entries */
 +    {
 +        print_ebin(outf->fp_ene, !do_ene, FALSE, FALSE,
 +                   !do_log ? fplog : NULL, step, step, eprNORMAL,
 +                   TRUE, mdebin, fcd, &(top_global->groups), &(inputrec->opts));
 +    }
 +
 +    /* Print some stuff... */
 +    if (MASTER(cr))
 +    {
 +        fprintf(stderr, "\nwriting lowest energy coordinates.\n");
 +    }
 +
 +    /* IMPORTANT!
 +     * For accurate normal mode calculation it is imperative that we
 +     * store the last conformation into the full precision binary trajectory.
 +     *
 +     * However, we should only do it if we did NOT already write this step
 +     * above (which we did if do_x or do_f was true).
 +     */
 +    do_x = !do_per_step(step, inputrec->nstxout);
 +    do_f = !do_per_step(step, inputrec->nstfout);
 +    write_em_traj(fplog, cr, outf, do_x, do_f, ftp2fn(efSTO, nfile, fnm),
 +                  top_global, inputrec, step,
 +                  &ems, state, f);
 +
 +    if (MASTER(cr))
 +    {
 +        print_converged(stderr, LBFGS, inputrec->em_tol, step, converged,
 +                        number_steps, Epot, fmax, nfmax, fnorm/sqrt(state->natoms));
 +        print_converged(fplog, LBFGS, inputrec->em_tol, step, converged,
 +                        number_steps, Epot, fmax, nfmax, fnorm/sqrt(state->natoms));
 +
 +        fprintf(fplog, "\nPerformed %d energy evaluations in total.\n", neval);
 +    }
 +
 +    finish_em(fplog, cr, outf, runtime, wcycle);
 +
 +    /* To print the actual number of steps we needed somewhere */
 +    runtime->nsteps_done = step;
 +
 +    return 0;
 +} /* That's all folks */
 +
 +/* Steepest-descent energy minimizer.
 + *
 + * Keeps two states: s_min, the lowest-energy state accepted so far, and
 + * s_try, the trial state. Each pass after the first calls do_em_step with
 + * s_min, its forces and stepsize = ustep/s_min->fmax, then evaluates the
 + * energy of s_try. The trial is accepted when count == 0 or when its epot
 + * is below s_min->epot; after an accepted step with count > 0 ustep is
 + * multiplied by 1.2, after a rejected one it is halved.
 + *
 + * The loop stops when an accepted state has fmax < inputrec->em_tol
 + * (bDone), or when count reaches nsteps or ustep drops below 1e-12
 + * (GMX_DOUBLE) or 1e-6 (bAbort).
 + *
 + * oenv, bCompact, nstglobalcomm, stepout, ed, repl_ex_nst, repl_ex_nex,
 + * repl_ex_seed, membed, cpt_period, max_hours, deviceOptions and Flags are
 + * not referenced in the body; do_nm and do_md take the same parameter
 + * list, so they are presumably kept for a common integrator signature.
 + *
 + * Overwrites inputrec->nsteps and sets runtime->nsteps_done to the final
 + * value of count. Always returns 0.
 + */
 +double do_steep(FILE *fplog, t_commrec *cr,
 +                int nfile, const t_filenm fnm[],
 +                const output_env_t oenv, gmx_bool bVerbose, gmx_bool bCompact,
 +                int nstglobalcomm,
 +                gmx_vsite_t *vsite, gmx_constr_t constr,
 +                int stepout,
 +                t_inputrec *inputrec,
 +                gmx_mtop_t *top_global, t_fcdata *fcd,
 +                t_state *state_global,
 +                t_mdatoms *mdatoms,
 +                t_nrnb *nrnb, gmx_wallcycle_t wcycle,
 +                gmx_edsam_t ed,
 +                t_forcerec *fr,
 +                int repl_ex_nst, int repl_ex_nex, int repl_ex_seed,
 +                gmx_membed_t membed,
 +                real cpt_period, real max_hours,
 +                const char *deviceOptions,
 +                unsigned long Flags,
 +                gmx_runtime_t *runtime)
 +{
 +    const char       *SD = "Steepest Descents";
 +    em_state_t       *s_min, *s_try;
 +    rvec             *f_global;
 +    gmx_localtop_t   *top;
 +    gmx_enerdata_t   *enerd;
 +    rvec             *f;
 +    gmx_global_stat_t gstat;
 +    t_graph          *graph;
 +    real              stepsize, constepsize; /* NOTE(review): constepsize is never used in this function */
++    real              ustep, fnormn;
 +    gmx_mdoutf_t     *outf;
 +    t_mdebin         *mdebin;
 +    gmx_bool          bDone, bAbort, do_x, do_f;
 +    tensor            vir, pres;
 +    rvec              mu_tot;
 +    int               nsteps;
 +    int               count          = 0;
 +    int               steps_accepted = 0;
 +    /* not used */
 +    real              terminate = 0;
 +
 +    s_min = init_em_state();
 +    s_try = init_em_state();
 +
 +    /* Init em and store the local state in s_try */
 +    init_em(fplog, SD, cr, inputrec,
 +            state_global, top_global, s_try, &top, &f, &f_global,
 +            nrnb, mu_tot, fr, &enerd, &graph, mdatoms, &gstat, vsite, constr,
 +            nfile, fnm, &outf, &mdebin);
 +
 +    /* Print to log file  */
 +    print_em_start(fplog, cr, runtime, wcycle, SD);
 +
 +    /* Set variables for stepsize (in nm). This is the largest
 +     * step that we are going to make in any direction.
 +     * stepsize itself is the factor handed to do_em_step; it is recomputed
 +     * from ustep at the end of every pass and is not used on the first
 +     * pass, where no step is taken.
 +     */
 +    ustep    = inputrec->em_stepsize;
 +    stepsize = 0;
 +
 +    /* Max number of steps  */
 +    nsteps = inputrec->nsteps;
 +
 +    if (MASTER(cr))
 +    {
 +        /* Print to the screen  */
 +        sp_header(stderr, SD, inputrec->em_tol, nsteps);
 +    }
 +    if (fplog)
 +    {
 +        sp_header(fplog, SD, inputrec->em_tol, nsteps);
 +    }
 +
 +    /**** HERE STARTS THE LOOP ****
 +     * count is the counter for the number of steps
 +     * bDone will be TRUE when the minimization has converged
 +     * bAbort will be TRUE when nsteps steps have been performed or when
 +     * the stepsize becomes smaller than is reasonable for machine precision
 +     */
 +    count  = 0;
 +    bDone  = FALSE;
 +    bAbort = FALSE;
 +    while (!bDone && !bAbort)
 +    {
 +        bAbort = (nsteps >= 0) && (count == nsteps); /* if set, this pass still runs but is the last one */
 +
 +        /* set new coordinates, except for first step */
 +        if (count > 0)
 +        {
 +            do_em_step(cr, inputrec, mdatoms, fr->bMolPBC,
 +                       s_min, stepsize, s_min->f, s_try,
 +                       constr, top, nrnb, wcycle, count);
 +        }
 +
 +        evaluate_energy(fplog, bVerbose, cr,
 +                        state_global, top_global, s_try, top,
 +                        inputrec, nrnb, wcycle, gstat,
 +                        vsite, constr, fcd, graph, mdatoms, fr,
 +                        mu_tot, enerd, vir, pres, count, count == 0);
 +
 +        if (MASTER(cr))
 +        {
 +            print_ebin_header(fplog, count, count, s_try->s.lambda[efptFEP]);
 +        }
 +
 +        if (count == 0)
 +        {
 +            s_min->epot = s_try->epot + 1; /* so the first trial compares as lower than s_min below */
 +        }
 +
 +        /* Print it if necessary  */
 +        if (MASTER(cr))
 +        {
 +            if (bVerbose)
 +            {
 +                fprintf(stderr, "Step=%5d, Dmax= %6.1e nm, Epot= %12.5e Fmax= %11.5e, atom= %d%c",
 +                        count, ustep, s_try->epot, s_try->fmax, s_try->a_fmax+1,
 +                        (s_try->epot < s_min->epot) ? '\n' : '\r');
 +            }
 +
 +            if (s_try->epot < s_min->epot)
 +            {
 +                /* Store the new (lower) energies  */
 +                upd_mdebin(mdebin, FALSE, FALSE, (double)count,
 +                           mdatoms->tmass, enerd, &s_try->s, inputrec->fepvals, inputrec->expandedvals,
 +                           s_try->s.box, NULL, NULL, vir, pres, NULL, mu_tot, constr);
 +                print_ebin(outf->fp_ene, TRUE,
 +                           do_per_step(steps_accepted, inputrec->nstdisreout),
 +                           do_per_step(steps_accepted, inputrec->nstorireout),
 +                           fplog, count, count, eprNORMAL, TRUE,
 +                           mdebin, fcd, &(top_global->groups), &(inputrec->opts));
 +                fflush(fplog);
 +            }
 +        }
 +
 +        /* Accept the trial state if this is the first evaluation or if its
 +         * energy is lower than that of the current minimum. (An older
 +         * comment here also mentioned random steps; the condition below
 +         * has no such case.)
 +         */
 +
 +        if ( (count == 0) || (s_try->epot < s_min->epot) )
 +        {
 +            steps_accepted++;
 +
 +            /* Test whether the convergence criterion is met...  */
 +            bDone = (s_try->fmax < inputrec->em_tol);
 +
 +            /* Copy the arrays for force, positions and energy  */
 +            /* The 'Min' array always holds the coords and forces of the minimal
 +               sampled energy  */
 +            swap_em_state(s_min, s_try);
 +            if (count > 0)
 +            {
 +                ustep *= 1.2;
 +            }
 +
 +            /* Write to trn, if necessary */
 +            do_x = do_per_step(steps_accepted, inputrec->nstxout);
 +            do_f = do_per_step(steps_accepted, inputrec->nstfout);
 +            write_em_traj(fplog, cr, outf, do_x, do_f, NULL,
 +                          top_global, inputrec, count,
 +                          s_min, state_global, f_global);
 +        }
 +        else
 +        {
 +            /* If energy is not smaller make the step smaller...  */
 +            ustep *= 0.5;
 +
 +            if (DOMAINDECOMP(cr) && s_min->s.ddp_count != cr->dd->ddp_count)
 +            {
 +                /* Reload the old state */
 +                em_dd_partition_system(fplog, count, cr, top_global, inputrec,
 +                                       s_min, top, mdatoms, fr, vsite, constr,
 +                                       nrnb, wcycle);
 +            }
 +        }
 +
 +        /* Determine new step.
 +         * NOTE(review): this divides by s_min->fmax with no zero check. A
 +         * zero fmax on an accepted state sets bDone when em_tol > 0, so the
 +         * result would not be used again - confirm for em_tol <= 0.
 +         */
 +        stepsize = ustep/s_min->fmax;
 +
 +        /* Check if stepsize is too small, with 1 nm as a characteristic length */
 +#ifdef GMX_DOUBLE
 +        if (count == nsteps || ustep < 1e-12)
 +#else
 +        if (count == nsteps || ustep < 1e-6)
 +#endif
 +        {
 +            if (MASTER(cr))
 +            {
 +                warn_step(stderr, inputrec->em_tol, count == nsteps, constr != NULL);
 +                warn_step(fplog, inputrec->em_tol, count == nsteps, constr != NULL);
 +            }
 +            bAbort = TRUE;
 +        }
 +
 +        count++;
 +    } /* End of the loop  */
 +
 +    /* Tell the user that the final (lowest-energy) state is being written */
 +    if (MASTER(cr))
 +    {
 +        fprintf(stderr, "\nwriting lowest energy coordinates.\n");
 +    }
 +    /* NOTE(review): inputrec->nstfout is passed in the argument slot that
 +     * received do_f in the loop above, so any non-zero nstfout presumably
 +     * acts as "write forces" - confirm against write_em_traj.
 +     */
 +    write_em_traj(fplog, cr, outf, TRUE, inputrec->nstfout, ftp2fn(efSTO, nfile, fnm),
 +                  top_global, inputrec, count,
 +                  s_min, state_global, f_global);
 +
 +    fnormn = s_min->fnorm/sqrt(state_global->natoms);
 +
 +    if (MASTER(cr))
 +    {
 +        print_converged(stderr, SD, inputrec->em_tol, count, bDone, nsteps,
 +                        s_min->epot, s_min->fmax, s_min->a_fmax, fnormn);
 +        print_converged(fplog, SD, inputrec->em_tol, count, bDone, nsteps,
 +                        s_min->epot, s_min->fmax, s_min->a_fmax, fnormn);
 +    }
 +
 +    finish_em(fplog, cr, outf, runtime, wcycle);
 +
 +    /* To print the actual number of steps we needed somewhere.
 +     * Note that this overwrites the nsteps field of the caller's inputrec.
 +     */
 +    inputrec->nsteps = count;
 +
 +    runtime->nsteps_done = count;
 +
 +    return 0;
 +} /* That's all folks */
 +
 +/* Normal-mode analysis: builds the Hessian by central finite differences.
 + *
 + * For every atom and every dimension d the coordinate is set to
 + * x_min - der_range and x_min + der_range, the forces are evaluated for
 + * both, and -(f(+) - f(-))/(2*der_range) over all atoms gives the matrix
 + * row (atom*DIM + d). Atoms are dealt out over the nodes as
 + * atom = nodeid, nodeid + nnodes, ...; every non-master node sends its row
 + * to the master, which stores rows either in a full sz*sz array or in a
 + * compressed symmetric sparse matrix and finally writes the result with
 + * gmx_mtxio_write.
 + *
 + * Calls gmx_fatal when constr is non-NULL. Overwrites inputrec->nsteps
 + * with 2*natoms and sets runtime->nsteps_done to the same value. Always
 + * returns 0.
 + *
 + * NOTE(review): fneg, dfdx and full_matrix are allocated with snew and no
 + * matching free is visible in this function.
 + */
 +double do_nm(FILE *fplog, t_commrec *cr,
 +             int nfile, const t_filenm fnm[],
 +             const output_env_t oenv, gmx_bool bVerbose, gmx_bool bCompact,
 +             int nstglobalcomm,
 +             gmx_vsite_t *vsite, gmx_constr_t constr,
 +             int stepout,
 +             t_inputrec *inputrec,
 +             gmx_mtop_t *top_global, t_fcdata *fcd,
 +             t_state *state_global,
 +             t_mdatoms *mdatoms,
 +             t_nrnb *nrnb, gmx_wallcycle_t wcycle,
 +             gmx_edsam_t ed,
 +             t_forcerec *fr,
 +             int repl_ex_nst, int repl_ex_nex, int repl_ex_seed,
 +             gmx_membed_t membed,
 +             real cpt_period, real max_hours,
 +             const char *deviceOptions,
 +             unsigned long Flags,
 +             gmx_runtime_t *runtime)
 +{
 +    const char          *NM = "Normal Mode Analysis";
 +    gmx_mdoutf_t        *outf;
 +    int                  natoms, atom, d;
 +    int                  nnodes, node;
 +    rvec                *f_global;
 +    gmx_localtop_t      *top;
 +    gmx_enerdata_t      *enerd;
 +    rvec                *f;
 +    gmx_global_stat_t    gstat;
 +    t_graph             *graph;
 +    real                 t, t0, lambda, lam0; /* assigned under "Initial values" below but never read */
 +    gmx_bool             bNS;                 /* not used in this function */
 +    tensor               vir, pres;
 +    rvec                 mu_tot;
 +    rvec                *fneg, *dfdx;         /* forces at x - der_range; one row of the Hessian */
 +    gmx_bool             bSparse; /* use sparse matrix storage format */
 +    size_t               sz;
 +    gmx_sparsematrix_t * sparse_matrix           = NULL;
 +    real           *     full_matrix             = NULL;
 +    em_state_t       *   state_work;
 +
 +    /* added with respect to mdrun */
 +    int        i, j, k, row, col;
 +    real       der_range = 10.0*sqrt(GMX_REAL_EPS); /* finite-difference displacement applied to each coordinate */
 +    real       x_min;
 +    real       fnorm, fmax; /* not used; the fmax printed below is state_work->fmax */
 +
 +    if (constr != NULL)
 +    {
 +        gmx_fatal(FARGS, "Constraints present with Normal Mode Analysis, this combination is not supported");
 +    }
 +
 +    state_work = init_em_state();
 +
 +    /* Init em and store the local state in state_work.
 +     * NULL is passed where do_steep passes &mdebin.
 +     */
 +    init_em(fplog, NM, cr, inputrec,
 +            state_global, top_global, state_work, &top,
 +            &f, &f_global,
 +            nrnb, mu_tot, fr, &enerd, &graph, mdatoms, &gstat, vsite, constr,
 +            nfile, fnm, &outf, NULL);
 +
 +    natoms = top_global->natoms;
 +    snew(fneg, natoms);
 +    snew(dfdx, natoms);
 +
 +#ifndef GMX_DOUBLE
 +    if (MASTER(cr))
 +    {
 +        fprintf(stderr,
 +                "NOTE: This version of Gromacs has been compiled in single precision,\n"
 +                "      which MIGHT not be accurate enough for normal mode analysis.\n"
 +                "      Gromacs now uses sparse matrix storage, so the memory requirements\n"
 +                "      are fairly modest even if you recompile in double precision.\n\n");
 +    }
 +#endif
 +
 +    /* Check if we can/should use sparse storage format.
 +     *
 +     * Sparse format is only useful when the Hessian itself is sparse, which it
 +     * will be when we use a cutoff.
 +     * For small systems (n<1000) it is easier to always use full matrix format, though.
 +     *
 +     * NOTE(review): unlike the other messages in this function, the
 +     * fprintf calls in this section are not guarded by MASTER(cr).
 +     */
 +    if (EEL_FULL(fr->eeltype) || fr->rlist == 0.0)
 +    {
 +        fprintf(stderr, "Non-cutoff electrostatics used, forcing full Hessian format.\n");
 +        bSparse = FALSE;
 +    }
 +    else if (top_global->natoms < 1000)
 +    {
 +        fprintf(stderr, "Small system size (N=%d), using full Hessian format.\n", top_global->natoms);
 +        bSparse = FALSE;
 +    }
 +    else
 +    {
 +        fprintf(stderr, "Using compressed symmetric sparse Hessian format.\n");
 +        bSparse = TRUE;
 +    }
 +
 +    sz = DIM*top_global->natoms; /* matrix dimension: one row/column per atom coordinate */
 +
 +    fprintf(stderr, "Allocating Hessian memory...\n\n");
 +
 +    if (bSparse)
 +    {
 +        sparse_matrix = gmx_sparsematrix_init(sz);
 +        sparse_matrix->compressed_symmetric = TRUE;
 +    }
 +    else
 +    {
 +        snew(full_matrix, sz*sz); /* runs on every node, though only the MASTER branches below touch full_matrix */
 +    }
 +
 +    /* Initial values */
 +    t0           = inputrec->init_t;
 +    lam0         = inputrec->fepvals->init_lambda;
 +    t            = t0;
 +    lambda       = lam0;
 +
 +    init_nrnb(nrnb);
 +
 +    where();
 +
 +    /* Write start time and temperature */
 +    print_em_start(fplog, cr, runtime, wcycle, NM);
 +
 +    /* Set the step count to twice the number of atoms; the force
 +     * evaluations in the loop below are labelled atom*2 and atom*2+1.
 +     */
 +    inputrec->nsteps = natoms*2;
 +
 +    if (MASTER(cr))
 +    {
 +        fprintf(stderr, "starting normal mode calculation '%s'\n%d steps.\n\n",
 +                *(top_global->name), (int)inputrec->nsteps);
 +    }
 +
 +    nnodes = cr->nnodes;
 +
 +    /* Make evaluate_energy do a single node force calculation.
 +     * cr->nnodes is set to 1 for the call and restored right after it.
 +     */
 +    cr->nnodes = 1;
 +    evaluate_energy(fplog, bVerbose, cr,
 +                    state_global, top_global, state_work, top,
 +                    inputrec, nrnb, wcycle, gstat,
 +                    vsite, constr, fcd, graph, mdatoms, fr,
 +                    mu_tot, enerd, vir, pres, -1, TRUE);
 +    cr->nnodes = nnodes;
 +
 +    /* if forces are not small, warn user */
 +    get_state_f_norm_max(cr, &(inputrec->opts), mdatoms, state_work);
 +
 +    if (MASTER(cr))
 +    {
 +        fprintf(stderr, "Maximum force:%12.5e\n", state_work->fmax);
 +        if (state_work->fmax > 1.0e-3)
 +        {
 +            fprintf(stderr, "Maximum force probably not small enough to");
 +            fprintf(stderr, " ensure that you are in an \nenergy well. ");
 +            fprintf(stderr, "Be aware that negative eigenvalues may occur");
 +            fprintf(stderr, " when the\nresulting matrix is diagonalized.\n");
 +        }
 +    }
 +
 +    /***********************************************************
 +     *
 +     *      Loop over all pairs in matrix
 +     *
 +     *      do_force called twice. Once with positive and
 +     *      once with negative displacement
 +     *
 +     ************************************************************/
 +
 +    /* Steps are divided one by one over the nodes */
 +    for (atom = cr->nodeid; atom < natoms; atom += nnodes)
 +    {
 +
 +        for (d = 0; d < DIM; d++)
 +        {
 +            x_min = state_work->s.x[atom][d]; /* saved so the coordinate can be restored after both displacements */
 +
 +            state_work->s.x[atom][d] = x_min - der_range;
 +
 +            /* Make evaluate_energy do a single node force calculation */
 +            cr->nnodes = 1;
 +            evaluate_energy(fplog, bVerbose, cr,
 +                            state_global, top_global, state_work, top,
 +                            inputrec, nrnb, wcycle, gstat,
 +                            vsite, constr, fcd, graph, mdatoms, fr,
 +                            mu_tot, enerd, vir, pres, atom*2, FALSE);
 +
 +            for (i = 0; i < natoms; i++)
 +            {
 +                copy_rvec(state_work->f[i], fneg[i]); /* keep the forces of the negative displacement */
 +            }
 +
 +            state_work->s.x[atom][d] = x_min + der_range;
 +
 +            evaluate_energy(fplog, bVerbose, cr,
 +                            state_global, top_global, state_work, top,
 +                            inputrec, nrnb, wcycle, gstat,
 +                            vsite, constr, fcd, graph, mdatoms, fr,
 +                            mu_tot, enerd, vir, pres, atom*2+1, FALSE);
 +            cr->nnodes = nnodes;
 +
 +            /* x is restored to original */
 +            state_work->s.x[atom][d] = x_min;
 +
 +            for (j = 0; j < natoms; j++)
 +            {
 +                for (k = 0; (k < DIM); k++)
 +                {
 +                    dfdx[j][k] =
 +                        -(state_work->f[j][k] - fneg[j][k])/(2*der_range); /* central difference of the forces, negated */
 +                }
 +            }
 +
 +            if (!MASTER(cr))
 +            {
 +#ifdef GMX_MPI
 +#ifdef GMX_DOUBLE
 +#define mpi_type MPI_DOUBLE
 +#else
 +#define mpi_type MPI_FLOAT
 +#endif
 +                MPI_Send(dfdx[0], natoms*DIM, mpi_type, MASTERNODE(cr), cr->nodeid,
 +                         cr->mpi_comm_mygroup); /* message tag is this node's id */
 +#endif
 +            }
 +            else
 +            {
 +                for (node = 0; (node < nnodes && atom+node < natoms); node++) /* node 0 is the master's own row; the second test covers the last, partial batch */
 +                {
 +                    if (node > 0)
 +                    {
 +#ifdef GMX_MPI
 +                        MPI_Status stat;
 +                        MPI_Recv(dfdx[0], natoms*DIM, mpi_type, node, node,
 +                                 cr->mpi_comm_mygroup, &stat); /* overwrites dfdx with the row computed on that node */
 +#undef mpi_type
 +#endif
 +                    }
 +
 +                    row = (atom + node)*DIM + d;
 +
 +                    for (j = 0; j < natoms; j++)
 +                    {
 +                        for (k = 0; k < DIM; k++)
 +                        {
 +                            col = j*DIM + k;
 +
 +                            if (bSparse)
 +                            {
 +                                if (col >= row && dfdx[j][k] != 0.0) /* only non-zero entries on or above the diagonal are stored */
 +                                {
 +                                    gmx_sparsematrix_increment_value(sparse_matrix,
 +                                                                     row, col, dfdx[j][k]);
 +                                }
 +                            }
 +                            else
 +                            {
 +                                full_matrix[row*sz+col] = dfdx[j][k];
 +                            }
 +                        }
 +                    }
 +                }
 +            }
 +
 +            if (bVerbose && fplog)
 +            {
 +                fflush(fplog);
 +            }
 +        }
 +        /* write progress */
 +        if (MASTER(cr) && bVerbose)
 +        {
 +            fprintf(stderr, "\rFinished step %d out of %d",
 +                    min(atom+nnodes, natoms), natoms);
 +            fflush(stderr);
 +        }
 +    }
 +
 +    if (MASTER(cr))
 +    {
 +        fprintf(stderr, "\n\nWriting Hessian...\n");
 +        gmx_mtxio_write(ftp2fn(efMTX, nfile, fnm), sz, sz, full_matrix, sparse_matrix); /* whichever of the two matrices was not allocated is still NULL here */
 +    }
 +
 +    finish_em(fplog, cr, outf, runtime, wcycle);
 +
 +    runtime->nsteps_done = natoms*2;
 +
 +    return 0;
 +}
diff --combined src/programs/mdrun/md.c
index 81d9a2ba615439397c765819602062f70eaa062f,0000000000000000000000000000000000000000..f972179326c42d49df7b8cfdd37d9836b5d9ce23
mode 100644,000000..100644
--- /dev/null
@@@ -1,2225 -1,0 +1,2217 @@@
-     real              mu_aver = 0, dvdl;
 +/* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
 + *
 + *
 + *                This source code is part of
 + *
 + *                 G   R   O   M   A   C   S
 + *
 + *          GROningen MAchine for Chemical Simulations
 + *
 + *                        VERSION 3.2.0
 + * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
 + * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
 + * Copyright (c) 2001-2004, The GROMACS development team,
 + * check out http://www.gromacs.org for more information.
 +
 + * This program is free software; you can redistribute it and/or
 + * modify it under the terms of the GNU General Public License
 + * as published by the Free Software Foundation; either version 2
 + * of the License, or (at your option) any later version.
 + *
 + * If you want to redistribute modifications, please consider that
 + * scientific software is very special. Version control is crucial -
 + * bugs must be traceable. We will be happy to consider code for
 + * inclusion in the official distribution, but derived work must not
 + * be called official GROMACS. Details are found in the README & COPYING
 + * files - if they are missing, get the official version at www.gromacs.org.
 + *
 + * To help us fund GROMACS development, we humbly ask that you cite
 + * the papers on the package - you can find them in the top README file.
 + *
 + * For more info, check our website at http://www.gromacs.org
 + *
 + * And Hey:
 + * Gallium Rubidium Oxygen Manganese Argon Carbon Silicon
 + */
 +#ifdef HAVE_CONFIG_H
 +#include <config.h>
 +#endif
 +
 +#include "typedefs.h"
 +#include "smalloc.h"
 +#include "sysstuff.h"
 +#include "vec.h"
 +#include "statutil.h"
 +#include "vcm.h"
 +#include "mdebin.h"
 +#include "nrnb.h"
 +#include "calcmu.h"
 +#include "index.h"
 +#include "vsite.h"
 +#include "update.h"
 +#include "ns.h"
 +#include "trnio.h"
 +#include "xtcio.h"
 +#include "mdrun.h"
 +#include "md_support.h"
 +#include "md_logging.h"
 +#include "confio.h"
 +#include "network.h"
 +#include "pull.h"
 +#include "xvgr.h"
 +#include "physics.h"
 +#include "names.h"
 +#include "xmdrun.h"
 +#include "ionize.h"
 +#include "disre.h"
 +#include "orires.h"
 +#include "pme.h"
 +#include "mdatoms.h"
 +#include "repl_ex.h"
 +#include "qmmm.h"
 +#include "domdec.h"
 +#include "domdec_network.h"
 +#include "partdec.h"
 +#include "topsort.h"
 +#include "coulomb.h"
 +#include "constr.h"
 +#include "shellfc.h"
 +#include "compute_io.h"
 +#include "mvdata.h"
 +#include "checkpoint.h"
 +#include "mtop_util.h"
 +#include "sighandler.h"
 +#include "txtdump.h"
 +#include "string2.h"
 +#include "pme_loadbal.h"
 +#include "bondf.h"
 +#include "membed.h"
 +#include "types/nlistheuristics.h"
 +#include "types/iteratedconstraints.h"
 +#include "nbnxn_cuda_data_mgmt.h"
 +
 +#include "gromacs/utility/gmxmpi.h"
 +
 +#ifdef GMX_FAHCORE
 +#include "corewrap.h"
 +#endif
 +/* Restarts the performance accounting in the middle of a run.
 + *
 + * Prints a warning that names the step, then in order: calls
 + * nbnxn_cuda_reset_timings when cu_nbv is non-NULL, stops and resets the
 + * wallcycle counters, calls reset_dd_statistics_counters when domain
 + * decomposition is in use, re-initializes nrnb, rebases the step
 + * bookkeeping (ir->init_step grows and ir->nsteps shrinks by *step_rel,
 + * and *step_rel becomes 0), and finally restarts the ewcRUN wallcycle
 + * counter and the runtime clock and logs the restart time.
 + */
 +static void reset_all_counters(FILE *fplog, t_commrec *cr,
 +                               gmx_large_int_t step,
 +                               gmx_large_int_t *step_rel, t_inputrec *ir,
 +                               gmx_wallcycle_t wcycle, t_nrnb *nrnb,
 +                               gmx_runtime_t *runtime,
 +                               nbnxn_cuda_ptr_t cu_nbv)
 +{
 +    char sbuf[STEPSTRSIZE]; /* scratch buffer handed to gmx_step_str */
 +
 +    /* Reset all the counters related to performance over the run */
 +    md_print_warn(cr, fplog, "step %s: resetting all time and cycle counters\n",
 +                  gmx_step_str(step, sbuf));
 +
 +    if (cu_nbv)
 +    {
 +        nbnxn_cuda_reset_timings(cu_nbv);
 +    }
 +
 +    wallcycle_stop(wcycle, ewcRUN);
 +    wallcycle_reset_all(wcycle);
 +    if (DOMAINDECOMP(cr))
 +    {
 +        reset_dd_statistics_counters(cr->dd);
 +    }
 +    init_nrnb(nrnb);
 +    ir->init_step += *step_rel; /* the steps done so far are folded into the start step ... */
 +    ir->nsteps    -= *step_rel; /* ... and removed from the number of steps in ir */
 +    *step_rel      = 0;
 +    wallcycle_start(wcycle, ewcRUN);
 +    runtime_start(runtime);
 +    print_date_and_time(fplog, cr->nodeid, "Restarted time", runtime);
 +}
 +
 +double do_md(FILE *fplog, t_commrec *cr, int nfile, const t_filenm fnm[],
 +             const output_env_t oenv, gmx_bool bVerbose, gmx_bool bCompact,
 +             int nstglobalcomm,
 +             gmx_vsite_t *vsite, gmx_constr_t constr,
 +             int stepout, t_inputrec *ir,
 +             gmx_mtop_t *top_global,
 +             t_fcdata *fcd,
 +             t_state *state_global,
 +             t_mdatoms *mdatoms,
 +             t_nrnb *nrnb, gmx_wallcycle_t wcycle,
 +             gmx_edsam_t ed, t_forcerec *fr,
 +             int repl_ex_nst, int repl_ex_nex, int repl_ex_seed, gmx_membed_t membed,
 +             real cpt_period, real max_hours,
 +             const char *deviceOptions,
 +             unsigned long Flags,
 +             gmx_runtime_t *runtime)
 +{
 +    gmx_mdoutf_t   *outf;
 +    gmx_large_int_t step, step_rel;
 +    double          run_time;
 +    double          t, t0, lam0[efptNR];
 +    gmx_bool        bGStatEveryStep, bGStat, bCalcVir, bCalcEner;
 +    gmx_bool        bNS, bNStList, bSimAnn, bStopCM, bRerunMD, bNotLastFrame = FALSE,
 +                    bFirstStep, bStateFromCP, bStateFromTPX, bInitStep, bLastStep,
 +                    bBornRadii, bStartingFromCpt;
 +    gmx_bool          bDoDHDL = FALSE, bDoFEP = FALSE, bDoExpanded = FALSE;
 +    gmx_bool          do_ene, do_log, do_verbose, bRerunWarnNoV = TRUE,
 +                      bForceUpdate = FALSE, bCPT;
 +    int               mdof_flags;
 +    gmx_bool          bMasterState;
 +    int               force_flags, cglo_flags;
 +    tensor            force_vir, shake_vir, total_vir, tmp_vir, pres;
 +    int               i, m;
 +    t_trxstatus      *status;
 +    rvec              mu_tot;
 +    t_vcm            *vcm;
 +    t_state          *bufstate = NULL;
 +    matrix           *scale_tot, pcoupl_mu, M, ebox;
 +    gmx_nlheur_t      nlh;
 +    t_trxframe        rerun_fr;
 +    gmx_repl_ex_t     repl_ex = NULL;
 +    int               nchkpt  = 1;
 +    gmx_localtop_t   *top;
 +    t_mdebin         *mdebin = NULL;
 +    df_history_t      df_history;
 +    t_state          *state    = NULL;
 +    rvec             *f_global = NULL;
 +    int               n_xtc    = -1;
 +    rvec             *x_xtc    = NULL;
 +    gmx_enerdata_t   *enerd;
 +    rvec             *f = NULL;
 +    gmx_global_stat_t gstat;
 +    gmx_update_t      upd   = NULL;
 +    t_graph          *graph = NULL;
 +    globsig_t         gs;
 +    gmx_rng_t         mcrng = NULL;
 +    gmx_bool          bFFscan;
 +    gmx_groups_t     *groups;
 +    gmx_ekindata_t   *ekind, *ekind_save;
 +    gmx_shellfc_t     shellfc;
 +    int               count, nconverged = 0;
 +    real              timestep = 0;
 +    double            tcount   = 0;
 +    gmx_bool          bIonize  = FALSE;
 +    gmx_bool          bTCR     = FALSE, bConverged = TRUE, bOK, bSumEkinhOld, bExchanged;
 +    gmx_bool          bAppend;
 +    gmx_bool          bResetCountersHalfMaxH = FALSE;
 +    gmx_bool          bVV, bIterativeCase, bFirstIterate, bTemp, bPres, bTrotter;
 +    gmx_bool          bUpdateDoLR;
-                     dvdl = 0;
-                     update_constraints(fplog, step, &dvdl, ir, ekind, mdatoms,
++    real              mu_aver = 0, dvdl_constr;
 +    int               a0, a1, gnx = 0, ii;
 +    atom_id          *grpindex = NULL;
 +    char             *grpname;
 +    t_coupl_rec      *tcr     = NULL;
 +    rvec             *xcopy   = NULL, *vcopy = NULL, *cbuf = NULL;
 +    matrix            boxcopy = {{0}}, lastbox;
 +    tensor            tmpvir;
 +    real              fom, oldfom, veta_save, pcurr, scalevir, tracevir;
 +    real              vetanew = 0;
 +    int               lamnew  = 0;
 +    /* for FEP */
 +    int               nstfep;
 +    real              rate;
 +    double            cycles;
 +    real              saved_conserved_quantity = 0;
 +    real              last_ekin                = 0;
 +    int               iter_i;
 +    t_extmass         MassQ;
 +    int             **trotter_seq;
 +    char              sbuf[STEPSTRSIZE], sbuf2[STEPSTRSIZE];
 +    int               handled_stop_condition = gmx_stop_cond_none; /* compare to get_stop_condition*/
 +    gmx_iterate_t     iterate;
 +    gmx_large_int_t   multisim_nsteps = -1;                        /* number of steps to do  before first multisim
 +                                                                      simulation stops. If equal to zero, don't
 +                                                                      communicate any more between multisims.*/
 +    /* PME load balancing data for GPU kernels */
 +    pme_load_balancing_t pme_loadbal = NULL;
 +    double               cycles_pmes;
 +    gmx_bool             bPMETuneTry = FALSE, bPMETuneRunning = FALSE;
 +
 +#ifdef GMX_FAHCORE
 +    /* Temporary addition for FAHCORE checkpointing */
 +    int chkpt_ret;
 +#endif
 +
 +    /* Check for special mdrun options */
 +    bRerunMD = (Flags & MD_RERUN);
 +    bIonize  = (Flags & MD_IONIZE);
 +    bFFscan  = (Flags & MD_FFSCAN);
 +    bAppend  = (Flags & MD_APPENDFILES);
 +    if (Flags & MD_RESETCOUNTERSHALFWAY)
 +    {
 +        if (ir->nsteps > 0)
 +        {
 +            /* Signal to reset the counters half the simulation steps. */
 +            wcycle_set_reset_counters(wcycle, ir->nsteps/2);
 +        }
 +        /* Signal to reset the counters halfway the simulation time. */
 +        bResetCountersHalfMaxH = (max_hours > 0);
 +    }
 +
 +    /* md-vv uses averaged full step velocities for T-control
 +       md-vv-avek uses averaged half step velocities for T-control (but full step ekin for P control)
 +       md uses averaged half step kinetic energies to determine temperature unless defined otherwise by GMX_EKIN_AVE_VEL; */
 +    bVV = EI_VV(ir->eI);
 +    if (bVV) /* to store the initial velocities while computing virial */
 +    {
 +        snew(cbuf, top_global->natoms);
 +    }
 +    /* all the iteratative cases - only if there are constraints */
 +    bIterativeCase = ((IR_NPH_TROTTER(ir) || IR_NPT_TROTTER(ir)) && (constr) && (!bRerunMD));
 +    gmx_iterate_init(&iterate, FALSE); /* The default value of iterate->bIterationActive is set to
 +                                          false in this step.  The correct value, true or false,
 +                                          is set at each step, as it depends on the frequency of temperature
 +                                          and pressure control.*/
 +    bTrotter = (bVV && (IR_NPT_TROTTER(ir) || IR_NPH_TROTTER(ir) || IR_NVT_TROTTER(ir)));
 +
 +    if (bRerunMD)
 +    {
 +        /* Since we don't know if the frames read are related in any way,
 +         * rebuild the neighborlist at every step.
 +         */
 +        ir->nstlist       = 1;
 +        ir->nstcalcenergy = 1;
 +        nstglobalcomm     = 1;
 +    }
 +
 +    check_ir_old_tpx_versions(cr, fplog, ir, top_global);
 +
 +    nstglobalcomm   = check_nstglobalcomm(fplog, cr, nstglobalcomm, ir);
 +    bGStatEveryStep = (nstglobalcomm == 1);
 +
 +    if (!bGStatEveryStep && ir->nstlist == -1 && fplog != NULL)
 +    {
 +        fprintf(fplog,
 +                "To reduce the energy communication with nstlist = -1\n"
 +                "the neighbor list validity should not be checked at every step,\n"
 +                "this means that exact integration is not guaranteed.\n"
 +                "The neighbor list validity is checked after:\n"
 +                "  <n.list life time> - 2*std.dev.(n.list life time)  steps.\n"
 +                "In most cases this will result in exact integration.\n"
 +                "This reduces the energy communication by a factor of 2 to 3.\n"
 +                "If you want less energy communication, set nstlist > 3.\n\n");
 +    }
 +
 +    if (bRerunMD || bFFscan)
 +    {
 +        ir->nstxtcout = 0;
 +    }
 +    groups = &top_global->groups;
 +
 +    /* Initial values */
 +    init_md(fplog, cr, ir, oenv, &t, &t0, state_global->lambda,
 +            &(state_global->fep_state), lam0,
 +            nrnb, top_global, &upd,
 +            nfile, fnm, &outf, &mdebin,
 +            force_vir, shake_vir, mu_tot, &bSimAnn, &vcm, state_global, Flags);
 +
 +    clear_mat(total_vir);
 +    clear_mat(pres);
 +    /* Energy terms and groups */
 +    snew(enerd, 1);
 +    init_enerdata(top_global->groups.grps[egcENER].nr, ir->fepvals->n_lambda,
 +                  enerd);
 +    if (DOMAINDECOMP(cr))
 +    {
 +        f = NULL;
 +    }
 +    else
 +    {
 +        snew(f, top_global->natoms);
 +    }
 +
 +    /* lambda Monte carlo random number generator  */
 +    if (ir->bExpanded)
 +    {
 +        mcrng = gmx_rng_init(ir->expandedvals->lmc_seed);
 +    }
 +    /* copy the state into df_history */
 +    copy_df_history(&df_history, &state_global->dfhist);
 +
 +    /* Kinetic energy data */
 +    snew(ekind, 1);
 +    init_ekindata(fplog, top_global, &(ir->opts), ekind);
 +    /* needed for iteration of constraints */
 +    snew(ekind_save, 1);
 +    init_ekindata(fplog, top_global, &(ir->opts), ekind_save);
 +    /* Copy the cos acceleration to the groups struct */
 +    ekind->cosacc.cos_accel = ir->cos_accel;
 +
 +    gstat = global_stat_init(ir);
 +    debug_gmx();
 +
 +    /* Check for polarizable models and flexible constraints */
 +    shellfc = init_shell_flexcon(fplog,
 +                                 top_global, n_flexible_constraints(constr),
 +                                 (ir->bContinuation ||
 +                                  (DOMAINDECOMP(cr) && !MASTER(cr))) ?
 +                                 NULL : state_global->x);
 +
 +    if (DEFORM(*ir))
 +    {
 +#ifdef GMX_THREAD_MPI
 +        tMPI_Thread_mutex_lock(&deform_init_box_mutex);
 +#endif
 +        set_deform_reference_box(upd,
 +                                 deform_init_init_step_tpx,
 +                                 deform_init_box_tpx);
 +#ifdef GMX_THREAD_MPI
 +        tMPI_Thread_mutex_unlock(&deform_init_box_mutex);
 +#endif
 +    }
 +
 +    {
 +        double io = compute_io(ir, top_global->natoms, groups, mdebin->ebin->nener, 1);
 +        if ((io > 2000) && MASTER(cr))
 +        {
 +            fprintf(stderr,
 +                    "\nWARNING: This run will generate roughly %.0f Mb of data\n\n",
 +                    io);
 +        }
 +    }
 +
 +    if (DOMAINDECOMP(cr))
 +    {
 +        top = dd_init_local_top(top_global);
 +
 +        snew(state, 1);
 +        dd_init_local_state(cr->dd, state_global, state);
 +
 +        if (DDMASTER(cr->dd) && ir->nstfout)
 +        {
 +            snew(f_global, state_global->natoms);
 +        }
 +    }
 +    else
 +    {
 +        if (PAR(cr))
 +        {
 +            /* Initialize the particle decomposition and split the topology */
 +            top = split_system(fplog, top_global, ir, cr);
 +
 +            pd_cg_range(cr, &fr->cg0, &fr->hcg);
 +            pd_at_range(cr, &a0, &a1);
 +        }
 +        else
 +        {
 +            top = gmx_mtop_generate_local_top(top_global, ir);
 +
 +            a0 = 0;
 +            a1 = top_global->natoms;
 +        }
 +
 +        forcerec_set_excl_load(fr, top, cr);
 +
 +        state    = partdec_init_local_state(cr, state_global);
 +        f_global = f;
 +
 +        atoms2md(top_global, ir, 0, NULL, a0, a1-a0, mdatoms);
 +
 +        if (vsite)
 +        {
 +            set_vsite_top(vsite, top, mdatoms, cr);
 +        }
 +
 +        if (ir->ePBC != epbcNONE && !fr->bMolPBC)
 +        {
 +            graph = mk_graph(fplog, &(top->idef), 0, top_global->natoms, FALSE, FALSE);
 +        }
 +
 +        if (shellfc)
 +        {
 +            make_local_shells(cr, mdatoms, shellfc);
 +        }
 +
 +        init_bonded_thread_force_reduction(fr, &top->idef);
 +
 +        if (ir->pull && PAR(cr))
 +        {
 +            dd_make_local_pull_groups(NULL, ir->pull, mdatoms);
 +        }
 +    }
 +
 +    if (DOMAINDECOMP(cr))
 +    {
 +        /* Distribute the charge groups over the nodes from the master node */
 +        dd_partition_system(fplog, ir->init_step, cr, TRUE, 1,
 +                            state_global, top_global, ir,
 +                            state, &f, mdatoms, top, fr,
 +                            vsite, shellfc, constr,
 +                            nrnb, wcycle, FALSE);
 +
 +    }
 +
 +    update_mdatoms(mdatoms, state->lambda[efptMASS]);
 +
 +    if (opt2bSet("-cpi", nfile, fnm))
 +    {
 +        bStateFromCP = gmx_fexist_master(opt2fn_master("-cpi", nfile, fnm, cr), cr);
 +    }
 +    else
 +    {
 +        bStateFromCP = FALSE;
 +    }
 +
 +    if (MASTER(cr))
 +    {
 +        if (bStateFromCP)
 +        {
 +            /* Update mdebin with energy history if appending to output files */
 +            if (Flags & MD_APPENDFILES)
 +            {
 +                restore_energyhistory_from_state(mdebin, &state_global->enerhist);
 +            }
 +            else
 +            {
 +                /* We might have read an energy history from checkpoint,
 +                 * free the allocated memory and reset the counts.
 +                 */
 +                done_energyhistory(&state_global->enerhist);
 +                init_energyhistory(&state_global->enerhist);
 +            }
 +        }
 +        /* Set the initial energy history in state by updating once */
 +        update_energyhistory(&state_global->enerhist, mdebin);
 +    }
 +
 +    if ((state->flags & (1<<estLD_RNG)) && (Flags & MD_READ_RNG))
 +    {
 +        /* Set the random state if we read a checkpoint file */
 +        set_stochd_state(upd, state);
 +    }
 +
 +    if (state->flags & (1<<estMC_RNG))
 +    {
 +        set_mc_state(mcrng, state);
 +    }
 +
 +    /* Initialize constraints */
 +    if (constr)
 +    {
 +        if (!DOMAINDECOMP(cr))
 +        {
 +            set_constraints(constr, top, ir, mdatoms, cr);
 +        }
 +    }
 +
 +    /* Check whether we have to do GCT (General Coupling Theory) stuff */
 +    bTCR = ftp2bSet(efGCT, nfile, fnm);
 +    if (bTCR)
 +    {
 +        if (MASTER(cr))
 +        {
 +            fprintf(stderr, "Will do General Coupling Theory!\n");
 +        }
 +        gnx = top_global->mols.nr;
 +        snew(grpindex, gnx);
 +        for (i = 0; (i < gnx); i++)
 +        {
 +            grpindex[i] = i;
 +        }
 +    }
 +
 +    if (repl_ex_nst > 0)
 +    {
 +        /* We need to be sure replica exchange can only occur
 +         * when the energies are current */
 +        check_nst_param(fplog, cr, "nstcalcenergy", ir->nstcalcenergy,
 +                        "repl_ex_nst", &repl_ex_nst);
 +        /* This check needs to happen before inter-simulation
 +         * signals are initialized, too */
 +    }
 +    if (repl_ex_nst > 0 && MASTER(cr))
 +    {
 +        repl_ex = init_replica_exchange(fplog, cr->ms, state_global, ir,
 +                                        repl_ex_nst, repl_ex_nex, repl_ex_seed);
 +    }
 +
 +    /* PME tuning is only supported with GPUs or PME nodes and not with rerun.
 +     * With perturbed charges with soft-core we should not change the cut-off.
 +     */
 +    if ((Flags & MD_TUNEPME) &&
 +        EEL_PME(fr->eeltype) &&
 +        ( (fr->cutoff_scheme == ecutsVERLET && fr->nbv->bUseGPU) || !(cr->duty & DUTY_PME)) &&
 +        !(ir->efep != efepNO && mdatoms->nChargePerturbed > 0 && ir->fepvals->bScCoul) &&
 +        !bRerunMD)
 +    {
 +        pme_loadbal_init(&pme_loadbal, ir, state->box, fr->ic, fr->pmedata);
 +        cycles_pmes = 0;
 +        if (cr->duty & DUTY_PME)
 +        {
 +            /* Start tuning right away, as we can't measure the load */
 +            bPMETuneRunning = TRUE;
 +        }
 +        else
 +        {
 +            /* Separate PME nodes, we can measure the PP/PME load balance */
 +            bPMETuneTry = TRUE;
 +        }
 +    }
 +
 +    if (!ir->bContinuation && !bRerunMD)
 +    {
 +        if (mdatoms->cFREEZE && (state->flags & (1<<estV)))
 +        {
 +            /* Set the velocities of frozen particles to zero */
 +            for (i = mdatoms->start; i < mdatoms->start+mdatoms->homenr; i++)
 +            {
 +                for (m = 0; m < DIM; m++)
 +                {
 +                    if (ir->opts.nFreeze[mdatoms->cFREEZE[i]][m])
 +                    {
 +                        state->v[i][m] = 0;
 +                    }
 +                }
 +            }
 +        }
 +
 +        if (constr)
 +        {
 +            /* Constrain the initial coordinates and velocities */
 +            do_constrain_first(fplog, constr, ir, mdatoms, state, f,
 +                               graph, cr, nrnb, fr, top, shake_vir);
 +        }
 +        if (vsite)
 +        {
 +            /* Construct the virtual sites for the initial configuration */
 +            construct_vsites(fplog, vsite, state->x, nrnb, ir->delta_t, NULL,
 +                             top->idef.iparams, top->idef.il,
 +                             fr->ePBC, fr->bMolPBC, graph, cr, state->box);
 +        }
 +    }
 +
 +    debug_gmx();
 +
 +    /* Set the free energy calculation frequency to the minimum of nstdhdl, nstexpanded, and repl_ex_nst */
 +    nstfep = ir->fepvals->nstdhdl;
 +    if (ir->bExpanded && (nstfep > ir->expandedvals->nstexpanded))
 +    {
 +        nstfep = ir->expandedvals->nstexpanded;
 +    }
 +    if (repl_ex_nst > 0 && nstfep > repl_ex_nst)
 +    {
 +        nstfep = repl_ex_nst;
 +    }
 +
 +    /* I'm assuming we need global communication the first time! MRS */
 +    cglo_flags = (CGLO_TEMPERATURE | CGLO_GSTAT
 +                  | ((ir->comm_mode != ecmNO) ? CGLO_STOPCM : 0)
 +                  | (bVV ? CGLO_PRESSURE : 0)
 +                  | (bVV ? CGLO_CONSTRAINT : 0)
 +                  | (bRerunMD ? CGLO_RERUNMD : 0)
 +                  | ((Flags & MD_READ_EKIN) ? CGLO_READEKIN : 0));
 +
 +    bSumEkinhOld = FALSE;
 +    compute_globals(fplog, gstat, cr, ir, fr, ekind, state, state_global, mdatoms, nrnb, vcm,
 +                    NULL, enerd, force_vir, shake_vir, total_vir, pres, mu_tot,
 +                    constr, NULL, FALSE, state->box,
 +                    top_global, &pcurr, top_global->natoms, &bSumEkinhOld, cglo_flags);
 +    if (ir->eI == eiVVAK)
 +    {
 +        /* a second call to get the half step temperature initialized as well */
 +        /* We do the same call as above, but with the CGLO_PRESSURE (and CGLO_STOPCM)
 +           flags cleared -- internally to compute_globals, this is recognized as a
 +           velocity Verlet half-step kinetic energy calculation.  This minimizes
 +           excess variables, but perhaps obscures some of the logic? */
 +
 +        compute_globals(fplog, gstat, cr, ir, fr, ekind, state, state_global, mdatoms, nrnb, vcm,
 +                        NULL, enerd, force_vir, shake_vir, total_vir, pres, mu_tot,
 +                        constr, NULL, FALSE, state->box,
 +                        top_global, &pcurr, top_global->natoms, &bSumEkinhOld,
 +                        cglo_flags &~(CGLO_STOPCM | CGLO_PRESSURE));
 +    }
 +
 +    /* Calculate the initial half step temperature, and save the ekinh_old */
 +    if (!(Flags & MD_STARTFROMCPT))
 +    {
 +        for (i = 0; (i < ir->opts.ngtc); i++)
 +        {
 +            copy_mat(ekind->tcstat[i].ekinh, ekind->tcstat[i].ekinh_old);
 +        }
 +    }
 +    if (ir->eI != eiVV)
 +    {
 +        enerd->term[F_TEMP] *= 2; /* result of averages being done over previous and current step,
 +                                     and there is no previous step */
 +    }
 +
 +    /* If using an iterative algorithm, we need to create a working buffer for the state. */
 +    if (bIterativeCase)
 +    {
 +        bufstate = init_bufstate(state);
 +    }
 +    if (bFFscan)
 +    {
 +        snew(xcopy, state->natoms);
 +        snew(vcopy, state->natoms);
 +        copy_rvecn(state->x, xcopy, 0, state->natoms);
 +        copy_rvecn(state->v, vcopy, 0, state->natoms);
 +        copy_mat(state->box, boxcopy);
 +    }
 +
 +    /* We need to make an initialization call to get the Trotter variables set, as well as other constants for
 +       non-Trotter temperature control */
 +    trotter_seq = init_npt_vars(ir, state, &MassQ, bTrotter);
 +
 +    if (MASTER(cr))
 +    {
 +        if (constr && !ir->bContinuation && ir->eConstrAlg == econtLINCS)
 +        {
 +            fprintf(fplog,
 +                    "RMS relative constraint deviation after constraining: %.2e\n",
 +                    constr_rmsd(constr, FALSE));
 +        }
 +        if (EI_STATE_VELOCITY(ir->eI))
 +        {
 +            fprintf(fplog, "Initial temperature: %g K\n", enerd->term[F_TEMP]);
 +        }
 +        if (bRerunMD)
 +        {
 +            fprintf(stderr, "starting md rerun '%s', reading coordinates from"
 +                    " input trajectory '%s'\n\n",
 +                    *(top_global->name), opt2fn("-rerun", nfile, fnm));
 +            if (bVerbose)
 +            {
 +                fprintf(stderr, "Calculated time to finish depends on nsteps from "
 +                        "run input file,\nwhich may not correspond to the time "
 +                        "needed to process input trajectory.\n\n");
 +            }
 +        }
 +        else
 +        {
 +            char tbuf[20];
 +            fprintf(stderr, "starting mdrun '%s'\n",
 +                    *(top_global->name));
 +            if (ir->nsteps >= 0)
 +            {
 +                sprintf(tbuf, "%8.1f", (ir->init_step+ir->nsteps)*ir->delta_t);
 +            }
 +            else
 +            {
 +                sprintf(tbuf, "%s", "infinite");
 +            }
 +            if (ir->init_step > 0)
 +            {
 +                fprintf(stderr, "%s steps, %s ps (continuing from step %s, %8.1f ps).\n",
 +                        gmx_step_str(ir->init_step+ir->nsteps, sbuf), tbuf,
 +                        gmx_step_str(ir->init_step, sbuf2),
 +                        ir->init_step*ir->delta_t);
 +            }
 +            else
 +            {
 +                fprintf(stderr, "%s steps, %s ps.\n",
 +                        gmx_step_str(ir->nsteps, sbuf), tbuf);
 +            }
 +        }
 +        fprintf(fplog, "\n");
 +    }
 +
 +    /* Set and write start time */
 +    runtime_start(runtime);
 +    print_date_and_time(fplog, cr->nodeid, "Started mdrun", runtime);
 +    wallcycle_start(wcycle, ewcRUN);
 +    if (fplog)
 +    {
 +        fprintf(fplog, "\n");
 +    }
 +
 +    /* safest point to do file checkpointing is here.  More general point would be immediately before integrator call */
 +#ifdef GMX_FAHCORE
 +    chkpt_ret = fcCheckPointParallel( cr->nodeid,
 +                                      NULL, 0);
 +    if (chkpt_ret == 0)
 +    {
 +        gmx_fatal( 3, __FILE__, __LINE__, "Checkpoint error on step %d\n", 0 );
 +    }
 +#endif
 +
 +    debug_gmx();
 +    /***********************************************************
 +     *
 +     *             Loop over MD steps
 +     *
 +     ************************************************************/
 +
 +    /* if rerunMD then read coordinates and velocities from input trajectory */
 +    if (bRerunMD)
 +    {
 +        if (getenv("GMX_FORCE_UPDATE"))
 +        {
 +            bForceUpdate = TRUE;
 +        }
 +
 +        rerun_fr.natoms = 0;
 +        if (MASTER(cr))
 +        {
 +            bNotLastFrame = read_first_frame(oenv, &status,
 +                                             opt2fn("-rerun", nfile, fnm),
 +                                             &rerun_fr, TRX_NEED_X | TRX_READ_V);
 +            if (rerun_fr.natoms != top_global->natoms)
 +            {
 +                gmx_fatal(FARGS,
 +                          "Number of atoms in trajectory (%d) does not match the "
 +                          "run input file (%d)\n",
 +                          rerun_fr.natoms, top_global->natoms);
 +            }
 +            if (ir->ePBC != epbcNONE)
 +            {
 +                if (!rerun_fr.bBox)
 +                {
 +                    gmx_fatal(FARGS, "Rerun trajectory frame step %d time %f does not contain a box, while pbc is used", rerun_fr.step, rerun_fr.time);
 +                }
 +                if (max_cutoff2(ir->ePBC, rerun_fr.box) < sqr(fr->rlistlong))
 +                {
 +                    gmx_fatal(FARGS, "Rerun trajectory frame step %d time %f has too small box dimensions", rerun_fr.step, rerun_fr.time);
 +                }
 +            }
 +        }
 +
 +        if (PAR(cr))
 +        {
 +            rerun_parallel_comm(cr, &rerun_fr, &bNotLastFrame);
 +        }
 +
 +        if (ir->ePBC != epbcNONE)
 +        {
 +            /* Set the shift vectors.
 +             * Necessary here when have a static box different from the tpr box.
 +             */
 +            calc_shifts(rerun_fr.box, fr->shift_vec);
 +        }
 +    }
 +
 +    /* loop over MD steps or if rerunMD to end of input trajectory */
 +    bFirstStep = TRUE;
 +    /* Skip the first Nose-Hoover integration when we get the state from tpx */
 +    bStateFromTPX    = !bStateFromCP;
 +    bInitStep        = bFirstStep && (bStateFromTPX || bVV);
 +    bStartingFromCpt = (Flags & MD_STARTFROMCPT) && bInitStep;
 +    bLastStep        = FALSE;
 +    bSumEkinhOld     = FALSE;
 +    bExchanged       = FALSE;
 +
 +    init_global_signals(&gs, cr, ir, repl_ex_nst);
 +
 +    step     = ir->init_step;
 +    step_rel = 0;
 +
 +    if (ir->nstlist == -1)
 +    {
 +        init_nlistheuristics(&nlh, bGStatEveryStep, step);
 +    }
 +
 +    if (MULTISIM(cr) && (repl_ex_nst <= 0 ))
 +    {
 +        /* check how many steps are left in other sims */
 +        multisim_nsteps = get_multisim_nsteps(cr, ir->nsteps);
 +    }
 +
 +
 +    /* and stop now if we should */
 +    bLastStep = (bRerunMD || (ir->nsteps >= 0 && step_rel > ir->nsteps) ||
 +                 ((multisim_nsteps >= 0) && (step_rel >= multisim_nsteps )));
 +    while (!bLastStep || (bRerunMD && bNotLastFrame))
 +    {
 +
 +        wallcycle_start(wcycle, ewcSTEP);
 +
 +        if (bRerunMD)
 +        {
 +            if (rerun_fr.bStep)
 +            {
 +                step     = rerun_fr.step;
 +                step_rel = step - ir->init_step;
 +            }
 +            if (rerun_fr.bTime)
 +            {
 +                t = rerun_fr.time;
 +            }
 +            else
 +            {
 +                t = step;
 +            }
 +        }
 +        else
 +        {
 +            bLastStep = (step_rel == ir->nsteps);
 +            t         = t0 + step*ir->delta_t;
 +        }
 +
 +        if (ir->efep != efepNO || ir->bSimTemp)
 +        {
 +            /* find and set the current lambdas.  If rerunning, we either read in a state, or a lambda value,
 +               requiring different logic. */
 +
 +            set_current_lambdas(step, ir->fepvals, bRerunMD, &rerun_fr, state_global, state, lam0);
 +            bDoDHDL      = do_per_step(step, ir->fepvals->nstdhdl);
 +            bDoFEP       = (do_per_step(step, nstfep) && (ir->efep != efepNO));
 +            bDoExpanded  = (do_per_step(step, ir->expandedvals->nstexpanded) && (ir->bExpanded) && (step > 0));
 +        }
 +
 +        if (bSimAnn)
 +        {
 +            update_annealing_target_temp(&(ir->opts), t);
 +        }
 +
 +        if (bRerunMD)
 +        {
 +            if (!(DOMAINDECOMP(cr) && !MASTER(cr)))
 +            {
 +                for (i = 0; i < state_global->natoms; i++)
 +                {
 +                    copy_rvec(rerun_fr.x[i], state_global->x[i]);
 +                }
 +                if (rerun_fr.bV)
 +                {
 +                    for (i = 0; i < state_global->natoms; i++)
 +                    {
 +                        copy_rvec(rerun_fr.v[i], state_global->v[i]);
 +                    }
 +                }
 +                else
 +                {
 +                    for (i = 0; i < state_global->natoms; i++)
 +                    {
 +                        clear_rvec(state_global->v[i]);
 +                    }
 +                    if (bRerunWarnNoV)
 +                    {
 +                        fprintf(stderr, "\nWARNING: Some frames do not contain velocities.\n"
 +                                "         Ekin, temperature and pressure are incorrect,\n"
 +                                "         the virial will be incorrect when constraints are present.\n"
 +                                "\n");
 +                        bRerunWarnNoV = FALSE;
 +                    }
 +                }
 +            }
 +            copy_mat(rerun_fr.box, state_global->box);
 +            copy_mat(state_global->box, state->box);
 +
 +            if (vsite && (Flags & MD_RERUN_VSITE))
 +            {
 +                if (DOMAINDECOMP(cr))
 +                {
 +                    gmx_fatal(FARGS, "Vsite recalculation with -rerun is not implemented for domain decomposition, use particle decomposition");
 +                }
 +                if (graph)
 +                {
 +                    /* Following is necessary because the graph may get out of sync
 +                     * with the coordinates if we only have every N'th coordinate set
 +                     */
 +                    mk_mshift(fplog, graph, fr->ePBC, state->box, state->x);
 +                    shift_self(graph, state->box, state->x);
 +                }
 +                construct_vsites(fplog, vsite, state->x, nrnb, ir->delta_t, state->v,
 +                                 top->idef.iparams, top->idef.il,
 +                                 fr->ePBC, fr->bMolPBC, graph, cr, state->box);
 +                if (graph)
 +                {
 +                    unshift_self(graph, state->box, state->x);
 +                }
 +            }
 +        }
 +
 +        /* Stop Center of Mass motion */
 +        bStopCM = (ir->comm_mode != ecmNO && do_per_step(step, ir->nstcomm));
 +
 +        /* Copy back starting coordinates in case we're doing a forcefield scan */
 +        if (bFFscan)
 +        {
 +            for (ii = 0; (ii < state->natoms); ii++)
 +            {
 +                copy_rvec(xcopy[ii], state->x[ii]);
 +                copy_rvec(vcopy[ii], state->v[ii]);
 +            }
 +            copy_mat(boxcopy, state->box);
 +        }
 +
 +        if (bRerunMD)
 +        {
 +            /* For rerun MD always do neighbour searching, except with nstlist == 0, where it is only done at the first step */
 +            bNS      = (bFirstStep || ir->nstlist != 0);
 +            bNStList = bNS;
 +        }
 +        else
 +        {
 +            /* Determine whether or not to do Neighbour Searching and LR */
 +            bNStList = (ir->nstlist > 0  && step % ir->nstlist == 0);
 +
 +            bNS = (bFirstStep || bExchanged || bNStList || bDoFEP ||
 +                   (ir->nstlist == -1 && nlh.nabnsb > 0));
 +
 +            if (bNS && ir->nstlist == -1)
 +            {
 +                set_nlistheuristics(&nlh, bFirstStep || bExchanged || bDoFEP, step);
 +            }
 +        }
 +
 +        /* check whether we should stop because another simulation has
 +           stopped. */
 +        if (MULTISIM(cr))
 +        {
 +            if ( (multisim_nsteps >= 0) &&  (step_rel >= multisim_nsteps)  &&
 +                 (multisim_nsteps != ir->nsteps) )
 +            {
 +                if (bNS)
 +                {
 +                    if (MASTER(cr))
 +                    {
 +                        fprintf(stderr,
 +                                "Stopping simulation %d because another one has finished\n",
 +                                cr->ms->sim);
 +                    }
 +                    bLastStep         = TRUE;
 +                    gs.sig[eglsCHKPT] = 1;
 +                }
 +            }
 +        }
 +
 +        /* < 0 means stop at next step, > 0 means stop at next NS step */
 +        if ( (gs.set[eglsSTOPCOND] < 0 ) ||
 +             ( (gs.set[eglsSTOPCOND] > 0 ) && ( bNS || ir->nstlist == 0)) )
 +        {
 +            bLastStep = TRUE;
 +        }
 +
 +        /* Determine whether or not to update the Born radii if doing GB */
 +        bBornRadii = bFirstStep;
 +        if (ir->implicit_solvent && (step % ir->nstgbradii == 0))
 +        {
 +            bBornRadii = TRUE;
 +        }
 +
 +        do_log     = do_per_step(step, ir->nstlog) || bFirstStep || bLastStep;
 +        do_verbose = bVerbose &&
 +            (step % stepout == 0 || bFirstStep || bLastStep);
 +
 +        if (bNS && !(bFirstStep && ir->bContinuation && !bRerunMD))
 +        {
 +            if (bRerunMD)
 +            {
 +                bMasterState = TRUE;
 +            }
 +            else
 +            {
 +                bMasterState = FALSE;
 +                /* Correct the new box if it is too skewed */
 +                if (DYNAMIC_BOX(*ir))
 +                {
 +                    if (correct_box(fplog, step, state->box, graph))
 +                    {
 +                        bMasterState = TRUE;
 +                    }
 +                }
 +                if (DOMAINDECOMP(cr) && bMasterState)
 +                {
 +                    dd_collect_state(cr->dd, state, state_global);
 +                }
 +            }
 +
 +            if (DOMAINDECOMP(cr))
 +            {
 +                /* Repartition the domain decomposition */
 +                wallcycle_start(wcycle, ewcDOMDEC);
 +                dd_partition_system(fplog, step, cr,
 +                                    bMasterState, nstglobalcomm,
 +                                    state_global, top_global, ir,
 +                                    state, &f, mdatoms, top, fr,
 +                                    vsite, shellfc, constr,
 +                                    nrnb, wcycle,
 +                                    do_verbose && !bPMETuneRunning);
 +                wallcycle_stop(wcycle, ewcDOMDEC);
 +                /* If using an iterative integrator, reallocate space to match the decomposition */
 +            }
 +        }
 +
 +        if (MASTER(cr) && do_log && !bFFscan)
 +        {
 +            print_ebin_header(fplog, step, t, state->lambda[efptFEP]); /* can we improve the information printed here? */
 +        }
 +
 +        if (ir->efep != efepNO)
 +        {
 +            update_mdatoms(mdatoms, state->lambda[efptMASS]);
 +        }
 +
 +        if ((bRerunMD && rerun_fr.bV) || bExchanged)
 +        {
 +
 +            /* We need the kinetic energy at minus the half step for determining
 +             * the full step kinetic energy and possibly for T-coupling.*/
 +            /* This may not be quite working correctly yet . . . . */
 +            compute_globals(fplog, gstat, cr, ir, fr, ekind, state, state_global, mdatoms, nrnb, vcm,
 +                            wcycle, enerd, NULL, NULL, NULL, NULL, mu_tot,
 +                            constr, NULL, FALSE, state->box,
 +                            top_global, &pcurr, top_global->natoms, &bSumEkinhOld,
 +                            CGLO_RERUNMD | CGLO_GSTAT | CGLO_TEMPERATURE);
 +        }
 +        clear_mat(force_vir);
 +
 +        /* Ionize the atoms if necessary */
 +        if (bIonize)
 +        {
 +            ionize(fplog, oenv, mdatoms, top_global, t, ir, state->x, state->v,
 +                   mdatoms->start, mdatoms->start+mdatoms->homenr, state->box, cr);
 +        }
 +
 +        /* Update force field in ffscan program */
 +        if (bFFscan)
 +        {
 +            if (update_forcefield(fplog,
 +                                  nfile, fnm, fr,
 +                                  mdatoms->nr, state->x, state->box))
 +            {
 +                gmx_finalize_par();
 +
 +                exit(0);
 +            }
 +        }
 +
 +        /* We write a checkpoint at this MD step when:
 +         * either at an NS step when we signalled through gs,
 +         * or at the last step (but not when we do not want confout),
 +         * but never at the first step or with rerun.
 +         */
 +        bCPT = (((gs.set[eglsCHKPT] && (bNS || ir->nstlist == 0)) ||
 +                 (bLastStep && (Flags & MD_CONFOUT))) &&
 +                step > ir->init_step && !bRerunMD);
 +        if (bCPT)
 +        {
 +            gs.set[eglsCHKPT] = 0;
 +        }
 +
 +        /* Determine the energy and pressure:
 +         * at nstcalcenergy steps and at energy output steps (set below).
 +         */
 +        if (EI_VV(ir->eI) && (!bInitStep))
 +        {
 +            /* for vv, the first half of the integration actually corresponds
 +               to the previous step.  bCalcEner is only required to be evaluated on the 'next' step,
 +               but the virial needs to be calculated on both the current step and the 'next' step. Future
 +               reorganization may be able to get rid of one of the bCalcVir=TRUE steps. */
 +
 +            bCalcEner = do_per_step(step-1, ir->nstcalcenergy);
 +            bCalcVir  = bCalcEner ||
 +                (ir->epc != epcNO && (do_per_step(step, ir->nstpcouple) || do_per_step(step-1, ir->nstpcouple)));
 +        }
 +        else
 +        {
 +            bCalcEner = do_per_step(step, ir->nstcalcenergy);
 +            bCalcVir  = bCalcEner ||
 +                (ir->epc != epcNO && do_per_step(step, ir->nstpcouple));
 +        }
 +
 +        /* Do we need global communication ? */
 +        bGStat = (bCalcVir || bCalcEner || bStopCM ||
 +                  do_per_step(step, nstglobalcomm) || (bVV && IR_NVT_TROTTER(ir) && do_per_step(step-1, nstglobalcomm)) ||
 +                  (ir->nstlist == -1 && !bRerunMD && step >= nlh.step_nscheck));
 +
 +        do_ene = (do_per_step(step, ir->nstenergy) || bLastStep);
 +
 +        if (do_ene || do_log)
 +        {
 +            bCalcVir  = TRUE;
 +            bCalcEner = TRUE;
 +            bGStat    = TRUE;
 +        }
 +
 +        /* these CGLO_ options remain the same throughout the iteration */
 +        cglo_flags = ((bRerunMD ? CGLO_RERUNMD : 0) |
 +                      (bGStat ? CGLO_GSTAT : 0)
 +                      );
 +
 +        force_flags = (GMX_FORCE_STATECHANGED |
 +                       ((DYNAMIC_BOX(*ir) || bRerunMD) ? GMX_FORCE_DYNAMICBOX : 0) |
 +                       GMX_FORCE_ALLFORCES |
 +                       GMX_FORCE_SEPLRF |
 +                       (bCalcVir ? GMX_FORCE_VIRIAL : 0) |
 +                       (bCalcEner ? GMX_FORCE_ENERGY : 0) |
 +                       (bDoFEP ? GMX_FORCE_DHDL : 0)
 +                       );
 +
 +        if (fr->bTwinRange)
 +        {
 +            if (do_per_step(step, ir->nstcalclr))
 +            {
 +                force_flags |= GMX_FORCE_DO_LR;
 +            }
 +        }
 +
 +        if (shellfc)
 +        {
 +            /* Now is the time to relax the shells */
 +            count = relax_shell_flexcon(fplog, cr, bVerbose, bFFscan ? step+1 : step,
 +                                        ir, bNS, force_flags,
 +                                        bStopCM, top, top_global,
 +                                        constr, enerd, fcd,
 +                                        state, f, force_vir, mdatoms,
 +                                        nrnb, wcycle, graph, groups,
 +                                        shellfc, fr, bBornRadii, t, mu_tot,
 +                                        state->natoms, &bConverged, vsite,
 +                                        outf->fp_field);
 +            tcount += count;
 +
 +            if (bConverged)
 +            {
 +                nconverged++;
 +            }
 +        }
 +        else
 +        {
 +            /* The coordinates (x) are shifted (to get whole molecules)
 +             * in do_force.
 +             * This is parallelized as well, and does communication too.
 +             * Check comments in sim_util.c
 +             */
 +            do_force(fplog, cr, ir, step, nrnb, wcycle, top, top_global, groups,
 +                     state->box, state->x, &state->hist,
 +                     f, force_vir, mdatoms, enerd, fcd,
 +                     state->lambda, graph,
 +                     fr, vsite, mu_tot, t, outf->fp_field, ed, bBornRadii,
 +                     (bNS ? GMX_FORCE_NS : 0) | force_flags);
 +        }
 +
 +        if (bTCR)
 +        {
 +            mu_aver = calc_mu_aver(cr, state->x, mdatoms->chargeA,
 +                                   mu_tot, &top_global->mols, mdatoms, gnx, grpindex);
 +        }
 +
 +        if (bTCR && bFirstStep)
 +        {
 +            tcr = init_coupling(fplog, nfile, fnm, cr, fr, mdatoms, &(top->idef));
 +            fprintf(fplog, "Done init_coupling\n");
 +            fflush(fplog);
 +        }
 +
 +        if (bVV && !bStartingFromCpt && !bRerunMD)
 +        /*  ############### START FIRST UPDATE HALF-STEP FOR VV METHODS############### */
 +        {
 +            if (ir->eI == eiVV && bInitStep)
 +            {
 +                /* if using velocity verlet with full time step Ekin,
 +                 * take the first half step only to compute the
 +                 * virial for the first step. From there,
 +                 * revert back to the initial coordinates
 +                 * so that the input is actually the initial step.
 +                 */
 +                copy_rvecn(state->v, cbuf, 0, state->natoms); /* should make this better for parallelizing? */
 +            }
 +            else
 +            {
 +                /* this is for NHC in the Ekin(t+dt/2) version of vv */
 +                trotter_update(ir, step, ekind, enerd, state, total_vir, mdatoms, &MassQ, trotter_seq, ettTSEQ1);
 +            }
 +
 +            /* If we are using twin-range interactions where the long-range component
 +             * is only evaluated every nstcalclr>1 steps, we should do a special update
 +             * step to combine the long-range forces on these steps.
 +             * For nstcalclr=1 this is not done, since the forces would have been added
 +             * directly to the short-range forces already.
 +             */
 +            bUpdateDoLR = (fr->bTwinRange && do_per_step(step, ir->nstcalclr));
 +
 +            update_coords(fplog, step, ir, mdatoms, state, fr->bMolPBC,
 +                          f, bUpdateDoLR, fr->f_twin, fcd,
 +                          ekind, M, wcycle, upd, bInitStep, etrtVELOCITY1,
 +                          cr, nrnb, constr, &top->idef);
 +
 +            if (bIterativeCase && do_per_step(step-1, ir->nstpcouple) && !bInitStep)
 +            {
 +                gmx_iterate_init(&iterate, TRUE);
 +            }
 +            /* for iterations, we save these vectors, as we will be self-consistently iterating
 +               the calculations */
 +
 +            /*#### UPDATE EXTENDED VARIABLES IN TROTTER FORMULATION */
 +
 +            /* save the state */
 +            if (iterate.bIterationActive)
 +            {
 +                copy_coupling_state(state, bufstate, ekind, ekind_save, &(ir->opts));
 +            }
 +
 +            bFirstIterate = TRUE;
 +            while (bFirstIterate || iterate.bIterationActive)
 +            {
 +                if (iterate.bIterationActive)
 +                {
 +                    copy_coupling_state(bufstate, state, ekind_save, ekind, &(ir->opts));
 +                    if (bFirstIterate && bTrotter)
 +                    {
 +                        /* The first time through, we need a decent first estimate
 +                           of veta(t+dt) to compute the constraints.  Do
 +                           this by computing the box volume part of the
 +                           trotter integration at this time. Nothing else
 +                           should be changed by this routine here.  If
 +                           !(first time), we start with the previous value
 +                           of veta.  */
 +
 +                        veta_save = state->veta;
 +                        trotter_update(ir, step, ekind, enerd, state, total_vir, mdatoms, &MassQ, trotter_seq, ettTSEQ0);
 +                        vetanew     = state->veta;
 +                        state->veta = veta_save;
 +                    }
 +                }
 +
 +                bOK = TRUE;
 +                if (!bRerunMD || rerun_fr.bV || bForceUpdate)     /* Why is rerun_fr.bV here?  Unclear. */
 +                {
-                 enerd->term[F_DVDL_BONDED] += dvdl;        /* only add after iterations */
++                    update_constraints(fplog, step, NULL, ir, ekind, mdatoms,
 +                                       state, fr->bMolPBC, graph, f,
 +                                       &top->idef, shake_vir, NULL,
 +                                       cr, nrnb, wcycle, upd, constr,
 +                                       bInitStep, TRUE, bCalcVir, vetanew);
 +
 +                    if (!bOK && !bFFscan)
 +                    {
 +                        gmx_fatal(FARGS, "Constraint error: Shake, Lincs or Settle could not solve the constrains");
 +                    }
 +
 +                }
 +                else if (graph)
 +                {
 +                    /* Need to unshift here if a do_force has been
 +                       called in the previous step */
 +                    unshift_self(graph, state->box, state->x);
 +                }
 +
 +                /* if VV, compute the pressure and constraints */
 +                /* For VV2, we strictly only need this if using pressure
 +                 * control, but we really would like to have accurate pressures
 +                 * printed out.
 +                 * Think about ways around this in the future?
 +                 * For now, keep this choice in comments.
 +                 */
 +                /*bPres = (ir->eI==eiVV || IR_NPT_TROTTER(ir)); */
 +                /*bTemp = ((ir->eI==eiVV &&(!bInitStep)) || (ir->eI==eiVVAK && IR_NPT_TROTTER(ir)));*/
 +                bPres = TRUE;
 +                bTemp = ((ir->eI == eiVV && (!bInitStep)) || (ir->eI == eiVVAK));
 +                if (bCalcEner && ir->eI == eiVVAK)  /*MRS:  7/9/2010 -- this still doesn't fix it?*/
 +                {
 +                    bSumEkinhOld = TRUE;
 +                }
 +                /* for vv, the first half of the integration actually corresponds to the previous step.
 +                   So we need information from the last step in the first half of the integration */
 +                if (bGStat || do_per_step(step-1, nstglobalcomm))
 +                {
 +                    compute_globals(fplog, gstat, cr, ir, fr, ekind, state, state_global, mdatoms, nrnb, vcm,
 +                                    wcycle, enerd, force_vir, shake_vir, total_vir, pres, mu_tot,
 +                                    constr, NULL, FALSE, state->box,
 +                                    top_global, &pcurr, top_global->natoms, &bSumEkinhOld,
 +                                    cglo_flags
 +                                    | CGLO_ENERGY
 +                                    | (bTemp ? CGLO_TEMPERATURE : 0)
 +                                    | (bPres ? CGLO_PRESSURE : 0)
 +                                    | (bPres ? CGLO_CONSTRAINT : 0)
 +                                    | ((iterate.bIterationActive) ? CGLO_ITERATE : 0)
 +                                    | (bFirstIterate ? CGLO_FIRSTITERATE : 0)
 +                                    | CGLO_SCALEEKIN
 +                                    );
 +                    /* explanation of above:
 +                       a) We compute Ekin at the full time step
 +                       if 1) we are using the AveVel Ekin, and it's not the
 +                       initial step, or 2) if we are using AveEkin, but need the full
 +                       time step kinetic energy for the pressure (always true now, since we want accurate statistics).
 +                       b) If we are using EkinAveEkin for the kinetic energy for the temperature control, we still feed in
 +                       EkinAveVel because it's needed for the pressure */
 +                }
 +                /* temperature scaling and pressure scaling to produce the extended variables at t+dt */
 +                if (!bInitStep)
 +                {
 +                    if (bTrotter)
 +                    {
 +                        m_add(force_vir, shake_vir, total_vir); /* we need the un-dispersion corrected total vir here */
 +                        trotter_update(ir, step, ekind, enerd, state, total_vir, mdatoms, &MassQ, trotter_seq, ettTSEQ2);
 +                    }
 +                    else
 +                    {
 +                        if (bExchanged)
 +                        {
 +
 +                            /* We need the kinetic energy at minus the half step for determining
 +                             * the full step kinetic energy and possibly for T-coupling.*/
 +                            /* This may not be quite working correctly yet . . . . */
 +                            compute_globals(fplog, gstat, cr, ir, fr, ekind, state, state_global, mdatoms, nrnb, vcm,
 +                                            wcycle, enerd, NULL, NULL, NULL, NULL, mu_tot,
 +                                            constr, NULL, FALSE, state->box,
 +                                            top_global, &pcurr, top_global->natoms, &bSumEkinhOld,
 +                                            CGLO_RERUNMD | CGLO_GSTAT | CGLO_TEMPERATURE);
 +                        }
 +                    }
 +                }
 +
 +                if (iterate.bIterationActive &&
 +                    done_iterating(cr, fplog, step, &iterate, bFirstIterate,
 +                                   state->veta, &vetanew))
 +                {
 +                    break;
 +                }
 +                bFirstIterate = FALSE;
 +            }
 +
 +            if (bTrotter && !bInitStep)
 +            {
-             if (fr->bSepDVDL && fplog && do_log)
-             {
-                 fprintf(fplog, sepdvdlformat, "Constraint", 0.0, dvdl);
-             }
-             enerd->term[F_DVDL_BONDED] += dvdl;
 +                copy_mat(shake_vir, state->svir_prev);
 +                copy_mat(force_vir, state->fvir_prev);
 +                if (IR_NVT_TROTTER(ir) && ir->eI == eiVV)
 +                {
 +                    /* update temperature and kinetic energy now that step is over - this is the v(t+dt) point */
 +                    enerd->term[F_TEMP] = sum_ekin(&(ir->opts), ekind, NULL, (ir->eI == eiVV), FALSE, FALSE);
 +                    enerd->term[F_EKIN] = trace(ekind->ekin);
 +                }
 +            }
 +            /* if it's the initial step, we performed this first step just to get the constraint virial */
 +            if (bInitStep && ir->eI == eiVV)
 +            {
 +                copy_rvecn(cbuf, state->v, 0, state->natoms);
 +            }
-                     update_constraints(fplog, step, &dvdl, ir, ekind, mdatoms,
 +        }
 +
 +        /* MRS -- now done iterating -- compute the conserved quantity */
 +        if (bVV)
 +        {
 +            saved_conserved_quantity = compute_conserved_from_auxiliary(ir, state, &MassQ);
 +            if (ir->eI == eiVV)
 +            {
 +                last_ekin = enerd->term[F_EKIN];
 +            }
 +            if ((ir->eDispCorr != edispcEnerPres) && (ir->eDispCorr != edispcAllEnerPres))
 +            {
 +                saved_conserved_quantity -= enerd->term[F_DISPCORR];
 +            }
 +            /* sum up the foreign energy and dhdl terms for vv.  currently done every step so that dhdl is correct in the .edr */
 +            if (!bRerunMD)
 +            {
 +                sum_dhdl(enerd, state->lambda, ir->fepvals);
 +            }
 +        }
 +
 +        /* ########  END FIRST UPDATE STEP  ############## */
 +        /* ########  If doing VV, we now have v(dt) ###### */
 +        if (bDoExpanded)
 +        {
 +            /* perform extended ensemble sampling in lambda - we don't
 +               actually move to the new state before outputting
 +               statistics, but if performing simulated tempering, we
 +               do update the velocities and the tau_t. */
 +
 +            lamnew = ExpandedEnsembleDynamics(fplog, ir, enerd, state, &MassQ, &df_history, step, mcrng, state->v, mdatoms);
 +        }
 +        /* ################## START TRAJECTORY OUTPUT ################# */
 +
 +        /* Now we have the energies and forces corresponding to the
 +         * coordinates at time t. We must output all of this before
 +         * the update.
 +         * for RerunMD t is read from input trajectory
 +         */
 +        mdof_flags = 0;
 +        if (do_per_step(step, ir->nstxout))
 +        {
 +            mdof_flags |= MDOF_X;
 +        }
 +        if (do_per_step(step, ir->nstvout))
 +        {
 +            mdof_flags |= MDOF_V;
 +        }
 +        if (do_per_step(step, ir->nstfout))
 +        {
 +            mdof_flags |= MDOF_F;
 +        }
 +        if (do_per_step(step, ir->nstxtcout))
 +        {
 +            mdof_flags |= MDOF_XTC;
 +        }
 +        if (bCPT)
 +        {
 +            mdof_flags |= MDOF_CPT;
 +        }
 +        ;
 +
 +#if defined(GMX_FAHCORE) || defined(GMX_WRITELASTSTEP)
 +        if (bLastStep)
 +        {
 +            /* Enforce writing positions and velocities at end of run */
 +            mdof_flags |= (MDOF_X | MDOF_V);
 +        }
 +#endif
 +#ifdef GMX_FAHCORE
 +        if (MASTER(cr))
 +        {
 +            fcReportProgress( ir->nsteps, step );
 +        }
 +
 +        /* sync bCPT and fc record-keeping */
 +        if (bCPT && MASTER(cr))
 +        {
 +            fcRequestCheckPoint();
 +        }
 +#endif
 +
 +        if (mdof_flags != 0)
 +        {
 +            wallcycle_start(wcycle, ewcTRAJ);
 +            if (bCPT)
 +            {
 +                if (state->flags & (1<<estLD_RNG))
 +                {
 +                    get_stochd_state(upd, state);
 +                }
 +                if (state->flags  & (1<<estMC_RNG))
 +                {
 +                    get_mc_state(mcrng, state);
 +                }
 +                if (MASTER(cr))
 +                {
 +                    if (bSumEkinhOld)
 +                    {
 +                        state_global->ekinstate.bUpToDate = FALSE;
 +                    }
 +                    else
 +                    {
 +                        update_ekinstate(&state_global->ekinstate, ekind);
 +                        state_global->ekinstate.bUpToDate = TRUE;
 +                    }
 +                    update_energyhistory(&state_global->enerhist, mdebin);
 +                    if (ir->efep != efepNO || ir->bSimTemp)
 +                    {
 +                        state_global->fep_state = state->fep_state; /* MRS: seems kludgy. The code should be
 +                                                                       structured so this isn't necessary.
 +                                                                       Note this reassignment is only necessary
 +                                                                       for single threads.*/
 +                        copy_df_history(&state_global->dfhist, &df_history);
 +                    }
 +                }
 +            }
 +            write_traj(fplog, cr, outf, mdof_flags, top_global,
 +                       step, t, state, state_global, f, f_global, &n_xtc, &x_xtc);
 +            if (bCPT)
 +            {
 +                nchkpt++;
 +                bCPT = FALSE;
 +            }
 +            debug_gmx();
 +            if (bLastStep && step_rel == ir->nsteps &&
 +                (Flags & MD_CONFOUT) && MASTER(cr) &&
 +                !bRerunMD && !bFFscan)
 +            {
 +                /* x and v have been collected in write_traj,
 +                 * because a checkpoint file will always be written
 +                 * at the last step.
 +                 */
 +                fprintf(stderr, "\nWriting final coordinates.\n");
 +                if (fr->bMolPBC)
 +                {
 +                    /* Make molecules whole only for confout writing */
 +                    do_pbc_mtop(fplog, ir->ePBC, state->box, top_global, state_global->x);
 +                }
 +                write_sto_conf_mtop(ftp2fn(efSTO, nfile, fnm),
 +                                    *top_global->name, top_global,
 +                                    state_global->x, state_global->v,
 +                                    ir->ePBC, state->box);
 +                debug_gmx();
 +            }
 +            wallcycle_stop(wcycle, ewcTRAJ);
 +        }
 +
 +        /* kludge -- virial is lost with restart for NPT control. Must restart */
 +        if (bStartingFromCpt && bVV)
 +        {
 +            copy_mat(state->svir_prev, shake_vir);
 +            copy_mat(state->fvir_prev, force_vir);
 +        }
 +        /*  ################## END TRAJECTORY OUTPUT ################ */
 +
 +        /* Determine the wallclock run time up till now */
 +        run_time = gmx_gettime() - (double)runtime->real;
 +
 +        /* Check whether everything is still all right */
 +        if (((int)gmx_get_stop_condition() > handled_stop_condition)
 +#ifdef GMX_THREAD_MPI
 +            && MASTER(cr)
 +#endif
 +            )
 +        {
 +            /* this just makes gs.sig compatible with the hack
 +               of sending signals around by MPI_Reduce together with
 +               other floats */
 +            if (gmx_get_stop_condition() == gmx_stop_cond_next_ns)
 +            {
 +                gs.sig[eglsSTOPCOND] = 1;
 +            }
 +            if (gmx_get_stop_condition() == gmx_stop_cond_next)
 +            {
 +                gs.sig[eglsSTOPCOND] = -1;
 +            }
 +            /* < 0 means stop at next step, > 0 means stop at next NS step */
 +            if (fplog)
 +            {
 +                fprintf(fplog,
 +                        "\n\nReceived the %s signal, stopping at the next %sstep\n\n",
 +                        gmx_get_signal_name(),
 +                        gs.sig[eglsSTOPCOND] == 1 ? "NS " : "");
 +                fflush(fplog);
 +            }
 +            fprintf(stderr,
 +                    "\n\nReceived the %s signal, stopping at the next %sstep\n\n",
 +                    gmx_get_signal_name(),
 +                    gs.sig[eglsSTOPCOND] == 1 ? "NS " : "");
 +            fflush(stderr);
 +            handled_stop_condition = (int)gmx_get_stop_condition();
 +        }
 +        else if (MASTER(cr) && (bNS || ir->nstlist <= 0) &&
 +                 (max_hours > 0 && run_time > max_hours*60.0*60.0*0.99) &&
 +                 gs.sig[eglsSTOPCOND] == 0 && gs.set[eglsSTOPCOND] == 0)
 +        {
 +            /* Signal to terminate the run */
 +            gs.sig[eglsSTOPCOND] = 1;
 +            if (fplog)
 +            {
 +                fprintf(fplog, "\nStep %s: Run time exceeded %.3f hours, will terminate the run\n", gmx_step_str(step, sbuf), max_hours*0.99);
 +            }
 +            fprintf(stderr, "\nStep %s: Run time exceeded %.3f hours, will terminate the run\n", gmx_step_str(step, sbuf), max_hours*0.99);
 +        }
 +
 +        if (bResetCountersHalfMaxH && MASTER(cr) &&
 +            run_time > max_hours*60.0*60.0*0.495)
 +        {
 +            gs.sig[eglsRESETCOUNTERS] = 1;
 +        }
 +
 +        if (ir->nstlist == -1 && !bRerunMD)
 +        {
 +            /* When bGStatEveryStep=FALSE, global_stat is only called
 +             * when we check the atom displacements, not at NS steps.
 +             * This means that also the bonded interaction count check is not
 +             * performed immediately after NS. Therefore a few MD steps could
 +             * be performed with missing interactions.
 +             * But wrong energies are never written to file,
 +             * since energies are only written after global_stat
 +             * has been called.
 +             */
 +            if (step >= nlh.step_nscheck)
 +            {
 +                nlh.nabnsb = natoms_beyond_ns_buffer(ir, fr, &top->cgs,
 +                                                     nlh.scale_tot, state->x);
 +            }
 +            else
 +            {
 +                /* This is not necessarily true,
 +                 * but step_nscheck is determined quite conservatively.
 +                 */
 +                nlh.nabnsb = 0;
 +            }
 +        }
 +
 +        /* In parallel we only have to check for checkpointing in steps
 +         * where we do global communication,
 +         *  otherwise the other nodes don't know.
 +         */
 +        if (MASTER(cr) && ((bGStat || !PAR(cr)) &&
 +                           cpt_period >= 0 &&
 +                           (cpt_period == 0 ||
 +                            run_time >= nchkpt*cpt_period*60.0)) &&
 +            gs.set[eglsCHKPT] == 0)
 +        {
 +            gs.sig[eglsCHKPT] = 1;
 +        }
 +
 +        /* at the start of step, randomize or scale the velocities (trotter done elsewhere) */
 +        if (EI_VV(ir->eI))
 +        {
 +            if (!bInitStep)
 +            {
 +                update_tcouple(fplog, step, ir, state, ekind, wcycle, upd, &MassQ, mdatoms);
 +            }
 +            if (ETC_ANDERSEN(ir->etc)) /* keep this outside of update_tcouple because of the extra info required to pass */
 +            {
 +                gmx_bool bIfRandomize;
 +                bIfRandomize = update_randomize_velocities(ir, step, mdatoms, state, upd, &top->idef, constr);
 +                /* if we have constraints, we have to remove the kinetic energy parallel to the bonds */
 +                if (constr && bIfRandomize)
 +                {
-                 dvdl = 0;
++                    update_constraints(fplog, step, NULL, ir, ekind, mdatoms,
 +                                       state, fr->bMolPBC, graph, f,
 +                                       &top->idef, tmp_vir, NULL,
 +                                       cr, nrnb, wcycle, upd, constr,
 +                                       bInitStep, TRUE, bCalcVir, vetanew);
 +                }
 +            }
 +        }
 +
 +        if (bIterativeCase && do_per_step(step, ir->nstpcouple))
 +        {
 +            gmx_iterate_init(&iterate, TRUE);
 +            /* for iterations, we save these vectors, as we will be redoing the calculations */
 +            copy_coupling_state(state, bufstate, ekind, ekind_save, &(ir->opts));
 +        }
 +
 +        bFirstIterate = TRUE;
 +        while (bFirstIterate || iterate.bIterationActive)
 +        {
 +            /* We now restore these vectors to redo the calculation with improved extended variables */
 +            if (iterate.bIterationActive)
 +            {
 +                copy_coupling_state(bufstate, state, ekind_save, ekind, &(ir->opts));
 +            }
 +
 +            /* We make the decision to break or not -after- the calculation of Ekin and Pressure,
 +               so scroll down for that logic */
 +
 +            /* #########   START SECOND UPDATE STEP ################# */
 +            /* Box is changed in update() when we do pressure coupling,
 +             * but we should still use the old box for energy corrections and when
 +             * writing it to the energy file, so it matches the trajectory files for
 +             * the same timestep above. Make a copy in a separate array.
 +             */
 +            copy_mat(state->box, lastbox);
 +
 +            bOK = TRUE;
++            dvdl_constr = 0;
++
 +            if (!(bRerunMD && !rerun_fr.bV && !bForceUpdate))
 +            {
 +                wallcycle_start(wcycle, ewcUPDATE);
-                 update_constraints(fplog, step, &dvdl, ir, ekind, mdatoms, state,
 +                /* UPDATE PRESSURE VARIABLES IN TROTTER FORMULATION WITH CONSTRAINTS */
 +                if (bTrotter)
 +                {
 +                    if (iterate.bIterationActive)
 +                    {
 +                        if (bFirstIterate)
 +                        {
 +                            scalevir = 1;
 +                        }
 +                        else
 +                        {
 +                            /* we use a new value of scalevir to converge the iterations faster */
 +                            scalevir = tracevir/trace(shake_vir);
 +                        }
 +                        msmul(shake_vir, scalevir, shake_vir);
 +                        m_add(force_vir, shake_vir, total_vir);
 +                        clear_mat(shake_vir);
 +                    }
 +                    trotter_update(ir, step, ekind, enerd, state, total_vir, mdatoms, &MassQ, trotter_seq, ettTSEQ3);
 +                    /* We can only do Berendsen coupling after we have summed
 +                     * the kinetic energy or virial. Since this happens
 +                     * in global_state after update, we should only do it at
 +                     * step % nstlist = 1 with bGStatEveryStep=FALSE.
 +                     */
 +                }
 +                else
 +                {
 +                    update_tcouple(fplog, step, ir, state, ekind, wcycle, upd, &MassQ, mdatoms);
 +                    update_pcouple(fplog, step, ir, state, pcoupl_mu, M, wcycle,
 +                                   upd, bInitStep);
 +                }
 +
 +                if (bVV)
 +                {
 +                    bUpdateDoLR = (fr->bTwinRange && do_per_step(step, ir->nstcalclr));
 +
 +                    /* velocity half-step update */
 +                    update_coords(fplog, step, ir, mdatoms, state, fr->bMolPBC, f,
 +                                  bUpdateDoLR, fr->f_twin, fcd,
 +                                  ekind, M, wcycle, upd, FALSE, etrtVELOCITY2,
 +                                  cr, nrnb, constr, &top->idef);
 +                }
 +
 +                /* Above, initialize just copies ekinh into ekin,
 +                 * it doesn't copy position (for VV),
 +                 * and entire integrator for MD.
 +                 */
 +
 +                if (ir->eI == eiVVAK)
 +                {
 +                    copy_rvecn(state->x, cbuf, 0, state->natoms);
 +                }
 +                bUpdateDoLR = (fr->bTwinRange && do_per_step(step, ir->nstcalclr));
 +
 +                update_coords(fplog, step, ir, mdatoms, state, fr->bMolPBC, f,
 +                              bUpdateDoLR, fr->f_twin, fcd,
 +                              ekind, M, wcycle, upd, bInitStep, etrtPOSITION, cr, nrnb, constr, &top->idef);
 +                wallcycle_stop(wcycle, ewcUPDATE);
 +
-                     update_constraints(fplog, step, &dvdl, ir, ekind, mdatoms,
++                update_constraints(fplog, step, &dvdl_constr, ir, ekind, mdatoms, state,
 +                                   fr->bMolPBC, graph, f,
 +                                   &top->idef, shake_vir, force_vir,
 +                                   cr, nrnb, wcycle, upd, constr,
 +                                   bInitStep, FALSE, bCalcVir, state->veta);
 +
 +                if (ir->eI == eiVVAK)
 +                {
 +                    /* erase F_EKIN and F_TEMP here? */
 +                    /* just compute the kinetic energy at the half step to perform a trotter step */
 +                    compute_globals(fplog, gstat, cr, ir, fr, ekind, state, state_global, mdatoms, nrnb, vcm,
 +                                    wcycle, enerd, force_vir, shake_vir, total_vir, pres, mu_tot,
 +                                    constr, NULL, FALSE, lastbox,
 +                                    top_global, &pcurr, top_global->natoms, &bSumEkinhOld,
 +                                    cglo_flags | CGLO_TEMPERATURE
 +                                    );
 +                    wallcycle_start(wcycle, ewcUPDATE);
 +                    trotter_update(ir, step, ekind, enerd, state, total_vir, mdatoms, &MassQ, trotter_seq, ettTSEQ4);
 +                    /* now we know the scaling, we can compute the positions again */
 +                    copy_rvecn(cbuf, state->x, 0, state->natoms);
 +
 +                    bUpdateDoLR = (fr->bTwinRange && do_per_step(step, ir->nstcalclr));
 +
 +                    update_coords(fplog, step, ir, mdatoms, state, fr->bMolPBC, f,
 +                                  bUpdateDoLR, fr->f_twin, fcd,
 +                                  ekind, M, wcycle, upd, bInitStep, etrtPOSITION, cr, nrnb, constr, &top->idef);
 +                    wallcycle_stop(wcycle, ewcUPDATE);
 +
 +                    /* do we need an extra constraint here? just need to copy out of state->v to upd->xp? */
 +                    /* are the small terms in the shake_vir here due
 +                     * to numerical errors, or are they important
 +                     * physically? I'm thinking they are just errors, but not completely sure.
 +                     * For now, will call without actually constraining, constr=NULL*/
-                     fprintf(fplog, sepdvdlformat, "Constraint dV/dl", 0.0, dvdl);
++                    update_constraints(fplog, step, NULL, ir, ekind, mdatoms,
 +                                       state, fr->bMolPBC, graph, f,
 +                                       &top->idef, tmp_vir, force_vir,
 +                                       cr, nrnb, wcycle, upd, NULL,
 +                                       bInitStep, FALSE, bCalcVir,
 +                                       state->veta);
 +                }
 +                if (!bOK && !bFFscan)
 +                {
 +                    gmx_fatal(FARGS, "Constraint error: Shake, Lincs or Settle could not solve the constrains");
 +                }
 +
 +                if (fr->bSepDVDL && fplog && do_log)
 +                {
-                 enerd->term[F_DVDL_BONDED] += dvdl;
++                    fprintf(fplog, sepdvdlformat, "Constraint dV/dl", 0.0, dvdl_constr);
 +                }
-         enerd->term[F_DVDL_BONDED] += dvdl;
++                enerd->term[F_DVDL_CONSTR] += dvdl_constr;
 +            }
 +            else if (graph)
 +            {
 +                /* Need to unshift here */
 +                unshift_self(graph, state->box, state->x);
 +            }
 +
 +            if (vsite != NULL)
 +            {
 +                wallcycle_start(wcycle, ewcVSITECONSTR);
 +                if (graph != NULL)
 +                {
 +                    shift_self(graph, state->box, state->x);
 +                }
 +                construct_vsites(fplog, vsite, state->x, nrnb, ir->delta_t, state->v,
 +                                 top->idef.iparams, top->idef.il,
 +                                 fr->ePBC, fr->bMolPBC, graph, cr, state->box);
 +
 +                if (graph != NULL)
 +                {
 +                    unshift_self(graph, state->box, state->x);
 +                }
 +                wallcycle_stop(wcycle, ewcVSITECONSTR);
 +            }
 +
 +            /* ############## IF NOT VV, Calculate globals HERE, also iterate constraints  ############ */
 +            /* With Leap-Frog we can skip compute_globals at
 +             * non-communication steps, but we need to calculate
 +             * the kinetic energy one step before communication.
 +             */
 +            if (bGStat || (!EI_VV(ir->eI) && do_per_step(step+1, nstglobalcomm)))
 +            {
 +                if (ir->nstlist == -1 && bFirstIterate)
 +                {
 +                    gs.sig[eglsNABNSB] = nlh.nabnsb;
 +                }
 +                compute_globals(fplog, gstat, cr, ir, fr, ekind, state, state_global, mdatoms, nrnb, vcm,
 +                                wcycle, enerd, force_vir, shake_vir, total_vir, pres, mu_tot,
 +                                constr,
 +                                bFirstIterate ? &gs : NULL,
 +                                (step_rel % gs.nstms == 0) &&
 +                                (multisim_nsteps < 0 || (step_rel < multisim_nsteps)),
 +                                lastbox,
 +                                top_global, &pcurr, top_global->natoms, &bSumEkinhOld,
 +                                cglo_flags
 +                                | (!EI_VV(ir->eI) || bRerunMD ? CGLO_ENERGY : 0)
 +                                | (!EI_VV(ir->eI) && bStopCM ? CGLO_STOPCM : 0)
 +                                | (!EI_VV(ir->eI) ? CGLO_TEMPERATURE : 0)
 +                                | (!EI_VV(ir->eI) || bRerunMD ? CGLO_PRESSURE : 0)
 +                                | (iterate.bIterationActive ? CGLO_ITERATE : 0)
 +                                | (bFirstIterate ? CGLO_FIRSTITERATE : 0)
 +                                | CGLO_CONSTRAINT
 +                                );
 +                if (ir->nstlist == -1 && bFirstIterate)
 +                {
 +                    nlh.nabnsb         = gs.set[eglsNABNSB];
 +                    gs.set[eglsNABNSB] = 0;
 +                }
 +            }
 +            /* bIterate is set to keep it from eliminating the old ekin kinetic energy terms */
 +            /* #############  END CALC EKIN AND PRESSURE ################# */
 +
 +            /* Note: this is OK, but there are some numerical precision issues with using the convergence of
 +               the virial that should probably be addressed eventually. state->veta has better properties,
 +               but what we actually need entering the new cycle is the new shake_vir value. Ideally, we could
 +               generate the new shake_vir, but test the veta value for convergence.  This will take some thought. */
 +
 +            if (iterate.bIterationActive &&
 +                done_iterating(cr, fplog, step, &iterate, bFirstIterate,
 +                               trace(shake_vir), &tracevir))
 +            {
 +                break;
 +            }
 +            bFirstIterate = FALSE;
 +        }
 +
 +        /* only add constraint dvdl after constraints */
++        enerd->term[F_DVDL_CONSTR] += dvdl_constr;
 +        if (!bVV || bRerunMD)
 +        {
 +            /* sum up the foreign energy and dhdl terms for md and sd. currently done every step so that dhdl is correct in the .edr */
 +            sum_dhdl(enerd, state->lambda, ir->fepvals);
 +        }
 +        update_box(fplog, step, ir, mdatoms, state, graph, f,
 +                   ir->nstlist == -1 ? &nlh.scale_tot : NULL, pcoupl_mu, nrnb, wcycle, upd, bInitStep, FALSE);
 +
 +        /* ################# END UPDATE STEP 2 ################# */
 +        /* #### We now have r(t+dt) and v(t+dt/2)  ############# */
 +
 +        /* The coordinates (x) were unshifted in update */
 +        if (bFFscan && (shellfc == NULL || bConverged))
 +        {
 +            if (print_forcefield(fplog, enerd->term, mdatoms->homenr,
 +                                 f, NULL, xcopy,
 +                                 &(top_global->mols), mdatoms->massT, pres))
 +            {
 +                gmx_finalize_par();
 +
 +                fprintf(stderr, "\n");
 +                exit(0);
 +            }
 +        }
 +        if (!bGStat)
 +        {
 +            /* We will not sum ekinh_old,
 +             * so signal that we still have to do it.
 +             */
 +            bSumEkinhOld = TRUE;
 +        }
 +
 +        if (bTCR)
 +        {
 +            /* Only do GCT when the relaxation of shells (minimization) has converged,
 +             * otherwise we might be coupling to bogus energies.
 +             * In parallel we must always do this, because the other sims might
 +             * update the FF.
 +             */
 +
 +            /* Since this is called with the new coordinates state->x, I assume
 +             * we want the new box state->box too. / EL 20040121
 +             */
 +            do_coupling(fplog, oenv, nfile, fnm, tcr, t, step, enerd->term, fr,
 +                        ir, MASTER(cr),
 +                        mdatoms, &(top->idef), mu_aver,
 +                        top_global->mols.nr, cr,
 +                        state->box, total_vir, pres,
 +                        mu_tot, state->x, f, bConverged);
 +            debug_gmx();
 +        }
 +
 +        /* #########  BEGIN PREPARING EDR OUTPUT  ###########  */
 +
 +        /* use the directly determined last velocity, not actually the averaged half steps */
 +        if (bTrotter && ir->eI == eiVV)
 +        {
 +            enerd->term[F_EKIN] = last_ekin;
 +        }
 +        enerd->term[F_ETOT] = enerd->term[F_EPOT] + enerd->term[F_EKIN];
 +
 +        if (bVV)
 +        {
 +            enerd->term[F_ECONSERVED] = enerd->term[F_ETOT] + saved_conserved_quantity;
 +        }
 +        else
 +        {
 +            enerd->term[F_ECONSERVED] = enerd->term[F_ETOT] + compute_conserved_from_auxiliary(ir, state, &MassQ);
 +        }
 +        /* Check for excessively large energies */
 +        if (bIonize)
 +        {
 +#ifdef GMX_DOUBLE
 +            real etot_max = 1e200;
 +#else
 +            real etot_max = 1e30;
 +#endif
 +            if (fabs(enerd->term[F_ETOT]) > etot_max)
 +            {
 +                fprintf(stderr, "Energy too large (%g), giving up\n",
 +                        enerd->term[F_ETOT]);
 +            }
 +        }
 +        /* #########  END PREPARING EDR OUTPUT  ###########  */
 +
 +        /* Time for performance */
 +        if (((step % stepout) == 0) || bLastStep)
 +        {
 +            runtime_upd_proc(runtime);
 +        }
 +
 +        /* Output stuff */
 +        if (MASTER(cr))
 +        {
 +            gmx_bool do_dr, do_or;
 +
 +            if (fplog && do_log && bDoExpanded)
 +            {
 +                /* only needed if doing expanded ensemble */
 +                PrintFreeEnergyInfoToFile(fplog, ir->fepvals, ir->expandedvals, ir->bSimTemp ? ir->simtempvals : NULL,
 +                                          &df_history, state->fep_state, ir->nstlog, step);
 +            }
 +            if (!(bStartingFromCpt && (EI_VV(ir->eI))))
 +            {
 +                if (bCalcEner)
 +                {
 +                    upd_mdebin(mdebin, bDoDHDL, TRUE,
 +                               t, mdatoms->tmass, enerd, state,
 +                               ir->fepvals, ir->expandedvals, lastbox,
 +                               shake_vir, force_vir, total_vir, pres,
 +                               ekind, mu_tot, constr);
 +                }
 +                else
 +                {
 +                    upd_mdebin_step(mdebin);
 +                }
 +
 +                do_dr  = do_per_step(step, ir->nstdisreout);
 +                do_or  = do_per_step(step, ir->nstorireout);
 +
 +                print_ebin(outf->fp_ene, do_ene, do_dr, do_or, do_log ? fplog : NULL,
 +                           step, t,
 +                           eprNORMAL, bCompact, mdebin, fcd, groups, &(ir->opts));
 +            }
 +            if (ir->ePull != epullNO)
 +            {
 +                pull_print_output(ir->pull, step, t);
 +            }
 +
 +            if (do_per_step(step, ir->nstlog))
 +            {
 +                if (fflush(fplog) != 0)
 +                {
 +                    gmx_fatal(FARGS, "Cannot flush logfile - maybe you are out of disk space?");
 +                }
 +            }
 +        }
 +        if (bDoExpanded)
 +        {
 +            /* Have to do this part after outputting the logfile and the edr file */
 +            state->fep_state = lamnew;
 +            for (i = 0; i < efptNR; i++)
 +            {
 +                state_global->lambda[i] = ir->fepvals->all_lambda[i][lamnew];
 +            }
 +        }
 +        /* Remaining runtime */
 +        if (MULTIMASTER(cr) && (do_verbose || gmx_got_usr_signal()) && !bPMETuneRunning)
 +        {
 +            if (shellfc)
 +            {
 +                fprintf(stderr, "\n");
 +            }
 +            print_time(stderr, runtime, step, ir, cr);
 +        }
 +
 +        /* Replica exchange */
 +        bExchanged = FALSE;
 +        if ((repl_ex_nst > 0) && (step > 0) && !bLastStep &&
 +            do_per_step(step, repl_ex_nst))
 +        {
 +            bExchanged = replica_exchange(fplog, cr, repl_ex,
 +                                          state_global, enerd,
 +                                          state, step, t);
 +
 +            if (bExchanged && DOMAINDECOMP(cr))
 +            {
 +                dd_partition_system(fplog, step, cr, TRUE, 1,
 +                                    state_global, top_global, ir,
 +                                    state, &f, mdatoms, top, fr,
 +                                    vsite, shellfc, constr,
 +                                    nrnb, wcycle, FALSE);
 +            }
 +        }
 +
 +        bFirstStep       = FALSE;
 +        bInitStep        = FALSE;
 +        bStartingFromCpt = FALSE;
 +
 +        /* #######  SET VARIABLES FOR NEXT ITERATION IF THEY STILL NEED IT ###### */
 +        /* With all integrators, except VV, we need to retain the pressure
 +         * at the current step for coupling at the next step.
 +         */
 +        if ((state->flags & (1<<estPRES_PREV)) &&
 +            (bGStatEveryStep ||
 +             (ir->nstpcouple > 0 && step % ir->nstpcouple == 0)))
 +        {
 +            /* Store the pressure in t_state for pressure coupling
 +             * at the next MD step.
 +             */
 +            copy_mat(pres, state->pres_prev);
 +        }
 +
 +        /* #######  END SET VARIABLES FOR NEXT ITERATION ###### */
 +
 +        if ( (membed != NULL) && (!bLastStep) )
 +        {
 +            rescale_membed(step_rel, membed, state_global->x);
 +        }
 +
 +        if (bRerunMD)
 +        {
 +            if (MASTER(cr))
 +            {
 +                /* read next frame from input trajectory */
 +                bNotLastFrame = read_next_frame(oenv, status, &rerun_fr);
 +            }
 +
 +            if (PAR(cr))
 +            {
 +                rerun_parallel_comm(cr, &rerun_fr, &bNotLastFrame);
 +            }
 +        }
 +
 +        if (!bRerunMD || !rerun_fr.bStep)
 +        {
 +            /* increase the MD step number */
 +            step++;
 +            step_rel++;
 +        }
 +
 +        cycles = wallcycle_stop(wcycle, ewcSTEP);
 +        if (DOMAINDECOMP(cr) && wcycle)
 +        {
 +            dd_cycles_add(cr->dd, cycles, ddCyclStep);
 +        }
 +
 +        if (bPMETuneRunning || bPMETuneTry)
 +        {
 +            /* PME grid + cut-off optimization with GPUs or PME nodes */
 +
 +            /* Count the total cycles over the last steps */
 +            cycles_pmes += cycles;
 +
 +            /* We can only switch cut-off at NS steps */
 +            if (step % ir->nstlist == 0)
 +            {
 +                /* PME grid + cut-off optimization with GPUs or PME nodes */
 +                if (bPMETuneTry)
 +                {
 +                    if (DDMASTER(cr->dd))
 +                    {
 +                        /* PME node load is too high, start tuning */
 +                        bPMETuneRunning = (dd_pme_f_ratio(cr->dd) >= 1.05);
 +                    }
 +                    dd_bcast(cr->dd, sizeof(gmx_bool), &bPMETuneRunning);
 +
 +                    if (bPMETuneRunning || step_rel > ir->nstlist*50)
 +                    {
 +                        bPMETuneTry     = FALSE;
 +                    }
 +                }
 +                if (bPMETuneRunning)
 +                {
 +                    /* init_step might not be a multiple of nstlist,
 +                     * but the first cycle is always skipped anyhow.
 +                     */
 +                    bPMETuneRunning =
 +                        pme_load_balance(pme_loadbal, cr,
 +                                         (bVerbose && MASTER(cr)) ? stderr : NULL,
 +                                         fplog,
 +                                         ir, state, cycles_pmes,
 +                                         fr->ic, fr->nbv, &fr->pmedata,
 +                                         step);
 +
 +                    /* Update constants in forcerec/inputrec to keep them in sync with fr->ic */
 +                    fr->ewaldcoeff = fr->ic->ewaldcoeff;
 +                    fr->rlist      = fr->ic->rlist;
 +                    fr->rlistlong  = fr->ic->rlistlong;
 +                    fr->rcoulomb   = fr->ic->rcoulomb;
 +                    fr->rvdw       = fr->ic->rvdw;
 +                }
 +                cycles_pmes = 0;
 +            }
 +        }
 +
 +        if (step_rel == wcycle_get_reset_counters(wcycle) ||
 +            gs.set[eglsRESETCOUNTERS] != 0)
 +        {
 +            /* Reset all the counters related to performance over the run */
 +            reset_all_counters(fplog, cr, step, &step_rel, ir, wcycle, nrnb, runtime,
 +                               fr->nbv != NULL && fr->nbv->bUseGPU ? fr->nbv->cu_nbv : NULL);
 +            wcycle_set_reset_counters(wcycle, -1);
 +            if (!(cr->duty & DUTY_PME))
 +            {
 +                /* Tell our PME node to reset its counters */
 +                gmx_pme_send_resetcounters(cr, step);
 +            }
 +            /* Correct max_hours for the elapsed time */
 +            max_hours                -= run_time/(60.0*60.0);
 +            bResetCountersHalfMaxH    = FALSE;
 +            gs.set[eglsRESETCOUNTERS] = 0;
 +        }
 +
 +    }
 +    /* End of main MD loop */
 +    debug_gmx();
 +
 +    /* Stop the time */
 +    runtime_end(runtime);
 +
 +    if (bRerunMD && MASTER(cr))
 +    {
 +        close_trj(status);
 +    }
 +
 +    if (!(cr->duty & DUTY_PME))
 +    {
 +        /* Tell the PME only node to finish */
 +        gmx_pme_send_finish(cr);
 +    }
 +
 +    if (MASTER(cr))
 +    {
 +        if (ir->nstcalcenergy > 0 && !bRerunMD)
 +        {
 +            print_ebin(outf->fp_ene, FALSE, FALSE, FALSE, fplog, step, t,
 +                       eprAVER, FALSE, mdebin, fcd, groups, &(ir->opts));
 +        }
 +    }
 +
 +    done_mdoutf(outf);
 +
 +    debug_gmx();
 +
 +    if (ir->nstlist == -1 && nlh.nns > 0 && fplog)
 +    {
 +        fprintf(fplog, "Average neighborlist lifetime: %.1f steps, std.dev.: %.1f steps\n", nlh.s1/nlh.nns, sqrt(nlh.s2/nlh.nns - sqr(nlh.s1/nlh.nns)));
 +        fprintf(fplog, "Average number of atoms that crossed the half buffer length: %.1f\n\n", nlh.ab/nlh.nns);
 +    }
 +
 +    if (pme_loadbal != NULL)
 +    {
 +        pme_loadbal_done(pme_loadbal, cr, fplog,
 +                         fr->nbv != NULL && fr->nbv->bUseGPU);
 +    }
 +
 +    if (shellfc && fplog)
 +    {
 +        fprintf(fplog, "Fraction of iterations that converged:           %.2f %%\n",
 +                (nconverged*100.0)/step_rel);
 +        fprintf(fplog, "Average number of force evaluations per MD step: %.2f\n\n",
 +                tcount/step_rel);
 +    }
 +
 +    if (repl_ex_nst > 0 && MASTER(cr))
 +    {
 +        print_replica_exchange_statistics(fplog, repl_ex);
 +    }
 +
 +    runtime->nsteps_done = step_rel;
 +
 +    return 0;
 +}